// MzXMLFileImportMethod.java

/*
 * (C) Copyright 2015-2017 by MSDK Development Team
 *
 * This software is dual-licensed under either
 *
 * (a) the terms of the GNU Lesser General Public License version 2.1 as published by the Free
 * Software Foundation
 *
 * or (per the licensee's choosing)
 *
 * (b) the terms of the Eclipse Public License v1.0 as published by the Eclipse Foundation.
 */

package io.github.msdk.io.mzxml;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Date;
import java.util.Deque;
import java.util.Optional;
import java.util.logging.Logger;
import java.util.zip.DataFormatException;

import javax.annotation.Nonnull;
import javax.xml.bind.DatatypeConverter;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import com.google.common.collect.Range;

import io.github.msdk.MSDKException;
import io.github.msdk.MSDKMethod;
import io.github.msdk.datamodel.files.FileType;
import io.github.msdk.datamodel.impl.SimpleIsolationInfo;
import io.github.msdk.datamodel.impl.SimpleMsScan;
import io.github.msdk.datamodel.impl.SimpleRawDataFile;
import io.github.msdk.datamodel.msspectra.MsSpectrumType;
import io.github.msdk.datamodel.rawdata.IsolationInfo;
import io.github.msdk.datamodel.rawdata.MsScanType;
import io.github.msdk.datamodel.rawdata.PolarityType;
import io.github.msdk.datamodel.rawdata.RawDataFile;
import io.github.msdk.spectra.spectrumtypedetection.SpectrumTypeDetectionAlgorithm;

/**
 * This class reads mzXML file format.
 *
 * <p>
 * The file is parsed with a SAX parser. Every {@code <scan>} element becomes one
 * {@link SimpleMsScan} in the resulting {@link RawDataFile}; {@code <scan>} elements nested inside
 * another {@code <scan>} are supported and are added in the order in which they start in the
 * document.
 * </p>
 *
 * <p>
 * {@link #cancel()} and {@link #getFinishedPercentage()} may be called while {@link #execute()}
 * is running.
 * </p>
 */
public class MzXMLFileImportMethod implements MSDKMethod<RawDataFile> {

  private final Logger logger = Logger.getLogger(this.getClass().getName());

  private final @Nonnull File sourceFile;
  private final @Nonnull FileType fileType = FileType.MZXML;

  private SimpleRawDataFile newRawDataFile;

  // Progress counters: totalScans comes from <msRun scanCount="...">, parsedScans is incremented
  // for every scan handed over to newRawDataFile.
  private int totalScans = 0;
  private int parsedScans = 0;

  // State of the <peaks> element currently being read
  private int peaksCount = 0;
  private boolean compressFlag = false;

  private final MzXMLHandler handler = new MzXMLHandler();

  private String precision;

  // Charge of the <precursorMz> element currently being read, null if not declared
  private Integer precursorCharge;

  // Buffers
  private final StringBuilder charBuffer = new StringBuilder(1 << 18);
  private double mzValues[] = new double[1000];
  private float intensityValues[] = new float[1000];

  // Retention time parser
  private DatatypeFactory dataTypeFactory;

  // Written by cancel(), which is called while execute() is parsing, so it must be visible
  // across threads.
  private volatile boolean canceled = false;

  /*
   * The scan (or fragment scan) whose child elements are currently being read, i.e. the innermost
   * open <scan> element. It is null outside of any <scan>. Enclosing parent scans are kept on a
   * stack inside the SAX handler.
   */
  private SimpleMsScan buildingScan;

  /**
   * <p>
   * Constructor for MzXMLFileImportMethod.
   * </p>
   *
   * @param sourceFile a {@link java.io.File} object.
   */
  public MzXMLFileImportMethod(@Nonnull File sourceFile) {
    this.sourceFile = sourceFile;
  }

  /**
   * {@inheritDoc}
   *
   * @return the parsed raw data file, or {@code null} if the import was canceled
   * @throws MSDKException wrapping any failure that occurred while reading or parsing the file
   */
  @Override
  public RawDataFile execute() throws MSDKException {

    try {

      logger.info("Started parsing file " + sourceFile);

      // Create the XMLBasedRawDataFile object
      newRawDataFile =
          new SimpleRawDataFile(sourceFile.getName(), Optional.of(sourceFile), fileType);

      // Use the default (non-validating) parser
      // NOTE(review): the parser is created with factory defaults; if files from untrusted
      // sources are imported, consider restricting DTD / external entity processing.
      SAXParserFactory factory = SAXParserFactory.newInstance();
      dataTypeFactory = DatatypeFactory.newInstance();
      SAXParser saxParser = factory.newSAXParser();
      saxParser.parse(sourceFile, handler);

      logger.info("Finished parsing " + sourceFile + ", parsed " + parsedScans + " scans");

      return newRawDataFile;

    } catch (Throwable e) {

      // We may already have set the status to CANCELED. In that case the
      // caught exception simply indicates end of SAX parsing.
      if (canceled) {
        return null;
      }
      throw new MSDKException(e);

    }

  }

  /** {@inheritDoc} */
  @Override
  public Float getFinishedPercentage() {
    // totalScans stays 0 until <msRun scanCount="..."> has been read (or if it is absent)
    return totalScans == 0 ? 0 : (float) parsedScans / totalScans;
  }

  /** {@inheritDoc} */
  @Override
  public RawDataFile getResult() {
    return newRawDataFile;
  }

  /** {@inheritDoc} */
  @Override
  public void cancel() {
    this.canceled = true;
  }

  /**
   * SAX handler that translates mzXML elements into {@link SimpleMsScan} objects and adds them to
   * {@link #newRawDataFile}.
   */
  private class MzXMLHandler extends DefaultHandler {

    // Enclosing <scan> elements that are still open while a nested <scan> is being read
    private final Deque<SimpleMsScan> openParentScans = new ArrayDeque<>();

    // Scans of the current outermost <scan> subtree, in the order their start tags appeared.
    // They are flushed to the raw data file when the outermost <scan> closes.
    private final Deque<SimpleMsScan> scansInDocumentOrder = new ArrayDeque<>();

    /**
     * Handles the start tags of {@code msRun}, {@code scan}, {@code peaks} and
     * {@code precursorMz}; all other elements are ignored.
     *
     * @throws SAXException if the import has been canceled
     */
    @Override
    public void startElement(String namespaceURI, String lName, // local name
        String qName, // qualified name
        Attributes attrs) throws SAXException {

      if (canceled) {
        throw new SAXException("Parsing Cancelled");
      }

      // <msRun>
      if (qName.equals("msRun")) {
        String s = attrs.getValue("scanCount");
        if (s != null) {
          totalScans = Integer.parseInt(s);
        }
      }

      // <scan>
      if (qName.equalsIgnoreCase("scan")) {

        /*
         * Only num, msLevel & peaksCount values are required according with mzXML standard, the
         * others are optional
         */
        int scanNumber = Integer.parseInt(attrs.getValue("num"));
        int msLevel = Integer.parseInt(attrs.getValue("msLevel"));
        peaksCount = Integer.parseInt(attrs.getValue("peaksCount"));

        // A <scan> nested inside another <scan>: remember the parent so that it is neither
        // lost nor replaced by the child.
        if (buildingScan != null) {
          openParentScans.push(buildingScan);
        }

        // MS function
        String msFuncName = attrs.getValue("scanType");
        buildingScan = new SimpleMsScan(scanNumber);
        scansInDocumentOrder.add(buildingScan);
        buildingScan.setRawDataFile(newRawDataFile);
        buildingScan.setMsLevel(msLevel);
        buildingScan.setMsFunction(msFuncName);
        // Scan type & definition
        buildingScan.setMsScanType(MsScanType.UNKNOWN);
        String filterLine = attrs.getValue("filterLine");
        buildingScan.setScanDefinition(filterLine);

        // Polarity is optional; a missing attribute must not break the switch below
        PolarityType polarity = PolarityType.UNKNOWN;
        String polarityAttr = attrs.getValue("polarity");
        if (polarityAttr != null) {
          switch (polarityAttr) {
            case "+":
              polarity = PolarityType.POSITIVE;
              break;
            case "-":
              polarity = PolarityType.NEGATIVE;
              break;
            default:
              polarity = PolarityType.UNKNOWN;
              break;
          }
        }
        buildingScan.setPolarity(polarity);

        // Parse retention time (an XML duration string), stored in seconds
        String retentionTimeStr = attrs.getValue("retentionTime");
        if (retentionTimeStr != null) {
          Date currentDate = new Date();
          Duration dur = dataTypeFactory.newDuration(retentionTimeStr);
          final float rt = (float) (dur.getTimeInMillis(currentDate) / 1000.0);
          buildingScan.setRetentionTime(rt);
        }

      }

      // <peaks>
      if (qName.equalsIgnoreCase("peaks")) {
        // clean the current char buffer for the new element
        charBuffer.setLength(0);
        String compressionType = attrs.getValue("compressionType");
        // Any compression type other than absent / "none" is treated as zlib-compressed data
        compressFlag = (compressionType != null) && (!compressionType.equals("none"));
        precision = attrs.getValue("precision");

      }

      // <precursorMz>
      if (qName.equalsIgnoreCase("precursorMz")) {
        // clean the current char buffer for the new element
        charBuffer.setLength(0);
        // Forget the charge of any previous precursor so it cannot leak into this one
        precursorCharge = null;
        String precursorChargeAttr = attrs.getValue("precursorCharge");
        if (precursorChargeAttr != null) {
          precursorCharge = Integer.parseInt(precursorChargeAttr);
        }
      }

    }

    /**
     * Handles the end tags of {@code scan}, {@code precursorMz} and {@code peaks}.
     *
     * @throws SAXException if the peak data cannot be decompressed or holds fewer values than
     *         declared by the {@code peaksCount} attribute of the scan
     */
    @Override
    public void endElement(String namespaceURI, String sName, // simple name
        String qName // qualified name
    ) throws SAXException {

      // </scan>
      if (qName.equalsIgnoreCase("scan")) {
        // Continue with the enclosing scan, if there is one
        buildingScan = openParentScans.poll();
        if (buildingScan == null) {
          // The outermost <scan> is closed: hand over it and all of its nested scans
          SimpleMsScan finishedScan;
          while ((finishedScan = scansInDocumentOrder.poll()) != null) {
            newRawDataFile.addScan(finishedScan);
            parsedScans++;
          }
        }
        return;
      }

      // </precursorMz>
      if (qName.equalsIgnoreCase("precursorMz")) {
        final String textContent = charBuffer.toString().trim();
        // Missing or blank content is stored as precursor m/z 0
        double precursorMz = 0d;
        if (!textContent.isEmpty()) {
          precursorMz = Double.parseDouble(textContent);
        }
        IsolationInfo newIsolation = new SimpleIsolationInfo(Range.singleton(precursorMz), null,
            precursorMz, precursorCharge, null);
        buildingScan.getIsolations().add(newIsolation);

        return;
      }

      // </peaks>
      if (qName.equalsIgnoreCase("peaks")) {

        // Base64 decoder
        byte[] peakBytes = DatatypeConverter.parseBase64Binary(charBuffer.toString());

        if (compressFlag) {
          try {
            peakBytes = ZlibCompressionUtil.decompress(peakBytes);
          } catch (DataFormatException e) {
            throw new SAXException(e);
          }
        }

        // make a data input stream
        DataInputStream peakStream = new DataInputStream(new ByteArrayInputStream(peakBytes));

        // Grow the reusable buffers if this scan has more peaks than any scan before
        if (peaksCount > mzValues.length) {
          mzValues = new double[peaksCount];
          intensityValues = new float[peaksCount];
        }

        // Anything other than precision="64" is read as 32-bit floats
        final boolean doublePrecision = "64".equals(precision);

        try {
          for (int i = 0; i < peaksCount; i++) {

            // Always respect this order pairOrder="m/z-int"
            if (doublePrecision) {
              mzValues[i] = peakStream.readDouble();
              intensityValues[i] = (float) peakStream.readDouble();
            } else {
              mzValues[i] = (double) peakStream.readFloat();
              intensityValues[i] = peakStream.readFloat();
            }

          }
        } catch (IOException eof) {
          throw new SAXException(eof);
        }
        // Set the final data points to the scan
        buildingScan.setDataPoints(mzValues, intensityValues, peaksCount);

        // Auto-detect whether this scan is centroided
        MsSpectrumType spectrumType = SpectrumTypeDetectionAlgorithm.detectSpectrumType(mzValues,
            intensityValues, peaksCount);
        buildingScan.setSpectrumType(spectrumType);

        return;
      }
    }

    /**
     * Appends the received character data to the shared buffer. The buffer is cleared at the
     * start of every {@code peaks} and {@code precursorMz} element.
     *
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    @Override
    public void characters(char buf[], int offset, int len) throws SAXException {
      charBuffer.append(buf, offset, len);
    }
  }

}