MzXMLFileParser.java

package io.github.msdk.io.mzxml;

import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Base64;
import java.util.Date;
import java.util.zip.InflaterInputStream;

import javax.annotation.Nonnull;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Range;

import io.github.msdk.MSDKException;
import io.github.msdk.MSDKMethod;
import io.github.msdk.datamodel.impl.SimpleIsolationInfo;
import io.github.msdk.datamodel.impl.SimpleMsScan;
import io.github.msdk.datamodel.msspectra.MsSpectrumType;
import io.github.msdk.datamodel.rawdata.IsolationInfo;
import io.github.msdk.datamodel.rawdata.MsScanType;
import io.github.msdk.datamodel.rawdata.PolarityType;
import io.github.msdk.datamodel.rawdata.RawDataFile;
import io.github.msdk.spectra.spectrumtypedetection.SpectrumTypeDetectionAlgorithm;
import it.unimi.dsi.io.ByteBufferInputStream;
import javolution.text.CharArray;
import javolution.xml.internal.stream.XMLStreamReaderImpl;
import javolution.xml.stream.XMLStreamConstants;
import javolution.xml.stream.XMLStreamReader;

/**
 * Reads an mzXML file and converts it into a {@link RawDataFile}.
 *
 * <p>
 * The file is mapped through {@link MzXMLFileMemoryMapper} and walked once with a streaming XML
 * reader. Every {@code <scan>} element becomes a {@link SimpleMsScan}; its {@code <peaks>} payload
 * is Base64-decoded (and inflated when a compression type other than {@code none} is declared)
 * into m/z and intensity arrays.
 * </p>
 *
 * <p>
 * Parsing can be interrupted from another thread through {@link #cancel()}; progress is exposed
 * through {@link #getFinishedPercentage()}.
 * </p>
 */
public class MzXMLFileParser implements MSDKMethod<RawDataFile> {

  private final @Nonnull File mzXMLFile;
  private MzXMLRawDataFile newRawFile;
  private volatile boolean canceled;
  private Float progress;
  private int lastLoggedProgress;
  private final Logger logger;

  /** Scan whose element is currently open and not yet added to the raw file, or null. */
  private SimpleMsScan buildingScan;
  private DatatypeFactory dataTypeFactory;

  final static String TAG_MS_RUN = "msRun";
  final static String TAG_SCAN = "scan";
  final static String TAG_PEAKS = "peaks";
  final static String TAG_PRECURSOR_MZ = "precursorMz";

  /**
   * <p>Constructor for MzXMLFileParser.</p>
   *
   * @param mzXMLFile a {@link java.io.File} object.
   */
  public MzXMLFileParser(File mzXMLFile) {
    this.mzXMLFile = mzXMLFile;
    this.canceled = false;
    this.progress = 0f;
    this.lastLoggedProgress = 0;
    this.logger = LoggerFactory.getLogger(this.getClass());
  }

  /**
   * <p>Constructor for MzXMLFileParser.</p>
   *
   * @param mzXMLFileName a {@link java.lang.String} object.
   */
  public MzXMLFileParser(String mzXMLFileName) {
    this(new File(mzXMLFileName));
  }

  /**
   * <p>Constructor for MzXMLFileParser.</p>
   *
   * @param mzXMLFilePath a {@link java.nio.file.Path} object.
   */
  public MzXMLFileParser(Path mzXMLFilePath) {
    this(mzXMLFilePath.toFile());
  }

  /**
   * Parses the mzXML file given at construction time.
   *
   * @return the parsed {@link io.github.msdk.datamodel.rawdata.RawDataFile}, or {@code null} if
   *         {@link #cancel()} was called before parsing finished.
   * @throws io.github.msdk.MSDKException wrapping any error raised while mapping, reading or
   *         decoding the file.
   */
  public RawDataFile execute() throws MSDKException {

    try {
      MzXMLFileMemoryMapper mapper = new MzXMLFileMemoryMapper();
      ByteBufferInputStream is = mapper.mapToMemory(mzXMLFile);

      final XMLStreamReaderImpl xmlStreamReader = new XMLStreamReaderImpl();
      xmlStreamReader.setInput(is, "UTF-8");

      newRawFile = new MzXMLRawDataFile(mzXMLFile);
      dataTypeFactory = DatatypeFactory.newInstance();
      buildingScan = null;
      Vars vars = new Vars();

      int eventType;
      try {

        do {
          // Stop as soon as a cancel request is observed
          if (canceled)
            return null;

          eventType = xmlStreamReader.next();

          updateProgress(xmlStreamReader.getLocation().getCharacterOffset(), is.length());

          switch (eventType) {
            case XMLStreamConstants.START_ELEMENT:
              handleStartElement(xmlStreamReader, vars);
              break;

            case XMLStreamConstants.END_ELEMENT:
              handleEndElement(xmlStreamReader, vars);
              break;

            case XMLStreamConstants.CHARACTERS:
              if (vars.currentTag != null) {
                switch (vars.currentTag.toString()) {
                  case TAG_PEAKS:
                    // Do not copy the text: remember where it lives in the mapped file instead
                    vars.peaksChars = new ByteBufferInputStreamAdapter(is.copy(), vars.peaksStart,
                        xmlStreamReader.getTextLength());
                    break;
                  case TAG_PRECURSOR_MZ:
                    if (buildingScan != null) {
                      final double precursorMz = xmlStreamReader.getText().toDouble();
                      IsolationInfo newIsolation =
                          new SimpleIsolationInfo(Range.singleton(precursorMz), null, precursorMz,
                              vars.precursorCharge, null);
                      buildingScan.getIsolations().add(newIsolation);
                    }
                    break;
                }
              }
              break;
          }

        } while (eventType != XMLStreamConstants.END_DOCUMENT);

        // The whole document has been consumed
        progress = 1f;

      } finally {
        xmlStreamReader.close();
      }
    } catch (Exception e) {
      throw (new MSDKException(e));
    }
    return newRawFile;

  }

  /**
   * Stores the parsing progress and logs it each time another 10% has been completed.
   *
   * @param characterOffset current offset reported by the XML reader.
   * @param totalLength total length of the mapped input.
   */
  private void updateProgress(long characterOffset, long totalLength) {
    // Avoid NaN/Infinity for an empty input
    if (totalLength <= 0)
      return;

    final float fraction = (float) characterOffset / totalLength;
    progress = fraction;

    if ((int) (fraction * 100) >= lastLoggedProgress + 10) {
      lastLoggedProgress = (int) (fraction * 10) * 10;
      logger.debug("Parsing in progress... {}% completed", lastLoggedProgress);
    }
  }

  /**
   * Dispatches an opening tag to the matching handler and records it as the current tag.
   */
  private void handleStartElement(XMLStreamReader xmlStreamReader, Vars vars) {
    final CharArray openingTagName = xmlStreamReader.getLocalName();
    vars.currentTag = openingTagName;

    if (openingTagName.contentEquals(TAG_SCAN)) {
      startScan(xmlStreamReader, vars);

    } else if (openingTagName.contentEquals(TAG_PEAKS)) {
      vars.compressionFlag = false;
      CharArray compressionType = xmlStreamReader.getAttributeValue(null, "compressionType");
      if (compressionType != null && !compressionType.contentEquals("none"))
        vars.compressionFlag = true;

      CharArray precision = getRequiredAttribute(xmlStreamReader, "precision");
      vars.precision = precision.toString();

      vars.peaksStart = xmlStreamReader.getLocation().getCharacterOffset();
      // Forget the payload of the previous scan; an empty <peaks/> produces no character event
      vars.peaksChars = null;

    } else if (openingTagName.contentEquals(TAG_PRECURSOR_MZ)) {
      CharArray precursorCharge = xmlStreamReader.getAttributeValue(null, "precursorCharge");
      // Reset explicitly so a missing attribute does not inherit an earlier precursor's charge
      if (precursorCharge != null) {
        vars.precursorCharge = precursorCharge.toInt();
      } else {
        vars.precursorCharge = null;
      }
    }
  }

  /**
   * Creates the scan under construction from the attributes of a {@code <scan>} opening tag.
   */
  private void startScan(XMLStreamReader xmlStreamReader, Vars vars) {
    CharArray scanNumber = getRequiredAttribute(xmlStreamReader, "num");
    CharArray msLevel = getRequiredAttribute(xmlStreamReader, "msLevel");
    CharArray peaksCount = getRequiredAttribute(xmlStreamReader, "peaksCount");

    int scanNumberInt = scanNumber.toInt();
    int msLevelInt = msLevel.toInt();
    vars.peaksCount = peaksCount.toInt();

    CharArray msFuncName = xmlStreamReader.getAttributeValue(null, "scanType");

    // A <scan> may be nested inside another <scan>. The mzXML schema places nested scans after
    // the parent's <peaks>, so the parent is complete here: store it before it is replaced,
    // otherwise the parent would be lost and the child added twice.
    if (buildingScan != null) {
      newRawFile.addScan(buildingScan);
    }

    buildingScan = new SimpleMsScan(scanNumberInt);

    // msLevel is a required attribute; it must not depend on the optional scanType attribute
    buildingScan.setRawDataFile(newRawFile);
    buildingScan.setMsLevel(msLevelInt);

    // MS function
    if (msFuncName != null) {
      buildingScan.setMsFunction(msFuncName.toString());

      // Scan type & definition
      buildingScan.setMsScanType(MsScanType.UNKNOWN);
      buildingScan.setScanDefinition(null);
    }

    // Polarity
    PolarityType polarity = PolarityType.UNKNOWN;
    CharArray polarityAttr = xmlStreamReader.getAttributeValue(null, "polarity");
    if (polarityAttr != null && polarityAttr.length() > 0) {
      switch (polarityAttr.charAt(0)) {
        case '+':
          polarity = PolarityType.POSITIVE;
          break;
        case '-':
          polarity = PolarityType.NEGATIVE;
          break;
      }
    }
    buildingScan.setPolarity(polarity);

    // Retention time is stored as an XML duration; convert it to seconds
    CharArray retentionTimeStr = xmlStreamReader.getAttributeValue(null, "retentionTime");
    if (retentionTimeStr != null) {
      Date currentDate = new Date();
      Duration dur = dataTypeFactory.newDuration(retentionTimeStr.toString());
      final float rt = (float) (dur.getTimeInMillis(currentDate) / 1000.0);
      buildingScan.setRetentionTime(rt);
    }
  }

  /**
   * Handles a closing tag: finishes the scan or decodes the collected peak data.
   *
   * @throws IOException if the peak payload cannot be decoded.
   */
  private void handleEndElement(XMLStreamReader xmlStreamReader, Vars vars) throws IOException {
    final CharArray closingTagName = xmlStreamReader.getLocalName();

    // Text following this closing tag no longer belongs to the element that was open
    vars.currentTag = null;

    switch (closingTagName.toString()) {
      case TAG_SCAN:
        // Null when this scan was already stored because a nested scan started inside it
        if (buildingScan != null) {
          newRawFile.addScan(buildingScan);
          buildingScan = null;
        }
        break;
      case TAG_PEAKS:
        finishPeaks(vars);
        break;
    }
  }

  /**
   * Decodes the Base64 (optionally compressed) peak payload and attaches it to the current scan.
   *
   * @throws IOException if the payload is malformed or shorter than the declared peak count.
   */
  private void finishPeaks(Vars vars) throws IOException {
    if (buildingScan == null)
      return;

    final int peaksCount = vars.peaksCount;
    double[] mzValues = new double[peaksCount];
    float[] intensityValues = new float[peaksCount];

    if (peaksCount > 0) {
      if (vars.peaksChars == null)
        throw new IllegalStateException(
            "Scan declares " + peaksCount + " peaks but its peaks element contains no data");

      // Base64 decoder, followed by decompression if the array is compressed
      InputStream decodedIs = Base64.getDecoder().wrap(vars.peaksChars);
      InputStream payload = vars.compressionFlag ? new InflaterInputStream(decodedIs) : decodedIs;

      // Closing the outer stream closes the whole chain and releases the Inflater
      try (DataInputStream peakStream = new DataInputStream(payload)) {
        final boolean doublePrecision = "64".equals(vars.precision);

        for (int i = 0; i < peaksCount; i++) {
          // Always respect this order pairOrder="m/z-int"
          if (doublePrecision) {
            mzValues[i] = peakStream.readDouble();
            intensityValues[i] = (float) peakStream.readDouble();
          } else {
            mzValues[i] = (double) peakStream.readFloat();
            intensityValues[i] = peakStream.readFloat();
          }
        }
      }
    }
    vars.peaksChars = null;

    // Set the final data points to the scan
    buildingScan.setDataPoints(mzValues, intensityValues, peaksCount);

    // Auto-detect whether this scan is centroided
    MsSpectrumType spectrumType =
        SpectrumTypeDetectionAlgorithm.detectSpectrumType(mzValues, intensityValues, peaksCount);
    buildingScan.setSpectrumType(spectrumType);
  }

  /**
   * <p>
   * Gets the required attribute from xmlStreamReader, throws an exception if the attribute is not
   * found
   * </p>
   *
   * @return a CharArray containing the value of the attribute.
   * @param xmlStreamReader a {@link javolution.xml.stream.XMLStreamReader} object.
   * @param attr a {@link java.lang.String} object.
   * @throws IllegalStateException if the current element does not carry the attribute.
   */
  public CharArray getRequiredAttribute(XMLStreamReader xmlStreamReader, String attr) {
    CharArray attrValue = xmlStreamReader.getAttributeValue(null, attr);
    if (attrValue == null)
      throw new IllegalStateException("Tag " + xmlStreamReader.getLocalName() + " must provide an `"
          + attr + "`attribute (Line " + xmlStreamReader.getLocation().getLineNumber() + ")");
    return attrValue;
  }

  /** {@inheritDoc} */
  @Override
  public Float getFinishedPercentage() {
    return progress;
  }

  /** {@inheritDoc} */
  @Override
  public RawDataFile getResult() {
    return newRawFile;
  }

  /** {@inheritDoc} */
  @Override
  public void cancel() {
    this.canceled = true;
  }

}



/**
 * Mutable scratch state that {@link MzXMLFileParser} carries from one XML event to the next while
 * it walks an mzXML document.
 */
class Vars {

  /** Value of the {@code precision} attribute of the current peaks element. */
  String precision = null;

  /** Value of the {@code precursorCharge} attribute, or null when none has been read. */
  Integer precursorCharge = null;

  /** Value of the {@code peaksCount} attribute of the current scan. */
  int peaksCount = 0;

  /** True when the current peaks element declares a compression type other than "none". */
  boolean compressionFlag = false;

  /** Local name of the most recently opened element. */
  CharArray currentTag = null;

  /** Reader character offset recorded when the current peaks element was opened. */
  Integer peaksStart = 0;

  /** Stream over the encoded text of the current peaks element. */
  InputStream peaksChars = null;

  /** Creates an empty state holder; every field starts at its declared default. */
  Vars() {}
}