MzMLParser.java

/*
 * (C) Copyright 2015-2017 by MSDK Development Team
 *
 * This software is dual-licensed under either
 *
 * (a) the terms of the GNU Lesser General Public License version 2.1 as published by the Free
 * Software Foundation
 *
 * or (per the licensee's choosing)
 *
 * (b) the terms of the Eclipse Public License v1.0 as published by the Eclipse Foundation.
 */

package io.github.msdk.io.mzml.data;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;

import io.github.msdk.datamodel.chromatograms.Chromatogram;
import io.github.msdk.datamodel.rawdata.MsScan;
import io.github.msdk.io.mzml.MzMLFileImportMethod;
import io.github.msdk.io.mzml.util.TagTracker;
import javolution.text.CharArray;
import javolution.xml.internal.stream.XMLStreamReaderImpl;
import javolution.xml.stream.XMLStreamConstants;
import javolution.xml.stream.XMLStreamReader;

/**
 * <p>
 * Used to parse mzML meta-data and initialize {@link io.github.msdk.io.mzml.data.MzMLBinaryDataInfo
 * MzMLBinaryDataInfo}
 * </p>
 */
public class MzMLParser {

  private Vars vars;
  private TagTracker tracker;
  private final MzMLRawDataFile newRawFile;
  private final MzMLFileImportMethod importer;

  /**
   * <p>
   * Constructor for {@link io.github.msdk.io.mzml.data.MzMLParser MzMLParser}
   * </p>
   *
   * @param importer an instance of an initialized
   *        {@link io.github.msdk.io.mzml.MzMLFileImportMethod MzMLFileImportMethod}
   */
  public MzMLParser(MzMLFileImportMethod importer) {
    this.vars = new Vars();
    this.tracker = new TagTracker();
    this.importer = importer;
    this.newRawFile = new MzMLRawDataFile(importer.getMzMLFile(), vars.msFunctionsList,
        vars.spectrumList, vars.chromatogramsList);
  }

  /**
   * <p>
   * Carry out the required parsing of the mzML data when the
   * {@link javolution.xml.internal.stream.XMLStreamReaderImpl XMLStreamReaderImpl} enters the given
   * tag. The tag is first registered with the tag tracker and then handed to the handler of each
   * section the reader is currently inside: the referenceable param group list, and either the
   * spectrum list or the chromatogram list.
   * </p>
   *
   * @param xmlStreamReader an instance of {@link javolution.xml.internal.stream.XMLStreamReaderImpl
   *        XMLStreamReaderImpl} positioned on the opening tag
   * @param is {@link java.io.InputStream InputStream} of the mzML data
   * @param openingTagName The tag <code>xmlStreamReader</code> entered
   * @throws IllegalStateException if the tag lacks an attribute that the parser requires
   */
  public void processOpeningTag(XMLStreamReaderImpl xmlStreamReader, InputStream is,
      CharArray openingTagName) {
    tracker.enter(openingTagName);

    if (tracker.inside(MzMLTags.TAG_REF_PARAM_GROUP_LIST)) {
      openReferenceableParamGroupTag(xmlStreamReader, openingTagName);
    }

    if (tracker.inside(MzMLTags.TAG_SPECTRUM_LIST)) {
      openSpectrumListTag(xmlStreamReader, is, openingTagName);
    } else if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM_LIST)) {
      openChromatogramListTag(xmlStreamReader, is, openingTagName);
    }
  }

  /** Handles an opening tag found inside the referenceable param group list. */
  private void openReferenceableParamGroupTag(XMLStreamReaderImpl xmlStreamReader,
      CharArray openingTagName) {
    if (openingTagName.contentEquals(MzMLTags.TAG_REF_PARAM_GROUP)) {
      final CharArray id = getRequiredAttribute(xmlStreamReader, "id");
      vars.referenceableParamGroup = new MzMLReferenceableParamGroup(id.toString());

    } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
      MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
      vars.referenceableParamGroup.addCVParam(cvParam);
    }
  }

  /** Handles an opening tag found inside the spectrum list. */
  private void openSpectrumListTag(XMLStreamReaderImpl xmlStreamReader, InputStream is,
      CharArray openingTagName) {
    if (openingTagName.contentEquals(MzMLTags.TAG_SPECTRUM)) {
      String id = getRequiredAttribute(xmlStreamReader, "id").toString();
      Integer index = getRequiredAttribute(xmlStreamReader, "index").toInt();
      vars.defaultArrayLength =
          getRequiredAttribute(xmlStreamReader, "defaultArrayLength").toInt();
      // Use the scan number embedded in the id when there is one, else the 1-based index
      Integer scanNumber = getScanNumber(id).orElse(index + 1);
      vars.spectrum = new MzMLMsScan(newRawFile, is, id, scanNumber, vars.defaultArrayLength);

    } else if (openingTagName.contentEquals(MzMLTags.TAG_BINARY_DATA_ARRAY)) {
      startBinaryDataArray(xmlStreamReader);

    } else if (openingTagName.contentEquals(MzMLTags.TAG_SCAN)) {
      vars.scan = new MzMLScan();

    } else if (openingTagName.contentEquals(MzMLTags.TAG_SCAN_WINDOW_LIST)) {
      vars.scanWindowList = new MzMLScanWindowList();

    } else if (openingTagName.contentEquals(MzMLTags.TAG_SCAN_WINDOW)) {
      vars.scanWindow = new MzMLScanWindow();

    } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
      if (!tracker.inside(MzMLTags.TAG_BINARY_DATA_ARRAY_LIST)
          && !tracker.inside(MzMLTags.TAG_PRODUCT_LIST)
          && !tracker.inside(MzMLTags.TAG_PRECURSOR_LIST)
          && !tracker.inside(MzMLTags.TAG_SCAN_LIST) && vars.spectrum != null) {
        MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);

        // do not import the tic from the mzml file, we can calculate it on demand later
        if (!cvParam.getAccession().equals(MzMLCV.cvTIC)) {
          vars.spectrum.getCVParams().addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_SCAN_LIST)) {
        // Attach the param to the innermost enclosing element: scan window, scan or scan list
        MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
        if (tracker.inside(MzMLTags.TAG_SCAN_WINDOW)) {
          vars.scanWindow.addCVParam(cvParam);
        } else if (tracker.inside(MzMLTags.TAG_SCAN)) {
          vars.scan.addCVParam(cvParam);
        } else {
          vars.spectrum.getScanList().addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_SPECTRUM)
          && tracker.inside(MzMLTags.TAG_BINARY_DATA_ARRAY) && !vars.skipBinaryDataArray) {
        applyBinaryDataArrayParam(xmlStreamReader);
      }

    } else if (openingTagName.contentEquals(MzMLTags.TAG_BINARY)) {
      if (vars.spectrum != null && !vars.skipBinaryDataArray) {
        storeBinaryDataPosition(xmlStreamReader);
      }
      if (!vars.skipBinaryDataArray) {
        if (MzMLCV.cvMzArray.equals(vars.binaryDataInfo.getArrayType().getAccession())) {
          vars.spectrum.setMzBinaryDataInfo(vars.binaryDataInfo);
        }
        if (MzMLCV.cvIntensityArray.equals(vars.binaryDataInfo.getArrayType().getAccession())) {
          vars.spectrum.setIntensityBinaryDataInfo(vars.binaryDataInfo);
        }
      }

    } else if (openingTagName.contentEquals(MzMLTags.TAG_REF_PARAM_GROUP_REF)) {
      String refValue = getRequiredAttribute(xmlStreamReader, "ref").toString();
      for (MzMLReferenceableParamGroup ref : vars.referenceableParamGroupList) {
        if (ref.getParamGroupName().equals(refValue)) {
          vars.spectrum.getCVParams().getCVParamsList().addAll(ref.getCVParamsList());
          break;
        }
      }
    }

    if (tracker.inside(MzMLTags.TAG_PRECURSOR_LIST)) {

      if (openingTagName.contentEquals(MzMLTags.TAG_PRECURSOR)) {
        final CharArray spectrumRef =
            xmlStreamReader.getAttributeValue(null, MzMLTags.ATTR_SPECTRUM_REF);
        String spectrumRefString = spectrumRef == null ? null : spectrumRef.toString();
        vars.precursor = new MzMLPrecursorElement(spectrumRefString);

      } else if (openingTagName.contentEquals(MzMLTags.TAG_ISOLATION_WINDOW)) {
        vars.isolationWindow = new MzMLIsolationWindow();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_SELECTED_ION_LIST)) {
        vars.selectedIonList = new MzMLPrecursorSelectedIonList();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_ACTIVATION)) {
        vars.activation = new MzMLPrecursorActivation();

      } else if (tracker.inside(MzMLTags.TAG_ISOLATION_WINDOW)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.isolationWindow.addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_SELECTED_ION_LIST)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_SELECTED_ION)) {
          vars.selectedIon = new MzMLPrecursorSelectedIon();
        } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.selectedIon.addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_ACTIVATION)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.activation.addCVParam(cvParam);
        }
      }
    }

    if (tracker.inside(MzMLTags.TAG_PRODUCT_LIST)) {
      if (openingTagName.contentEquals(MzMLTags.TAG_PRODUCT)) {
        // The closing-tag handler stores the isolation window on vars.product and adds
        // vars.product to the spectrum, so a product must exist before its children are parsed
        vars.product = new MzMLProduct();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_ISOLATION_WINDOW)) {
        vars.isolationWindow = new MzMLIsolationWindow();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
        MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
        vars.isolationWindow.addCVParam(cvParam);
      }
    }
  }

  /** Handles an opening tag found inside the chromatogram list. */
  private void openChromatogramListTag(XMLStreamReaderImpl xmlStreamReader, InputStream is,
      CharArray openingTagName) {
    if (openingTagName.contentEquals(MzMLTags.TAG_CHROMATOGRAM)) {
      String chromatogramId = getRequiredAttribute(xmlStreamReader, "id").toString();
      Integer chromatogramNumber = getRequiredAttribute(xmlStreamReader, "index").toInt() + 1;
      vars.defaultArrayLength =
          getRequiredAttribute(xmlStreamReader, "defaultArrayLength").toInt();
      vars.chromatogram = new MzMLChromatogram(newRawFile, is, chromatogramId, chromatogramNumber,
          vars.defaultArrayLength);

    } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
      if (!tracker.inside(MzMLTags.TAG_BINARY_DATA_ARRAY)
          && !tracker.inside(MzMLTags.TAG_PRECURSOR) && !tracker.inside(MzMLTags.TAG_PRODUCT)
          && vars.chromatogram != null) {
        MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
        vars.chromatogram.getCVParams().addCVParam(cvParam);
      }

    } else if (openingTagName.contentEquals(MzMLTags.TAG_BINARY_DATA_ARRAY)) {
      startBinaryDataArray(xmlStreamReader);

    } else if (openingTagName.contentEquals(MzMLTags.TAG_BINARY)) {
      if (vars.chromatogram != null && !vars.skipBinaryDataArray) {
        storeBinaryDataPosition(xmlStreamReader);
      }
      if (!vars.skipBinaryDataArray) {
        if (MzMLCV.cvRetentionTimeArray
            .equals(vars.binaryDataInfo.getArrayType().getAccession())) {
          vars.chromatogram.setRtBinaryDataInfo(vars.binaryDataInfo);
        }
        if (MzMLCV.cvIntensityArray.equals(vars.binaryDataInfo.getArrayType().getAccession())) {
          vars.chromatogram.setIntensityBinaryDataInfo(vars.binaryDataInfo);
        }
      }

    } else if (openingTagName.contentEquals(MzMLTags.TAG_REF_PARAM_GROUP_REF)) {
      // Same required-attribute check as the spectrum branch, instead of a bare NPE
      String refValue = getRequiredAttribute(xmlStreamReader, "ref").toString();
      for (MzMLReferenceableParamGroup ref : vars.referenceableParamGroupList) {
        if (ref.getParamGroupName().equals(refValue)) {
          vars.chromatogram.getCVParams().getCVParamsList().addAll(ref.getCVParamsList());
          break;
        }
      }
    }

    if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM)
        && tracker.inside(MzMLTags.TAG_BINARY_DATA_ARRAY)
        && openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM) && vars.binaryDataInfo != null
        && !vars.skipBinaryDataArray) {
      applyBinaryDataArrayParam(xmlStreamReader);
    }

    if (openingTagName.contentEquals(MzMLTags.TAG_PRECURSOR)) {
      final CharArray spectrumRef = xmlStreamReader.getAttributeValue(null, "spectrumRef");
      String spectrumRefString = spectrumRef == null ? null : spectrumRef.toString();
      vars.precursor = new MzMLPrecursorElement(spectrumRefString);

    } else if (openingTagName.contentEquals(MzMLTags.TAG_PRODUCT)) {
      vars.product = new MzMLProduct();

    } else if (tracker.inside(MzMLTags.TAG_PRECURSOR)) {
      if (openingTagName.contentEquals(MzMLTags.TAG_ISOLATION_WINDOW)) {
        vars.isolationWindow = new MzMLIsolationWindow();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_SELECTED_ION_LIST)) {
        // Start a fresh list per selectedIonList element so that selected ions are never appended
        // to a list left over from a previously parsed precursor
        vars.selectedIonList = new MzMLPrecursorSelectedIonList();

      } else if (openingTagName.contentEquals(MzMLTags.TAG_ACTIVATION)) {
        vars.activation = new MzMLPrecursorActivation();

      } else if (tracker.inside(MzMLTags.TAG_ISOLATION_WINDOW)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.isolationWindow.addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_SELECTED_ION_LIST)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_SELECTED_ION)) {
          vars.selectedIon = new MzMLPrecursorSelectedIon();
        } else if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.selectedIon.addCVParam(cvParam);
        }

      } else if (tracker.inside(MzMLTags.TAG_ACTIVATION)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.activation.addCVParam(cvParam);
        }
      }
    } else if (tracker.inside(MzMLTags.TAG_PRODUCT)) {
      if (openingTagName.contentEquals(MzMLTags.TAG_ISOLATION_WINDOW)) {
        vars.isolationWindow = new MzMLIsolationWindow();

      } else if (tracker.inside(MzMLTags.TAG_ISOLATION_WINDOW)) {
        if (openingTagName.contentEquals(MzMLTags.TAG_CV_PARAM)) {
          MzMLCVParam cvParam = createMzMLCVParam(xmlStreamReader);
          vars.isolationWindow.addCVParam(cvParam);
        }
      }
    }
  }

  /**
   * Starts a new binary data array: clears the skip flag and creates the binary data info from the
   * required <code>encodedLength</code> attribute and the optional <code>arrayLength</code>
   * attribute, falling back to the default array length of the enclosing element.
   */
  private void startBinaryDataArray(XMLStreamReaderImpl xmlStreamReader) {
    vars.skipBinaryDataArray = false;
    int encodedLength = getRequiredAttribute(xmlStreamReader, "encodedLength").toInt();
    final CharArray arrayLength = xmlStreamReader.getAttributeValue(null, "arrayLength");
    int length = arrayLength != null ? arrayLength.toInt() : vars.defaultArrayLength;
    vars.binaryDataInfo = new MzMLBinaryDataInfo(encodedLength, length);
  }

  /**
   * Applies a cvParam of the current binary data array to its binary data info. An accession that
   * is neither a bit length, a compression type nor an array type marks the array as skipped.
   */
  private void applyBinaryDataArrayParam(XMLStreamReaderImpl xmlStreamReader) {
    String accession = getRequiredAttribute(xmlStreamReader, "accession").toString();
    if (vars.binaryDataInfo.isBitLengthAccession(accession)) {
      vars.binaryDataInfo.setBitLength(accession);
    } else if (vars.binaryDataInfo.isCompressionTypeAccession(accession)) {
      manageCompression(vars.binaryDataInfo, accession);
    } else if (vars.binaryDataInfo.isArrayTypeAccession(accession)) {
      vars.binaryDataInfo.setArrayType(accession);
    } else {
      vars.skipBinaryDataArray = true;
    }
  }

  /** Stores the reader's character offset plus the BOM length as the binary data position. */
  private void storeBinaryDataPosition(XMLStreamReaderImpl xmlStreamReader) {
    int bomOffset = xmlStreamReader.getLocation().getBomLength();
    // TODO Fetch long value from getCharacterOffset()
    vars.binaryDataInfo
        .setPosition(xmlStreamReader.getLocation().getCharacterOffset() + bomOffset);
  }

  /**
   * <p>
   * Carry out the required parsing of the mzML data when the
   * {@link javolution.xml.internal.stream.XMLStreamReaderImpl XMLStreamReaderImpl} exits the given
   * tag. The tag is first removed from the tag tracker; the element that was just completed is then
   * attached to its parent element or, for spectra and chromatograms, added to the result lists.
   * </p>
   *
   * @param xmlStreamReader an instance of {@link javolution.xml.internal.stream.XMLStreamReaderImpl
   *        XMLStreamReaderImpl} (not read by this method)
   * @param closingTagName a {@link javolution.text.CharArray} object.
   */
  public void processClosingTag(XMLStreamReaderImpl xmlStreamReader, CharArray closingTagName) {
    tracker.exit(closingTagName);

    if (closingTagName.equals(MzMLTags.TAG_REF_PARAM_GROUP)) {
      vars.referenceableParamGroupList.add(vars.referenceableParamGroup);
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_ISOLATION_WINDOW)) {
      // An isolation window belongs either to a precursor or to a product
      if (tracker.inside(MzMLTags.TAG_PRECURSOR)) {
        vars.precursor.setIsolationWindow(vars.isolationWindow);
      } else if (tracker.inside(MzMLTags.TAG_PRODUCT)) {
        vars.product.setIsolationWindow(vars.isolationWindow);
      }
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_PRODUCT)) {
      if (tracker.inside(MzMLTags.TAG_SPECTRUM)) {
        vars.spectrum.getProductList().addProduct(vars.product);
      } else if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM)) {
        vars.chromatogram.setProdcut(vars.product);
      }
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_SELECTED_ION_LIST)) {
      vars.precursor.setSelectedIonList(vars.selectedIonList);
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_ACTIVATION)) {
      vars.precursor.setActivation(vars.activation);
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_SELECTED_ION)) {
      vars.selectedIonList.addSelectedIon(vars.selectedIon);
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_PRECURSOR)) {
      if (tracker.inside(MzMLTags.TAG_SPECTRUM)) {
        vars.spectrum.getPrecursorList().addPrecursor(vars.precursor);
      } else if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM)) {
        vars.chromatogram.setPrecursor(vars.precursor);
      }
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_SCAN_WINDOW)) {
      vars.scanWindowList.addScanWindow(vars.scanWindow);
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_SCAN_WINDOW_LIST)) {
      if (tracker.inside(MzMLTags.TAG_SPECTRUM)) {
        vars.scan.setScanWindowList(vars.scanWindowList);
      }
      return;
    }

    if (closingTagName.equals(MzMLTags.TAG_SCAN)) {
      if (tracker.inside(MzMLTags.TAG_SPECTRUM)) {
        vars.spectrum.getScanList().addScan(vars.scan);
      }
      return;
    }

    if (tracker.inside(MzMLTags.TAG_SPECTRUM_LIST)) {
      if (closingTagName.contentEquals(MzMLTags.TAG_SPECTRUM)) {
        keepSpectrumIfUsable();
      }
      return;
    }

    if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM_LIST)) {
      if (closingTagName.contentEquals(MzMLTags.TAG_CHROMATOGRAM)) {
        keepChromatogramIfUsable();
      }
    }
  }

  /**
   * Adds the current spectrum to the spectrum list when it has both m/z and intensity binary data
   * info and either the importer has an mzML file or the importer's scan predicate accepts it.
   */
  private void keepSpectrumIfUsable() {
    final MzMLMsScan finished = vars.spectrum;
    if (finished.getMzBinaryDataInfo() == null
        || finished.getIntensityBinaryDataInfo() == null) {
      return;
    }
    if (importer.getMzMLFile() != null || importer.getMsScanPredicate().test(finished)) {
      vars.spectrumList.add(finished);
    }
  }

  /**
   * Adds the current chromatogram to the chromatogram list when it has both retention time and
   * intensity binary data info and either the importer has an mzML file or the importer's
   * chromatogram predicate accepts it.
   */
  private void keepChromatogramIfUsable() {
    final MzMLChromatogram finished = vars.chromatogram;
    if (finished.getRtBinaryDataInfo() == null
        || finished.getIntensityBinaryDataInfo() == null) {
      return;
    }
    if (importer.getMzMLFile() != null || importer.getChromatogramPredicate().test(finished)) {
      vars.chromatogramsList.add(finished);
    }
  }

  /**
   * <p>
   * Carry out the required parsing of the mzML data when the
   * {@link javolution.xml.internal.stream.XMLStreamReaderImpl XMLStreamReaderImpl} finds
   * {@link javolution.xml.stream.XMLStreamConstants#CHARACTERS CHARACTERS}.
   * </p>
   * <p>
   * Nothing is done unless the raw data file has no original file, the current tag is
   * <code>&lt;binary&gt;</code> and the current binary data array is not being skipped. In that
   * case the text of the element becomes the input stream of the current spectrum or chromatogram
   * (when the importer's predicate accepts it) and the matching value getter is invoked right
   * away.
   * </p>
   *
   * @param xmlStreamReader an instance of {@link javolution.xml.internal.stream.XMLStreamReaderImpl
   *        XMLStreamReaderImpl} positioned on the character data
   */
  public void processCharacters(XMLStreamReaderImpl xmlStreamReader) {
    if (newRawFile.getOriginalFile().isPresent()
        || !tracker.current().contentEquals(MzMLTags.TAG_BINARY) || vars.skipBinaryDataArray) {
      return;
    }

    // NOTE(review): the getters below are called only for their effect; presumably they decode
    // and cache the values from the stream that was just set - confirm against MzMLMsScan and
    // MzMLChromatogram
    if (tracker.inside(MzMLTags.TAG_SPECTRUM_LIST)
        && importer.getMsScanPredicate().test(vars.spectrum)) {
      vars.spectrum.setInputStream(IOUtils.toInputStream(xmlStreamReader.getText()));
      switch (vars.binaryDataInfo.getArrayType().getAccession()) {
        case MzMLCV.cvMzArray:
          vars.spectrum.getMzValues();
          break;

        case MzMLCV.cvIntensityArray:
          vars.spectrum.getIntensityValues();
          break;
      }
    } else if (tracker.inside(MzMLTags.TAG_CHROMATOGRAM_LIST)
        && importer.getChromatogramPredicate().test(vars.chromatogram)) {
      vars.chromatogram.setInputStream(IOUtils.toInputStream(xmlStreamReader.getText()));
      switch (vars.binaryDataInfo.getArrayType().getAccession()) {
        case MzMLCV.cvRetentionTimeArray:
          vars.chromatogram.getRetentionTimes();
          break;

        case MzMLCV.cvIntensityArray:
          // Read the intensity values, mirroring the spectrum branch; the binary data info
          // getter called here previously had no effect on the freshly set input stream
          vars.chromatogram.getIntensityValues();
          break;
      }
    }
  }

  /**
   * <p>
   * Builds the object form of the <code>&lt;cvParam&gt;</code> tag the <code>xmlStreamReader</code>
   * is currently positioned on. Only the accession is mandatory; value, name and unit accession
   * are passed on as <code>null</code> when the tag does not carry them.
   * </p>
   *
   * @param xmlStreamReader an instance of {@link javolution.xml.stream.XMLStreamReader
   *        XMLStreamReader} positioned on a <code>&lt;cvParam&gt;</code> tag
   * @return {@link io.github.msdk.io.mzml.data.MzMLCVParam MzMLCVParam} object notation of the
   *         <code>&lt;cvParam&gt;</code> entered
   * @throws IllegalStateException if the tag has no accession attribute
   */
  private MzMLCVParam createMzMLCVParam(XMLStreamReader xmlStreamReader) {
    final CharArray accessionAttr =
        xmlStreamReader.getAttributeValue(null, MzMLTags.ATTR_ACCESSION);
    final CharArray valueAttr = xmlStreamReader.getAttributeValue(null, MzMLTags.ATTR_VALUE);
    final CharArray nameAttr = xmlStreamReader.getAttributeValue(null, MzMLTags.ATTR_NAME);
    final CharArray unitAttr =
        xmlStreamReader.getAttributeValue(null, MzMLTags.ATTR_UNIT_ACCESSION);

    // The accession is the only mandatory attribute
    if (accessionAttr == null) {
      throw new IllegalStateException("Any cvParam must have an accession.");
    }

    // The remaining attributes are optional and stay null when absent
    String valueText = null;
    if (valueAttr != null) {
      valueText = valueAttr.toString();
    }
    String nameText = null;
    if (nameAttr != null) {
      nameText = nameAttr.toString();
    }
    String unitText = null;
    if (unitAttr != null) {
      unitText = unitAttr.toString();
    }

    return new MzMLCVParam(accessionAttr.toString(), valueText, nameText, unitText);
  }


  /** Matches the <code>scan=NNN</code> token that some vendors embed in the spectrum id. */
  private static final Pattern SCAN_NUMBER_PATTERN = Pattern.compile("scan=([0-9]+)");

  /**
   * <p>
   * Extracts the scan number from a spectrum id of the form <code>... scan=NNN ...</code>.
   * </p>
   * <p>
   * Some vendors include scan=XX in the ID, some don't, such as mzML converted from WIFF files. See
   * the definition of nativeID in
   * http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo
   * Callers are expected to fall back to another value (e.g. the spectrum index) when the returned
   * {@link java.util.Optional Optional} is empty.
   * </p>
   *
   * @param spectrumId a {@link java.lang.String} object.
   * @return the scan number found in the id, or an empty {@link java.util.Optional Optional} when
   *         the id has no <code>scan=</code> token or its digits do not fit into an
   *         <code>int</code>
   */
  public Optional<Integer> getScanNumber(String spectrumId) {
    final Matcher matcher = SCAN_NUMBER_PATTERN.matcher(spectrumId);
    if (!matcher.find()) {
      return Optional.empty();
    }

    try {
      return Optional.of(Integer.valueOf(matcher.group(1)));
    } catch (NumberFormatException e) {
      // Only digits can reach this point, so the failure means the number overflows an int;
      // treat the id as carrying no usable scan number
      return Optional.empty();
    }
  }

  /**
   * <p>
   * Gets the required attribute from xmlStreamReader, throws an exception if the attribute is not
   * found
   * </p>
   *
   * @param xmlStreamReader XMLStreamReader instance used to parse
   * @param attr Name of the attribute whose value is to be found
   * @return a CharArray containing the value of the attribute.
   * @throws IllegalStateException if the current tag does not carry the attribute; the message
   *         names the tag, the attribute and the line number reported by the reader
   */
  public CharArray getRequiredAttribute(XMLStreamReader xmlStreamReader, String attr) {
    CharArray attrValue = xmlStreamReader.getAttributeValue(null, attr);
    if (attrValue == null) {
      throw new IllegalStateException("Tag " + xmlStreamReader.getLocalName() + " must provide an `"
          + attr + "` attribute (Line " + xmlStreamReader.getLocation().getLineNumber() + ")");
    }
    return attrValue;
  }

  /**
   * <p>
   * Updates the compression type of <code>binaryInfo</code> for one more compression accession.
   * </p>
   * <p>
   * While the binary info still reports no compression, the accession is applied directly.
   * Otherwise the already stored type and the type of the new accession are merged: when the new
   * accession is zlib the stored type is promoted to its zlib-combined variant, in every other case
   * the type of the new accession is promoted. Types without a zlib-combined variant leave the
   * binary info unchanged.
   * </p>
   *
   * @param binaryInfo a {@link io.github.msdk.io.mzml.data.MzMLBinaryDataInfo} object.
   * @param accession a {@link java.lang.String} object.
   */
  public void manageCompression(MzMLBinaryDataInfo binaryInfo, String accession) {
    final MzMLCompressionType stored = binaryInfo.getCompressionType();
    if (stored == MzMLCompressionType.NO_COMPRESSION) {
      binaryInfo.setCompressionType(accession);
      return;
    }

    final MzMLCompressionType incoming = binaryInfo.getCompressionType(accession);
    final MzMLCompressionType toPromote =
        incoming == MzMLCompressionType.ZLIB ? stored : incoming;
    final MzMLCompressionType promoted = zlibVariantOf(toPromote);
    if (promoted != null) {
      binaryInfo.setCompressionType(promoted);
    }
  }

  /**
   * Maps a numpress compression type to its zlib-combined counterpart.
   *
   * @return the combined type, or <code>null</code> when the given type has no such counterpart
   */
  private static MzMLCompressionType zlibVariantOf(MzMLCompressionType type) {
    switch (type) {
      case NUMPRESS_LINPRED:
        return MzMLCompressionType.NUMPRESS_LINPRED_ZLIB;
      case NUMPRESS_POSINT:
        return MzMLCompressionType.NUMPRESS_POSINT_ZLIB;
      case NUMPRESS_SHLOGF:
        return MzMLCompressionType.NUMPRESS_SHLOGF_ZLIB;
      default:
        return null;
    }
  }

  /**
   * <p>
   * Returns the raw data file created by this parser's constructor.
   * </p>
   *
   * @return a {@link io.github.msdk.io.mzml.data.MzMLRawDataFile MzMLRawDataFile} containing the
   *         parsed data
   */
  public MzMLRawDataFile getMzMLRawFile() {
    return this.newRawFile;
  }

  /**
   * Holder for the temporary state built up while parsing: the elements currently under
   * construction and the lists that collect the finished results. Element references start out as
   * <code>null</code> and the lists start out empty.
   */
  private static class Vars {

    // Scalar state of the element currently being parsed
    int defaultArrayLength = 0;
    boolean skipBinaryDataArray = false;

    // Elements under construction; assigned as the corresponding tags are opened
    MzMLMsScan spectrum;
    MzMLChromatogram chromatogram;
    MzMLBinaryDataInfo binaryDataInfo;
    MzMLReferenceableParamGroup referenceableParamGroup;
    MzMLPrecursorElement precursor;
    MzMLProduct product;
    MzMLIsolationWindow isolationWindow;
    MzMLPrecursorSelectedIonList selectedIonList;
    MzMLPrecursorSelectedIon selectedIon;
    MzMLPrecursorActivation activation;
    MzMLScan scan;
    MzMLScanWindowList scanWindowList;
    MzMLScanWindow scanWindow;

    // Collected results
    final List<MzMLReferenceableParamGroup> referenceableParamGroupList = new ArrayList<>();
    final List<MsScan> spectrumList = new ArrayList<>();
    final List<Chromatogram> chromatogramsList = new ArrayList<>();
    final List<String> msFunctionsList = new ArrayList<>(); // TODO populate this list
  }

}