Untargeted Metabolomics Pre-Processing

The universal workflow for untargeted metabolomics consists of feature detection in the individual MS sample files, followed by linking of those features into consensus features with common m/z and retention time values. In addition, there are optional steps such as adduct detection and annotation of features with their associated MS2 spectra.

[Figure: overview of the untargeted metabolomics pre-processing workflow]

First, download two example mzML files.

from urllib.request import urlretrieve
gh = "https://raw.githubusercontent.com/OpenMS/pyopenms-docs/master"
urlretrieve(gh + "/src/data/Metabolomics_1.mzML", "Metabolomics_1.mzML")
urlretrieve(gh + "/src/data/Metabolomics_2.mzML", "Metabolomics_2.mzML")
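
If you run the workflow repeatedly, you can skip files that were already downloaded; a minimal sketch using only the standard library:

import os
for f in ["Metabolomics_1.mzML", "Metabolomics_2.mzML"]:
    if not os.path.exists(f):  # only download missing files
        urlretrieve(gh + "/src/data/" + f, f)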

For each mzML file, run mass trace detection, elution peak detection and feature detection.

from pyopenms import *
import os

mzML_files = ["Metabolomics_1.mzML", "Metabolomics_2.mzML"]

feature_maps = []
for file in mzML_files:
    # load mzML file into MSExperiment (the OpenMS in-memory representation)
    exp = MSExperiment()
    MzMLFile().load(file, exp)

    # mass trace detection
    mass_traces = []  # empty list to collect the detected mass traces
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()  # get the default parameters in order to edit them
    mtd_par.setValue("mass_error_ppm", 10.0)  # suitable for high-resolution instruments, e.g. Orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04)  # data-dependent (usually works for Orbitrap data)
    mtd.setParameters(mtd_par) # set the new parameters
    mtd.run(exp, mass_traces, 0) # run mass trace detection

    # elution peak detection
    mass_traces_deconvol = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed") # The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_deconvol)

    # feature detection
    feature_map = FeatureMap() # output features
    chrom_out = [] # output chromatograms
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()
    ffm_par.setValue("remove_single_traces", "true") # remove mass traces without satellite isotopic traces
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_deconvol, feature_map, chrom_out)
    feature_map.setUniqueIds() # Assigns a new, valid unique id per feature
    feature_map.setPrimaryMSRunPath([file.encode()]) # Sets the file path to the primary MS run (usually the mzML file)
    feature_maps.append(feature_map)
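
Before continuing, it can be useful to check how many features were detected in each file; a quick inspection sketch:

# sanity check: number of detected features per input file
for fm in feature_maps:
    print(fm.getMetaValue("spectra_data")[0].decode(), "->", fm.size(), "features")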

Align feature retention times based on the feature map with the highest number of features (reference map).

# use the feature map with the largest number of features as alignment reference
# (works well if you have a pooled QC sample, for example)
ref_index = max(range(len(feature_maps)), key=lambda i: feature_maps[i].size())

aligner = MapAlignmentAlgorithmPoseClustering()

trafos = {}

# parameter optimization
aligner_par = aligner.getDefaults()
aligner_par.setValue("max_num_peaks_considered", -1) # -1 = no limit, consider all peaks
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0) # Never pair features with larger m/z distance
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

for feature_map in feature_maps[:ref_index] + feature_maps[ref_index+1:]:
    trafo = TransformationDescription() # stores the RT transformation for this map
    aligner.align(feature_map, trafo)
    trafos[feature_map.getMetaValue("spectra_data")[0].decode()] = trafo
    transformer = MapAlignmentTransformer()
    transformer.transformRetentionTimes(feature_map, trafo, True)
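
To get a feeling for how strongly retention times are shifted, evaluate each transformation at a few points; the RT values below (in seconds) are arbitrary examples:

# inspect the fitted RT transformations at a few arbitrary retention times
for name, trafo in trafos.items():
    for rt in (100.0, 300.0, 600.0):
        print(f"{name}: {rt:.0f} s -> {trafo.apply(rt):.1f} s")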

Align the mzML files based on the FeatureMap alignment and store them as new mzML files (optional, only required for GNPS).

# align mzML files based on FeatureMap alignment and store as mzML files (for GNPS!)
for file in mzML_files:
    exp = MSExperiment()
    MzMLFile().load(file, exp)
    exp.sortSpectra(True)
    exp.setMetaValue("mzML_path", file)
    if file not in trafos:  # the reference file needs no transformation
        MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
        continue
    transformer = MapAlignmentTransformer()
    trafo_description = trafos[file]
    transformer.transformRetentionTimes(exp, trafo_description, True)
    MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
mzML_files = [file[:-5] + "_aligned.mzML" for file in mzML_files]
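
As a quick sanity check (not part of the original workflow), confirm that the aligned files load correctly:

# the aligned mzML files should load and contain spectra
for f in mzML_files:
    check = MSExperiment()
    MzMLFile().load(f, check)
    print(f, "->", check.getNrSpectra(), "spectra")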

Map MS2 spectra to features as PeptideIdentification objects (optional, only for GNPS).

feature_maps_mapped = []
use_centroid_rt = False
use_centroid_mz = True
mapper = IDMapper()
for file in mzML_files:
    exp = MSExperiment()
    MzMLFile().load(file, exp)
    for i, feature_map in enumerate(feature_maps):
        if feature_map.getMetaValue("spectra_data")[0].decode() == exp.getMetaValue("mzML_path"):
            peptide_ids = []
            protein_ids = []
            mapper.annotate(feature_map, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
            fm_new = FeatureMap(feature_map)
            fm_new.clear(False)
            # set unique identifiers to protein and peptide identifications
            prot_ids = []
            if len(feature_map.getProteinIdentifications()) > 0:
                prot_id = feature_map.getProteinIdentifications()[0]
                prot_id.setIdentifier(f"Identifier_{i}")
                prot_ids.append(prot_id)
            fm_new.setProteinIdentifications(prot_ids)
            for feature in feature_map:
                pep_ids = []
                for pep_id in feature.getPeptideIdentifications():
                    pep_id.setIdentifier(f"Identifier_{i}")
                    pep_ids.append(pep_id)
                feature.setPeptideIdentifications(pep_ids)
                fm_new.push_back(feature)
            feature_maps_mapped.append(fm_new)
feature_maps = feature_maps_mapped
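
To verify the mapping, count how many MS2 spectra were annotated per feature map; a short sketch:

# count MS2 spectra mapped to features (stored as PeptideIdentification objects)
for fm in feature_maps:
    n_ms2 = sum(len(f.getPeptideIdentifications()) for f in fm)
    print(fm.getMetaValue("spectra_data")[0].decode(), "->", n_ms2, "mapped MS2 spectra")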

Detect adducts (optional, only for SIRIUS and GNPS Ion Identity Molecular Networking).

feature_maps_adducts = []
for feature_map in feature_maps:
    mfd = MetaboliteFeatureDeconvolution()
    mfd_par = mfd.getDefaults()
    # potential adducts, each given as element:charge:probability
    mfd_par.setValue("potential_adducts", [b"H:+:0.4", b"Na:+:0.2", b"NH4:+:0.2", b"H-1O-1:+:0.1", b"H-3O-2:+:0.1"])
    mfd.setParameters(mfd_par)
    feature_map_adduct = FeatureMap()
    mfd.compute(feature_map, feature_map_adduct,
                ConsensusMap(), ConsensusMap())
    feature_maps_adducts.append(feature_map_adduct)
feature_maps = feature_maps_adducts
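
You can peek at the resulting adduct annotations; the "dc_charge_adducts" meta value name used below is an assumption based on the output of OpenMS adduct grouping:

# print a few adduct-annotated features per map
for fm in feature_maps:
    for f in list(fm)[:5]:
        if f.metaValueExists("dc_charge_adducts"):  # assumed meta value name
            print(round(f.getMZ(), 4), f.getMetaValue("dc_charge_adducts"))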

# for SIRIUS, store the feature maps as featureXML files
for feature_map in feature_maps:
    FeatureXMLFile().store(feature_map.getMetaValue("spectra_data")[0].decode()[:-4] + "featureXML", feature_map)

Link features in a ConsensusMap.

feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

for i, feature_map in enumerate(feature_maps):
    file_description = file_descriptions.get(i, ColumnHeader())
    file_description.filename = os.path.basename(
        feature_map.getMetaValue("spectra_data")[0].decode())
    file_description.size = feature_map.size()
    file_descriptions[i] = file_description

feature_grouper.group(feature_maps, consensus_map)
consensus_map.setColumnHeaders(file_descriptions)
consensus_map.setUniqueIds()
ConsensusXMLFile().store("FeatureMatrix.consensusXML", consensus_map)

To get a final feature matrix in a table format, export the consensus features as a pandas DataFrame.

df = consensus_map.get_df()
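
The DataFrame can then be handled with standard pandas tools, for example to save the feature matrix as a tab-separated file for downstream statistics:

# save the feature matrix (plain pandas, independent of pyopenms)
df.to_csv("FeatureMatrix.tsv", sep="\t")
print(df.head())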