Untargeted Metabolomics Pre-Processing

The universal workflow for untargeted metabolomics consists of feature detection in the individual MS sample files, followed by linking of those features into consensus features with common m/z and retention time values. In addition, there are optional steps such as adduct detection and annotation of features with their associated MS2 spectra.

[Figure: overview of the untargeted metabolomics pre-processing workflow]

First, download two example mzML files.

from urllib.request import urlretrieve
gh = "https://raw.githubusercontent.com/OpenMS/pyopenms-docs/master"
urlretrieve(gh + "/src/data/Metabolomics_1.mzML", "Metabolomics_1.mzML")
urlretrieve(gh + "/src/data/Metabolomics_2.mzML", "Metabolomics_2.mzML")
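
If you run the workflow repeatedly, you can skip files that were already downloaded; a minimal sketch using only the standard library:

import os
for f in ["Metabolomics_1.mzML", "Metabolomics_2.mzML"]:
    if not os.path.exists(f):  # only download missing files
        urlretrieve(gh + "/src/data/" + f, f)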

For each mzML file, run mass trace detection, elution peak detection and feature detection.

from pyopenms import *
import os

mzML_files = ["Metabolomics_1.mzML", "Metabolomics_2.mzML"]

feature_maps = []
for file in mzML_files:
    # load mzML file into MSExperiment (the OpenMS in-memory representation)
    exp = MSExperiment()
    MzMLFile().load(file, exp)

    # mass trace detection
    mass_traces = []  # empty list to collect the detected mass traces
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()  # get the default parameters in order to edit them
    mtd_par.setValue("mass_error_ppm", 10.0)  # suitable for high-resolution instruments, e.g. Orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04)  # data-dependent (usually works for Orbitrap data)
    mtd.setParameters(mtd_par) # set the new parameters
    mtd.run(exp, mass_traces, 0) # run mass trace detection

    # elution peak detection
    mass_traces_deconvol = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed") # The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_deconvol)

    # feature detection
    feature_map = FeatureMap() # output features
    chrom_out = [] # output chromatograms
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()
    ffm_par.setValue("remove_single_traces", "true") # remove mass traces without satellite isotopic traces
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_deconvol, feature_map, chrom_out)
    feature_map.setUniqueIds() # Assigns a new, valid unique id per feature
    feature_map.setPrimaryMSRunPath([file.encode()]) # Sets the file path to the primary MS run (usually the mzML file)
    feature_maps.append(feature_map)
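
Before continuing, it can be useful to check how many features were detected in each file; a quick inspection sketch:

# sanity check: number of detected features per input file
for fm in feature_maps:
    print(fm.getMetaValue("spectra_data")[0].decode(), "->", fm.size(), "features")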

Align feature retention times based on the feature map with the highest number of features (reference map).

# use the feature map with the largest number of features as alignment reference
# (works well if you have a pooled QC sample, for example)
ref_index = max(range(len(feature_maps)), key=lambda i: feature_maps[i].size())

aligner = MapAlignmentAlgorithmPoseClustering()

trafos = {}

# parameter optimization
aligner_par = aligner.getDefaults()
aligner_par.setValue("max_num_peaks_considered", -1) # -1 = no limit, consider all peaks
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0) # Never pair features with larger m/z distance
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

for feature_map in feature_maps[:ref_index] + feature_maps[ref_index+1:]:
    trafo = TransformationDescription() # stores the RT transformation for this map
    aligner.align(feature_map, trafo)
    trafos[feature_map.getMetaValue("spectra_data")[0].decode()] = trafo
    transformer = MapAlignmentTransformer()
    transformer.transformRetentionTimes(feature_map, trafo, True)
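
To get a feeling for how strongly retention times are shifted, evaluate each transformation at a few points; the RT values below (in seconds) are arbitrary examples:

# inspect the fitted RT transformations at a few arbitrary retention times
for name, trafo in trafos.items():
    for rt in (100.0, 300.0, 600.0):
        print(f"{name}: {rt:.0f} s -> {trafo.apply(rt):.1f} s")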

Align the mzML files based on the FeatureMap alignment and store them as new mzML files (optional, only required for GNPS).

# align mzML files based on FeatureMap alignment and store as mzML files (for GNPS!)
for file in mzML_files:
    exp = MSExperiment()
    MzMLFile().load(file, exp)
    exp.sortSpectra(True)
    exp.setMetaValue("mzML_path", file)
    if file not in trafos:  # the reference file needs no transformation
        MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
        continue
    transformer = MapAlignmentTransformer()
    trafo_description = trafos[file]
    transformer.transformRetentionTimes(exp, trafo_description, True)
    MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
mzML_files = [file[:-5] + "_aligned.mzML" for file in mzML_files]
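
As a quick sanity check (not part of the original workflow), confirm that the aligned files load correctly:

# the aligned mzML files should load and contain spectra
for f in mzML_files:
    check = MSExperiment()
    MzMLFile().load(f, check)
    print(f, "->", check.getNrSpectra(), "spectra")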

Map MS2 spectra to features as PeptideIdentification objects (optional, only for GNPS).

feature_maps_mapped = []
use_centroid_rt = False
use_centroid_mz = True
mapper = IDMapper()
for file in mzML_files:
    exp = MSExperiment()
    MzMLFile().load(file, exp)
    for i, feature_map in enumerate(feature_maps):
        if feature_map.getMetaValue("spectra_data")[0].decode() == exp.getMetaValue("mzML_path"):
            peptide_ids = []
            protein_ids = []
            mapper.annotate(feature_map, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
            fm_new = FeatureMap(feature_map)
            fm_new.clear(False)
            # set unique identifiers to protein and peptide identifications
            prot_ids = []
            if len(feature_map.getProteinIdentifications()) > 0:
                prot_id = feature_map.getProteinIdentifications()[0]
                prot_id.setIdentifier(f"Identifier_{i}")
                prot_ids.append(prot_id)
            fm_new.setProteinIdentifications(prot_ids)
            for feature in feature_map:
                pep_ids = []
                for pep_id in feature.getPeptideIdentifications():
                    pep_id.setIdentifier(f"Identifier_{i}")
                    pep_ids.append(pep_id)
                feature.setPeptideIdentifications(pep_ids)
                fm_new.push_back(feature)
            feature_maps_mapped.append(fm_new)
feature_maps = feature_maps_mapped
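
To verify the mapping, count how many MS2 spectra were annotated per feature map; a short sketch:

# count MS2 spectra mapped to features (stored as PeptideIdentification objects)
for fm in feature_maps:
    n_ms2 = sum(len(f.getPeptideIdentifications()) for f in fm)
    print(fm.getMetaValue("spectra_data")[0].decode(), "->", n_ms2, "mapped MS2 spectra")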

Detect adducts (optional, only for SIRIUS and GNPS Ion Identity Molecular Networking).

feature_maps_adducts = []
for feature_map in feature_maps:
    mfd = MetaboliteFeatureDeconvolution()
    mfd_par = mfd.getDefaults()
    # potential adducts, each given as element:charge:probability
    mfd_par.setValue("potential_adducts", [b"H:+:0.4", b"Na:+:0.2", b"NH4:+:0.2", b"H-1O-1:+:0.1", b"H-3O-2:+:0.1"])
    mfd.setParameters(mfd_par)
    feature_map_adduct = FeatureMap()
    mfd.compute(feature_map, feature_map_adduct,
                ConsensusMap(), ConsensusMap())
    feature_maps_adducts.append(feature_map_adduct)
feature_maps = feature_maps_adducts
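
You can peek at the resulting adduct annotations; the "dc_charge_adducts" meta value name used below is an assumption based on the output of OpenMS adduct grouping:

# print a few adduct-annotated features per map
for fm in feature_maps:
    for f in list(fm)[:5]:
        if f.metaValueExists("dc_charge_adducts"):  # assumed meta value name
            print(round(f.getMZ(), 4), f.getMetaValue("dc_charge_adducts"))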

# for SIRIUS, store the feature maps as featureXML files
for feature_map in feature_maps:
    FeatureXMLFile().store(feature_map.getMetaValue("spectra_data")[0].decode()[:-4] + "featureXML", feature_map)

Link features in a ConsensusMap.

feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

for i, feature_map in enumerate(feature_maps):
    file_description = file_descriptions.get(i, ColumnHeader())
    file_description.filename = os.path.basename(
        feature_map.getMetaValue("spectra_data")[0].decode())
    file_description.size = feature_map.size()
    file_descriptions[i] = file_description

feature_grouper.group(feature_maps, consensus_map)
consensus_map.setColumnHeaders(file_descriptions)
consensus_map.setUniqueIds()
ConsensusXMLFile().store("FeatureMatrix.consensusXML", consensus_map)

To get a final feature matrix in a table format, export the consensus features as a pandas DataFrame.

df = consensus_map.get_df()
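
The DataFrame can then be handled with standard pandas tools, for example to save the feature matrix as a tab-separated file for downstream statistics:

# save the feature matrix (plain pandas, independent of pyopenms)
df.to_csv("FeatureMatrix.tsv", sep="\t")
print(df.head())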