Untargeted Metabolomics Pre-Processing#

The universal workflow for untargeted metabolomics always consists of feature detection in the individual MS sample files and their linkage to consensus features with common m/z and retention time values. In addition, there are optional steps such as adduct detection and annotation of features with associated MS2 spectra.

../_images/metabolomics-preprocessing.png

First, download two example mzML files.

from urllib.request import urlretrieve

gh = "https://raw.githubusercontent.com/OpenMS/pyopenms-docs/master"
urlretrieve(gh + "/src/data/Metabolomics_1.mzML", "Metabolomics_1.mzML")
urlretrieve(gh + "/src/data/Metabolomics_2.mzML", "Metabolomics_2.mzML")

For each mzML file do mass trace, elution peak and features detection.

import pyopenms as oms
import os

mzML_files = ["Metabolomics_1.mzML", "Metabolomics_2.mzML"]

feature_maps = []
for file in mzML_files:
    # load mzML file into MSExperiment
    exp = oms.MSExperiment()
    oms.MzMLFile().load(
        file, exp
    )  # load each mzML file to an OpenMS file format (MSExperiment)

    # mass trace detection
    mass_traces = (
        []
    )  # introduce an empty list where the mass traces will be loaded
    mtd = oms.MassTraceDetection()
    mtd_par = (
        mtd.getDefaults()
    )  # get the default parameters in order to edit them
    mtd_par.setValue("mass_error_ppm", 10.0)  # high-res instrument, orbitraps
    mtd_par.setValue(
        "noise_threshold_int", 1.0e04
    )  # data-dependent (usually works for orbitraps)
    mtd.setParameters(mtd_par)  # set the new parameters
    mtd.run(exp, mass_traces, 0)  # run mass trace detection

    # elution peak detection
    mass_traces_deconvol = []
    epd = oms.ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue(
        "width_filtering", "fixed"
    )  # The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_deconvol)

    # feature detection
    feature_map = oms.FeatureMap()  # output features
    chrom_out = []  # output chromatograms
    ffm = oms.FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()
    ffm_par.setValue(
        "remove_single_traces", "true"
    )  # remove mass traces without satellite isotopic traces
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_deconvol, feature_map, chrom_out)
    feature_map.setUniqueIds()  # Assigns a new, valid unique id per feature
    feature_map.setPrimaryMSRunPath(
        [file.encode()]
    )  # Sets the file path to the primary MS run (usually the mzML file)
    feature_maps.append(feature_map)

Align features retention times based on the feature map with the highest number of features (reference map).

 1# use as reference for alignment, the file with the largest number of features
 2# (works well if you have a pooled QC for example)
 3ref_index = feature_maps.index(sorted(feature_maps, key=lambda x: x.size())[-1])
 4
 5aligner = oms.MapAlignmentAlgorithmPoseClustering()
 6
 7trafos = {}
 8
 9# parameter optimization
10aligner_par = aligner.getDefaults()
11aligner_par.setValue("max_num_peaks_considered", -1)  # infinite
12aligner_par.setValue(
13    "pairfinder:distance_MZ:max_difference", 10.0
14)  # Never pair features with larger m/z distance
15aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
16aligner.setParameters(aligner_par)
17aligner.setReference(feature_maps[ref_index])
18
19for feature_map in feature_maps[:ref_index] + feature_maps[ref_index + 1 :]:
20    trafo = oms.TransformationDescription()  # save the transformed data points
21    aligner.align(feature_map, trafo)
22    trafos[feature_map.getMetaValue("spectra_data")[0].decode()] = trafo
23    transformer = oms.MapAlignmentTransformer()
24    transformer.transformRetentionTimes(feature_map, trafo, True)

Align mzML files aligment based on FeatureMap alignment (optional, only for GNPS).

 1# align mzML files based on FeatureMap alignment and store as mzML files (for GNPS!)
 2for file in mzML_files:
 3    exp = oms.MSExperiment()
 4    oms.MzMLFile().load(file, exp)
 5    exp.sortSpectra(True)
 6    exp.setMetaValue("mzML_path", file)
 7    if file not in trafos.keys():
 8        oms.MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
 9        continue
10    transformer = oms.MapAlignmentTransformer()
11    trafo_description = trafos[file]
12    transformer.transformRetentionTimes(exp, trafo_description, True)
13    oms.MzMLFile().store(file[:-5] + "_aligned.mzML", exp)
14mzML_files = [file[:-5] + "_aligned.mzML" for file in mzML_files]

Map MS2 spectra to features as PeptideIdentification objects (optional, only for GNPS).

 1feature_maps_mapped = []
 2use_centroid_rt = False
 3use_centroid_mz = True
 4mapper = oms.IDMapper()
 5for file in mzML_files:
 6    exp = oms.MSExperiment()
 7    oms.MzMLFile().load(file, exp)
 8    for i, feature_map in enumerate(feature_maps):
 9        if feature_map.getMetaValue("spectra_data")[
10            0
11        ].decode() == exp.getMetaValue("mzML_path"):
12            peptide_ids = []
13            protein_ids = []
14            mapper.annotate(
15                feature_map,
16                peptide_ids,
17                protein_ids,
18                use_centroid_rt,
19                use_centroid_mz,
20                exp,
21            )
22            fm_new = oms.FeatureMap(feature_map)
23            fm_new.clear(False)
24            # set unique identifiers to protein and peptide identifications
25            prot_ids = []
26            if len(feature_map.getProteinIdentifications()) > 0:
27                prot_id = feature_map.getProteinIdentifications()[0]
28                prot_id.setIdentifier(f"Identifier_{i}")
29                prot_ids.append(prot_id)
30            fm_new.setProteinIdentifications(prot_ids)
31            for feature in feature_map:
32                pep_ids = []
33                for pep_id in feature.getPeptideIdentifications():
34                    pep_id.setIdentifier(f"Identifier_{i}")
35                    pep_ids.append(pep_id)
36                feature.setPeptideIdentifications(pep_ids)
37                fm_new.push_back(feature)
38            feature_maps_mapped.append(fm_new)
39feature_maps = feature_maps_mapped

Detect adducts (optional, only for SIRIUS and GNPS Ion Identity Molecular Networking).

 1feature_maps_adducts = []
 2for feature_map in feature_maps:
 3    mfd = oms.MetaboliteFeatureDeconvolution()
 4    mdf_par = mfd.getDefaults()
 5    mdf_par.setValue(
 6        "potential_adducts",
 7        [
 8            b"H:+:0.4",
 9            b"Na:+:0.2",
10            b"NH4:+:0.2",
11            b"H-1O-1:+:0.1",
12            b"H-3O-2:+:0.1",
13        ],
14    )
15    mfd.setParameters(mdf_par)
16    feature_map_adduct = oms.FeatureMap()
17    mfd.compute(feature_map, feature_map_adduct, oms.ConsensusMap(), oms.ConsensusMap())
18    feature_maps_adducts.append(feature_map_adduct)
19feature_maps = feature_maps_adducts
20
21# for SIRIUS store the feature maps as featureXML files!
22for feature_map in feature_maps:
23    oms.FeatureXMLFile().store(
24        feature_map.getMetaValue("spectra_data")[0].decode()[:-4]
25        + "featureXML",
26        feature_map,
27    )

Link features in a ConsensusMap.

 1feature_grouper = oms.FeatureGroupingAlgorithmKD()
 2
 3consensus_map = oms.ConsensusMap()
 4file_descriptions = consensus_map.getColumnHeaders()
 5
 6for i, feature_map in enumerate(feature_maps):
 7    file_description = file_descriptions.get(i, oms.ColumnHeader())
 8    file_description.filename = os.path.basename(
 9        feature_map.getMetaValue("spectra_data")[0].decode()
10    )
11    file_description.size = feature_map.size()
12    file_descriptions[i] = file_description
13
14feature_grouper.group(feature_maps, consensus_map)
15consensus_map.setColumnHeaders(file_descriptions)
16consensus_map.setUniqueIds()
17oms.ConsensusXMLFile().store("FeatureMatrix.consensusXML", consensus_map)

To get a final feature matrix in a table format, export the :consensus features in a pandas DataFrame.

1df = consensus_map.get_df()