Identification by Accurate Mass#

Example workflow for processing a set of mzML files (defined in the files variable), including centroiding, feature detection, feature linking and accurate mass search. The resulting data is then processed in a pandas DataFrame: features are filtered (missing values, quality) and the remaining missing values are imputed. Compounds detected during accurate mass search are annotated in the resulting DataFrame.

Imports and mzML file path#

import os
import shutil
import requests
import pandas as pd
import pyopenms as oms
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import plotly.express as px
import matplotlib.pyplot as plt

# set path to your mzML files, or leave like this to use the example data
files = os.path.join(os.getcwd(), "IdByMz_Example")

Download Example Data#

Execute this cell only for the example workflow.

if not os.path.isdir(files):
    os.mkdir(files)

base = "https://abibuilder.cs.uni-tuebingen.de/archive/openms/Tutorials/Example_Data/Metabolomics/"
urls = [
    "datasets/2012_02_03_PStd_050_1.mzML",
    "datasets/2012_02_03_PStd_050_2.mzML",
    "datasets/2012_02_03_PStd_050_3.mzML",
    "databases/PositiveAdducts.tsv",
    "databases/NegativeAdducts.tsv",
    "databases/HMDBMappingFile.tsv",
    "databases/HMDB2StructMapping.tsv",
]

for url in urls:
    request = requests.get(base + url, allow_redirects=True)
    with open(os.path.join(files, os.path.basename(url)), "wb") as f:
        f.write(request.content)

Centroiding#

If your files are already centroided, this step can be omitted; the sketch below shows one way to check.

in: path to MS data (files)

out: path to centroided mzML files in a subfolder ‘centroid’ (files)
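
A minimal sketch for checking whether a file is centroided, assuming the spectrum type is annotated in the mzML metadata; the file name is one of the example files, substitute your own:

exp = oms.MSExperiment()
oms.MzMLFile().load(os.path.join(files, "2012_02_03_PStd_050_1.mzML"), exp)
# SpectrumSettings spectrum type: 0 = UNKNOWN, 1 = CENTROID, 2 = PROFILE
print(exp[0].getType())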

if os.path.exists(os.path.join(files, "centroid")):
    shutil.rmtree(os.path.join(files, "centroid"))
os.mkdir(os.path.join(files, "centroid"))

for file in os.listdir(files):
    if file.endswith(".mzML"):
        exp_raw = oms.MSExperiment()
        oms.MzMLFile().load(os.path.join(files, file), exp_raw)
        exp_centroid = oms.MSExperiment()

        oms.PeakPickerHiRes().pickExperiment(exp_raw, exp_centroid, True)

        oms.MzMLFile().store(os.path.join(files, "centroid", file), exp_centroid)
        del exp_raw

files = os.path.join(files, "centroid")

Feature Detection#

in: path to centroid mzML files (files)

out: list of FeatureMap (feature_maps)

feature_maps = []

for file in os.listdir(files):
    if file.endswith(".mzML"):
        exp = oms.MSExperiment()
        oms.MzMLFile().load(os.path.join(files, file), exp)

        exp.sortSpectra(True)

        mass_traces = []
        mtd = oms.MassTraceDetection()
        mtd_params = mtd.getDefaults()
        mtd_params.setValue(
            "mass_error_ppm", 5.0
        )  # set according to your instrument mass error
        mtd_params.setValue(
            "noise_threshold_int", 1000.0
        )  # adjust to noise level in your data
        mtd.setParameters(mtd_params)
        mtd.run(exp, mass_traces, 0)

        mass_traces_split = []
        mass_traces_final = []
        epd = oms.ElutionPeakDetection()
        epd_params = epd.getDefaults()
        epd_params.setValue("width_filtering", "fixed")
        epd.setParameters(epd_params)
        epd.detectPeaks(mass_traces, mass_traces_split)

        if epd.getParameters().getValue("width_filtering") == "auto":
            epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
        else:
            mass_traces_final = mass_traces_split

        feature_map = oms.FeatureMap()
        feat_chrom = []
        ffm = oms.FeatureFindingMetabo()
        ffm_params = ffm.getDefaults()
        ffm_params.setValue("isotope_filtering_model", "none")
        ffm_params.setValue(
            "remove_single_traces", "true"
        )  # set to "false" to keep features with only one mass trace
        ffm_params.setValue("mz_scoring_by_elements", "false")
        ffm_params.setValue("report_convex_hulls", "true")
        ffm.setParameters(ffm_params)
        ffm.run(mass_traces_final, feature_map, feat_chrom)

        feature_map.setUniqueIds()
        feature_map.setPrimaryMSRunPath([file[:-5].encode()])

        feature_maps.append(feature_map)

Feature Map Retention Time Alignment#

in: unaligned list of FeatureMap (feature_maps)

out: list of FeatureMap aligned to the feature map with the highest number of features, which serves as the reference (feature_maps)

# get index of the feature map with the highest number of features, to use as alignment reference
sizes = [fm.size() for fm in feature_maps]
ref_index = sizes.index(max(sizes))

aligner = oms.MapAlignmentAlgorithmPoseClustering()

aligner.setReference(feature_maps[ref_index])

for feature_map in feature_maps[:ref_index] + feature_maps[ref_index + 1 :]:
    trafo = oms.TransformationDescription()
    aligner.align(feature_map, trafo)
    transformer = oms.MapAlignmentTransformer()
    transformer.transformRetentionTimes(
        feature_map, trafo, True
    )  # store original RT as meta value

Visualization of RTs before and after Alignment#

fmaps = (
    [feature_maps[ref_index]]
    + feature_maps[:ref_index]
    + feature_maps[ref_index + 1 :]
)

fig = plt.figure(figsize=(10, 5))

ax = fig.add_subplot(1, 2, 1)
ax.set_title("consensus map before alignment")
ax.set_ylabel("m/z")
ax.set_xlabel("RT")

# use alpha value to display feature intensity
ax.scatter(
    [f.getRT() for f in fmaps[0]],
    [f.getMZ() for f in fmaps[0]],
    alpha=np.asarray([f.getIntensity() for f in fmaps[0]])
    / max([f.getIntensity() for f in fmaps[0]]),
)

for fm in fmaps[1:]:
    ax.scatter(
        [f.getMetaValue("original_RT") for f in fm],
        [f.getMZ() for f in fm],
        alpha=np.asarray([f.getIntensity() for f in fm])
        / max([f.getIntensity() for f in fm]),
    )

ax = fig.add_subplot(1, 2, 2)
ax.set_title("consensus map after alignment")
ax.set_xlabel("RT")

for fm in fmaps:
    ax.scatter(
        [f.getRT() for f in fm],
        [f.getMZ() for f in fm],
        alpha=np.asarray([f.getIntensity() for f in fm])
        / max([f.getIntensity() for f in fm]),
    )

fig.tight_layout()
fig.legend(
    [fmap.getMetaValue("spectra_data")[0].decode() for fmap in fmaps],
    loc="lower center",
)
# in some cases get the file name elsewhere, e.g. fmap.getDataProcessing()[0].getMetaValue("parameter: out")
fig.show()

Feature Linking#

in: list of FeatureMap (feature_maps)

out: ConsensusMap (consensus_map)

feature_grouper = oms.FeatureGroupingAlgorithmQT()

consensus_map = oms.ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

for i, feature_map in enumerate(feature_maps):
    file_description = file_descriptions.get(i, oms.ColumnHeader())
    file_description.filename = feature_map.getMetaValue("spectra_data")[0].decode()
    file_description.size = feature_map.size()
    file_description.unique_id = feature_map.getUniqueId()
    file_descriptions[i] = file_description

consensus_map.setColumnHeaders(file_descriptions)
feature_grouper.group(feature_maps, consensus_map)
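
A quick sanity check on the linking result (a small sketch, printing counts only):

print(
    f"{consensus_map.size()} consensus features linked across "
    f"{len(consensus_map.getColumnHeaders())} maps"
)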

ConsensusMap to Pandas DataFrame#

in: ConsensusMap (consensus_map)

out: DataFrame with RT, mz and quality from ConsensusMap (cm_df)

intensities = consensus_map.get_intensity_df()

meta_data = consensus_map.get_metadata_df()[["RT", "mz", "quality"]]

cm_df = pd.concat([meta_data, intensities], axis=1)
cm_df.reset_index(drop=True, inplace=True)
cm_df

Data Filtering and Imputation#

in: unfiltered ConsensusMap DataFrame (cm_df)

out: features below minimum quality and with too many missing values removed, remaining missing values imputed with KNN algorithm (cm_df)

allowed_missing_values = 1
min_feature_quality = 0.8
n_nearest_neighbours = 2

# drop features that have more than the allowed number of missing values or are below minimum feature quality
to_drop = []

for i, row in cm_df.iterrows():
    if (
        row.isna().sum() > allowed_missing_values
        or row["quality"] < min_feature_quality
    ):
        to_drop.append(i)

cm_df.drop(index=cm_df.index[to_drop], inplace=True)

# data imputation with KNN, keeping the result as a DataFrame
imputer = Pipeline(
    [
        ("imputer", KNNImputer(n_neighbors=n_nearest_neighbours)),
        (
            "pandarizer",
            FunctionTransformer(
                lambda x: pd.DataFrame(x, columns=cm_df.columns)
            ),
        ),
    ]
)

cm_df = imputer.fit_transform(cm_df)
cm_df
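
Accurate Mass Search#

in: ConsensusMap (consensus_map) and the adduct/database files downloaded with the example data

out: AccurateMassSearch DataFrame with the mzTab small molecule columns, e.g. retention_time, exp_mass_to_charge and description (ams_df)

The annotation step below consumes ams_df, so a minimal sketch of the accurate mass search is given here using AccurateMassSearchEngine; the parameter values and the intermediate file names ("ids.tsv", "ids_smsection.tsv") are illustrative choices, not requirements. Note that files points to the centroid subfolder at this point, while the database files live in its parent folder.

ams = oms.AccurateMassSearchEngine()

db_dir = os.path.dirname(files)  # databases were downloaded next to the raw mzML files

ams_params = ams.getParameters()
ams_params.setValue("ionization_mode", "positive")
ams_params.setValue("positive_adducts", os.path.join(db_dir, "PositiveAdducts.tsv"))
ams_params.setValue("negative_adducts", os.path.join(db_dir, "NegativeAdducts.tsv"))
ams_params.setValue("db:mapping", [os.path.join(db_dir, "HMDBMappingFile.tsv")])
ams_params.setValue("db:struct", [os.path.join(db_dir, "HMDB2StructMapping.tsv")])
ams.setParameters(ams_params)

mztab = oms.MzTab()
ams.init()
ams.run(consensus_map, mztab)  # annotates the consensus features and fills the mzTab

oms.MzTabFile().store(os.path.join(files, "ids.tsv"), mztab)

# extract the small molecule (SM) section of the mzTab into a DataFrame
with open(os.path.join(files, "ids_smsection.tsv"), "w") as outfile, open(
    os.path.join(files, "ids.tsv")
) as infile:
    for line in infile:
        if line.lstrip().startswith("SM"):
            outfile.write(line[4:])  # drop the leading "SMH"/"SML" column tag

ams_df = pd.read_csv(os.path.join(files, "ids_smsection.tsv"), sep="\t")
ams_df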

Annotate Features with Identified Compounds#

in: ConsensusMap DataFrame without identifications (cm_df) and AccurateMassSearch DataFrame (ams_df)

out: ConsensusMap DataFrame with new identifications column (id_df)

id_df = cm_df

id_df["identifications"] = pd.Series(["" for x in range(len(id_df.index))])

for rt, mz, description in zip(
    ams_df["retention_time"],
    ams_df["exp_mass_to_charge"],
    ams_df["description"],
):
    indices = id_df.index[
        np.isclose(id_df["mz"], float(mz), atol=1e-05)
        & np.isclose(id_df["RT"], float(rt), atol=1e-05)
    ].tolist()
    for index in indices:
        if description != "null":
            id_df.loc[index, "identifications"] += str(description) + ";"

# strip the trailing semicolon from annotated features
id_df["identifications"] = [
    item[:-1] if ";" in item else "" for item in id_df["identifications"]
]
id_df.to_csv(os.path.join(files, "result.tsv"), sep="\t", index=False)
id_df
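
For downstream work it can be handy to look only at annotated features; a small usage sketch:

annotated = id_df[id_df["identifications"] != ""]
print(f"{len(annotated)} of {len(id_df)} consensus features received an annotation")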

Visualize Consensus Features with Identifications#

fig = px.scatter(id_df, x="RT", y="mz", hover_name="identifications")
fig.update_layout(title="Consensus features with identifications (hover)")
fig.show()