Parse Rhea#

Parsing Rhea into a network for LipiNet

import importlib

# Reload the module to ensure changes are picked up
#importlib.reload(lipinet)

# Now can use the functions after reloading the module
from lipinet.databases import get_prior_knowledge 
from lipinet.utils import split_and_expand_large, create_nodedf_from_edgedf, check_for_split_characters

import pandas as pd
import plotly.express as px
import seaborn as sns 
import missingno as msno 
import matplotlib.pyplot as plt

Parsing the manual way#

# Use the get_prior_knowledge function for df_rhea
df_rhea = get_prior_knowledge('rhea', verbose=True)
df_rhea
File found locally at /Users/agjanyunlu/Documents/Metabolomics/lipinet/lipinet/.data/downloaded/rhea.tsv. Loading data...
Reaction identifier Equation ChEBI name ChEBI identifier EC number Enzymes Gene Ontology PubMed Cross-reference (EcoCyc) Cross-reference (MetaCyc) Cross-reference (KEGG) Cross-reference (Reactome) Cross-reference (M-CSA)
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 (S)-2-hydroxyglutarate;A;2-oxoglutarate;AH2 CHEBI:16782;CHEBI:13193;CHEBI:16810;CHEBI:17499 EC:1.1.99.2 4258 GO:0047545 2-hydroxyglutarate dehydrogenase ac... 16746551;16005139;34555022;19586914 NaN MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN KEGG:R00298 NaN NaN
1 RHEA:21256 3-phosphoshikimate + phosphoenolpyruvate = 5-O... 3-phosphoshikimate;phosphoenolpyruvate;5-O-(1-... CHEBI:145989;CHEBI:58702;CHEBI:57701;CHEBI:43474 EC:2.5.1.19 44340 GO:0003866 3-phosphoshikimate 1-carboxyvinyltr... 4289188;1344882;11300775 EcoCyc:2.5.1.19-RXN MetaCyc:2.5.1.19-RXN KEGG:R03460 NaN NaN
2 RHEA:21260 [thioredoxin]-disulfide + L-methionine + H2O =... L-cystine residue;L-methionine;H2O;L-methionin... CHEBI:50058;CHEBI:57844;CHEBI:15377;CHEBI:5877... EC:1.8.4.14 3112 GO:0033745 L-methionine-(R)-S-oxide reductase ... 20382110;20504774;20851894;19049972;12504094;2... EcoCyc:1.8.4.14-RXN MetaCyc:1.8.4.14-RXN KEGG:R07608 NaN NaN
3 RHEA:21264 glycolate + A = glyoxylate + AH2 glycolate;A;glyoxylate;AH2 CHEBI:29805;CHEBI:13193;CHEBI:36655;CHEBI:17499 EC:1.1.99.14 14321 GO:0019154 glycolate dehydrogenase activity 4557653 EcoCyc:GLYCOLATEDEHYDRO-RXN MetaCyc:GLYCOLATEDEHYDRO-RXN KEGG:R00476 NaN NaN
4 RHEA:21268 (R)-canadine + 2 NADP(+) = berberine + 2 NADPH... (R)-canadine;NADP(+);berberine;NADPH;H(+) CHEBI:18146;CHEBI:58349;CHEBI:16118;CHEBI:5778... EC:1.5.1.31 0 GO:0050623 berberine reductase activity NaN NaN MetaCyc:1.5.1.31-RXN KEGG:R07169 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
17778 RHEA:21232 L-histidine = trans-urocanate + NH4(+) L-histidine;trans-urocanate;NH4(+) CHEBI:57595;CHEBI:17771;CHEBI:28938 EC:4.3.1.3 21113 GO:0004397 histidine ammonia-lyase activity 15906398;11732994;39337646 NaN MetaCyc:HISTIDINE-AMMONIA-LYASE-RXN KEGG:R01168 Reactome:R-HSA-70899.6 NaN
17779 RHEA:21236 anthranilate + NADPH + O2 + 2 H(+) = 2,3-dihyd... anthranilate;NADPH;O2;H(+);2,3-dihydroxybenzoa... CHEBI:16567;CHEBI:57783;CHEBI:15379;CHEBI:1537... EC:1.14.13.35 0 GO:0018672 anthranilate 3-monooxygenase (deami... 6501219;3793735 NaN MetaCyc:1.14.13.35-RXN KEGG:R00980 NaN NaN
17780 RHEA:21240 geranylgeranyl diphosphate + L-cysteinyl-[prot... geranylgeranyl diphosphate;L-cysteine residue;... CHEBI:57533;CHEBI:29950;CHEBI:86021;CHEBI:33019 EC:2.5.1.59;EC:2.5.1.60 10991 GO:0004661 protein geranylgeranyltransferase a... 12036349;9121446;19894725;8106351;10745007;126... NaN MetaCyc:RXN-3701 NaN NaN NaN
17781 RHEA:21244 an 8-hydroxyfurocoumarin + S-adenosyl-L-methio... an 8-hydroxyfurocoumarin;S-adenosyl-L-methioni... CHEBI:52025;CHEBI:59789;CHEBI:52028;CHEBI:5785... EC:2.1.1.70 0 NaN 156999;15009205;28084 NaN MetaCyc:2.1.1.70-RXN KEGG:R02982 NaN NaN
17782 RHEA:21248 RNA(n) + a ribonucleoside 5'-triphosphate = RN... (nucleoside 5'-monophosphate)n;a ribonucleosid... CHEBI:140395;CHEBI:61557;CHEBI:33019 EC:2.7.7.6;EC:2.7.7.48 465623 GO:0034062 5'-3' RNA polymerase activity 4946277;5289245;4501121 EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy... KEGG:R00444 NaN NaN

17783 rows × 13 columns

# Visualise missingness: df_rhea.isnull() is a boolean mask with the same
# shape as df_rhea, so each heatmap cell shows whether that value is null.
fig, ax = plt.subplots(figsize=(15, 8))  # figsize is (width, height)

# Draw the null mask on the axes created above.
# Alternative (plot raw values with another colour map):
sns.heatmap(df_rhea.isnull(), ax=ax) #sns.heatmap(df, ax=ax, cmap="YlGnBu")
<Axes: >
../_images/b703604d0639541000b5b865cd2cab7f1fe76989234f3d7c11b4d03443a97d42.png

What is the most common participant identifier across all entries?

df_rhea['ChEBI identifier'].str.split(';').explode().value_counts()
ChEBI identifier
CHEBI:15378     9932
CHEBI:15377     6485
CHEBI:15379     2851
CHEBI:57287     1567
CHEBI:58349     1335
                ... 
CHEBI:138211       1
CHEBI:138191       1
CHEBI:18534        1
CHEBI:18533        1
CHEBI:52028        1
Name: count, Length: 13723, dtype: int64

What are the names of these molecules?

df_rhea['ChEBI name'].str.split(';').explode().value_counts()
ChEBI name
H(+)                                                      9932
H2O                                                       6485
O2                                                        2851
CoA                                                       1567
NADP(+)                                                   1335
                                                          ... 
1-octadecanoyl-2-pentanoyl-sn-glycero-3-phosphocholine       1
3'-(N-acyl-L-alpha-aminoacyl)adenylyl 3'-end residue         1
campest-4-en-3-one                                           1
5alpha-campestan-3-one                                       1
an 8-methoxyfurocoumarin                                     1
Name: count, Length: 13723, dtype: int64

So it seems that Hydrogen ions, water, and O2 are the most common ChEBI molecules in Rhea

Are there any commonly represented GO terms?

df_rhea['Gene Ontology'].value_counts(dropna=False)
Gene Ontology
NaN                                                                                            13398
GO:0047545 2-hydroxyglutarate dehydrogenase activity                                               1
GO:0015046 rubredoxin-NADP+ reductase activity                                                     1
GO:0004413 homoserine kinase activity                                                              1
GO:0008721 D-serine ammonia-lyase activity                                                         1
                                                                                               ...  
GO:0008176 tRNA (guanine(46)-N7)-methyltransferase activity                                        1
GO:0030697 tRNA (uracil(54)-C5)-methyltransferase activity, S-adenosyl methionine-dependent        1
GO:0043828 tRNA 2-selenouridine synthase activity                                                  1
GO:0052914 16S rRNA (guanine(1207)-N(2))-methyltransferase activity                                1
GO:0034062 5'-3' RNA polymerase activity                                                           1
Name: count, Length: 4386, dtype: int64

Every GO term that is present appears exactly once; the remaining 13,398 reactions have no GO annotation (NaN).

What about common enzyme classes?

# df_rhea['Enzyme class'].value_counts()

How many reactions have no EC number? (`False` below counts the reactions where the EC number is missing.)

df_rhea['EC number'].notna().value_counts()
EC number
False    10214
True      7569
Name: count, dtype: int64

How often does each EC number occur, given that a single reaction can be assigned several EC numbers (separated by `;`)?

df_rhea['EC number'].str.split(';').explode().value_counts()
EC number
EC:3.6.1.9        14
EC:1.14.13.105    13
EC:1.14.12.23     12
EC:2.7.4.6        12
EC:3.1.3.62       12
                  ..
EC:2.5.1.122       1
EC:2.5.1.121       1
EC:2.4.1.328       1
EC:2.4.1.326       1
EC:2.7.7.48        1
Name: count, Length: 6153, dtype: int64

How common is it actually to have multiple EC numbers?

df_rhea['EC number'].str.split(';').str.len().value_counts()
EC number
1.0     7343
2.0      202
3.0       18
17.0       2
7.0        2
5.0        1
4.0        1
Name: count, dtype: int64

Note that an EC number (assigned by the IUBMB) is hierarchical: main class, subclass, sub-subclass, and serial number. Because of this, let’s split it into a DataFrame with one column per level, ready to be parsed as a hierarchy.

Preparing the Rhea enzyme classification (EC) info#

def process_ec_numbers(df):
    """
    Build a table of the unique EC numbers found in the 'EC number' column.

    Each cell of 'EC number' may hold several identifiers separated by ';'
    (e.g. 'EC:2.5.1.59;EC:2.5.1.60'). Every identifier is trimmed, has its
    leading 'EC:' prefix removed, is de-duplicated, and is then split on '.'
    into its hierarchy levels.

    Parameters:
        df (pd.DataFrame): A DataFrame containing an 'EC number' column.
            Missing values (NaN/None) are ignored.

    Returns:
        pd.DataFrame: A new DataFrame (fresh 0..n-1 index, one row per unique
        EC number in order of first appearance) with the following columns:
            - 'EC_number': The reassembled EC number in the format 'EC:Main_Class.Subclass.Subsubclass.Serial_Number'
            - 'Main_Class': The first part of the EC number.
            - 'Subclass': The second part of the EC number.
            - 'Subsubclass': The third part of the EC number.
            - 'Serial_Number': The fourth part of the EC number.
    """
    level_columns = ["Main_Class", "Subclass", "Subsubclass", "Serial_Number"]

    # One identifier per row, without surrounding whitespace and without the
    # 'EC:' prefix. An anchored replace is used on purpose: str.strip('EC:')
    # removes any run of the characters 'E', 'C' and ':' from BOTH ends rather
    # than the literal prefix, and it leaves whitespace after ';' in place.
    ec_ids = (
        df['EC number']
        .dropna()
        .astype(str)
        .str.split(';')
        .explode()
        .str.strip()
        .str.replace(r'^EC:', '', regex=True)
    )

    # Discard empty tokens (e.g. from a trailing ';') and de-duplicate only
    # after cleaning, so that ' EC:1.1.1.1' and 'EC:1.1.1.1' collapse to one row.
    ec_ids = ec_ids[ec_ids != ''].drop_duplicates()

    # Convert the Series of lists into a DataFrame with named columns.
    df_ec = pd.DataFrame(ec_ids.str.split('.').tolist(), columns=level_columns)

    # Create a new column with the reassembled EC number in the original format.
    df_ec.insert(0, 'EC_number',
                 'EC:' + df_ec["Main_Class"].astype(str) + '.' +
                 df_ec["Subclass"].astype(str) + '.' +
                 df_ec["Subsubclass"].astype(str) + '.' +
                 df_ec["Serial_Number"].astype(str))

    return df_ec
# Assuming df_rhea is your original DataFrame with the 'EC number' column.
df_ec = process_ec_numbers(df_rhea)
df_ec
EC_number Main_Class Subclass Subsubclass Serial_Number
0 EC:1.1.99.2 1 1 99 2
1 EC:2.5.1.19 2 5 1 19
2 EC:1.8.4.14 1 8 4 14
3 EC:1.1.99.14 1 1 99 14
4 EC:1.5.1.31 1 5 1 31
... ... ... ... ... ...
6148 EC:1.14.13.35 1 14 13 35
6149 EC:2.5.1.59 2 5 1 59
6150 EC:2.5.1.60 2 5 1 60
6151 EC:2.7.7.6 2 7 7 6
6152 EC:2.7.7.48 2 7 7 48

6153 rows × 5 columns

def build_rhea_ec_edges_and_nodes(df_ec: pd.DataFrame):
    """
    Turn a table of EC numbers into hierarchy edges and labelled nodes.

    Expects the columns Main_Class, Subclass, Subsubclass and EC_number.
    The input frame is not modified.

    Returns a tuple (edges_df, nodes_df):
      - edges_df links each level to the next one down
        (main class -> subclass -> subsubclass -> full EC number) with the
        columns source_id, target_id, ec_level, source_layer, target_layer
        and interlayer.
      - nodes_df lists every distinct edge endpoint with the columns
        node_id, layer and ec_level, where ec_level is one of
        "main_class", "subclass", "subsubclass" or "full_ec".
    """
    layer_name = "rhea_ec"

    # Private copy: the helper columns below must not leak to the caller.
    frame = df_ec.copy()

    # Grow the identifier one level at a time: EC:a -> EC:a.b -> EC:a.b.c
    # (astype(str) guards against numeric level columns).
    node_id = "EC:" + frame["Main_Class"].astype(str)
    frame["main_node"] = node_id
    node_id = node_id + "." + frame["Subclass"].astype(str)
    frame["subclass_node"] = node_id
    node_id = node_id + "." + frame["Subsubclass"].astype(str)
    frame["subsubclass_node"] = node_id

    # (parent column, child column, edge label) for every hierarchical link.
    links = (
        ("main_node", "subclass_node", "main_class->subclass"),
        ("subclass_node", "subsubclass_node", "subclass->subsubclass"),
        ("subsubclass_node", "EC_number", "subsubclass->EC_number"),
    )
    edge_parts = [
        frame[[parent, child]]
        .rename(columns={parent: "source_id", child: "target_id"})
        .assign(ec_level=label)
        for parent, child, label in links
    ]

    # Stack the three link tables, drop repeated links, then add the
    # layer bookkeeping columns (everything lives in the same layer).
    edges_df = (
        pd.concat(edge_parts, ignore_index=True)
        .drop_duplicates()
        .assign(source_layer=layer_name, target_layer=layer_name, interlayer=False)
    )

    # Every distinct identifier used as an edge endpoint becomes a node.
    endpoint_ids = pd.concat([edges_df["source_id"], edges_df["target_id"]]).unique()
    nodes_df = pd.DataFrame(endpoint_ids, columns=["node_id"])
    nodes_df["layer"] = layer_name

    # Lookup table: which hierarchy level does each identifier belong to?
    level_of_column = {
        "main_node": "main_class",
        "subclass_node": "subclass",
        "subsubclass_node": "subsubclass",
        "EC_number": "full_ec",
    }
    level_lookup = pd.concat(
        [
            frame[[column]]
            .drop_duplicates()
            .rename(columns={column: "node_id"})
            .assign(ec_level=level)
            for column, level in level_of_column.items()
        ],
        ignore_index=True,
    ).drop_duplicates(subset="node_id")

    # Left merge keeps the node order and attaches the level label.
    nodes_df = nodes_df.merge(level_lookup, on="node_id", how="left")

    return edges_df, nodes_df

# Example usage:
# df_edges_rhea_ec, df_nodes_rhea_ec = build_rhea_ec_edges_and_nodes(df_ec)
df_edges_rhea_ec, df_nodes_rhea_ec = build_rhea_ec_edges_and_nodes(df_ec)
df_edges_rhea_ec
source_id target_id ec_level source_layer target_layer interlayer
0 EC:1 EC:1.1 main_class->subclass rhea_ec rhea_ec False
1 EC:2 EC:2.5 main_class->subclass rhea_ec rhea_ec False
2 EC:1 EC:1.8 main_class->subclass rhea_ec rhea_ec False
4 EC:1 EC:1.5 main_class->subclass rhea_ec rhea_ec False
5 EC:6 EC:6.3 main_class->subclass rhea_ec rhea_ec False
... ... ... ... ... ... ...
18454 EC:1.14.13 EC:1.14.13.35 subsubclass->EC_number rhea_ec rhea_ec False
18455 EC:2.5.1 EC:2.5.1.59 subsubclass->EC_number rhea_ec rhea_ec False
18456 EC:2.5.1 EC:2.5.1.60 subsubclass->EC_number rhea_ec rhea_ec False
18457 EC:2.7.7 EC:2.7.7.6 subsubclass->EC_number rhea_ec rhea_ec False
18458 EC:2.7.7 EC:2.7.7.48 subsubclass->EC_number rhea_ec rhea_ec False

6482 rows × 6 columns

df_nodes_rhea_ec
node_id layer ec_level
0 EC:1 rhea_ec main_class
1 EC:2 rhea_ec main_class
2 EC:6 rhea_ec main_class
3 EC:3 rhea_ec main_class
4 EC:5 rhea_ec main_class
... ... ... ...
6484 EC:1.14.13.35 rhea_ec full_ec
6485 EC:2.5.1.59 rhea_ec full_ec
6486 EC:2.5.1.60 rhea_ec full_ec
6487 EC:2.7.7.6 rhea_ec full_ec
6488 EC:2.7.7.48 rhea_ec full_ec

6489 rows × 3 columns

Preparing the remaining Rhea info#

We have now created the node and edge DataFrames for the enzyme classification info alone. Next, we create nodes and edges for the rest of the relevant Rhea info: the reactions and their ChEBI participants.

def explode_columns(df, columns, delimiter=";"):
    """
    Expand delimiter-separated cells into one row per element.

    Every column named in `columns` is split on `delimiter`, and the
    resulting lists are exploded together so that the i-th elements of
    each listed column end up on the same output row. The input frame
    is left unchanged.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        columns (list of str): Names of the columns to split and explode.
        delimiter (str): Separator used inside the cells (default ';').

    Returns:
        pd.DataFrame: A copy of `df` with the listed columns exploded;
        the other columns are repeated and the original index labels
        are kept (and therefore repeated as well).

    Raises:
        ValueError: If a name in `columns` is not a column of `df`.

    Note:
        Within a row, all listed columns must split into the same number
        of elements.
    """
    expanded = df.copy()
    for name in columns:
        if name not in expanded.columns:
            raise ValueError(f"Column '{name}' does not exist in the DataFrame.")
        # Replace the delimited string by a list of its parts.
        parts = expanded[name].str.split(delimiter)
        expanded[name] = parts
    # A single multi-column explode keeps the listed columns aligned.
    expanded = expanded.explode(columns)
    return expanded
df_rhea_exploded = explode_columns(df_rhea, ["ChEBI identifier", "ChEBI name"])
df_rhea_exploded
Reaction identifier Equation ChEBI name ChEBI identifier EC number Enzymes Gene Ontology PubMed Cross-reference (EcoCyc) Cross-reference (MetaCyc) Cross-reference (KEGG) Cross-reference (Reactome) Cross-reference (M-CSA)
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 (S)-2-hydroxyglutarate CHEBI:16782 EC:1.1.99.2 4258 GO:0047545 2-hydroxyglutarate dehydrogenase ac... 16746551;16005139;34555022;19586914 NaN MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN KEGG:R00298 NaN NaN
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 A CHEBI:13193 EC:1.1.99.2 4258 GO:0047545 2-hydroxyglutarate dehydrogenase ac... 16746551;16005139;34555022;19586914 NaN MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN KEGG:R00298 NaN NaN
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 2-oxoglutarate CHEBI:16810 EC:1.1.99.2 4258 GO:0047545 2-hydroxyglutarate dehydrogenase ac... 16746551;16005139;34555022;19586914 NaN MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN KEGG:R00298 NaN NaN
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 AH2 CHEBI:17499 EC:1.1.99.2 4258 GO:0047545 2-hydroxyglutarate dehydrogenase ac... 16746551;16005139;34555022;19586914 NaN MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN KEGG:R00298 NaN NaN
1 RHEA:21256 3-phosphoshikimate + phosphoenolpyruvate = 5-O... 3-phosphoshikimate CHEBI:145989 EC:2.5.1.19 44340 GO:0003866 3-phosphoshikimate 1-carboxyvinyltr... 4289188;1344882;11300775 EcoCyc:2.5.1.19-RXN MetaCyc:2.5.1.19-RXN KEGG:R03460 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
17781 RHEA:21244 an 8-hydroxyfurocoumarin + S-adenosyl-L-methio... S-adenosyl-L-homocysteine CHEBI:57856 EC:2.1.1.70 0 NaN 156999;15009205;28084 NaN MetaCyc:2.1.1.70-RXN KEGG:R02982 NaN NaN
17781 RHEA:21244 an 8-hydroxyfurocoumarin + S-adenosyl-L-methio... H(+) CHEBI:15378 EC:2.1.1.70 0 NaN 156999;15009205;28084 NaN MetaCyc:2.1.1.70-RXN KEGG:R02982 NaN NaN
17782 RHEA:21248 RNA(n) + a ribonucleoside 5'-triphosphate = RN... (nucleoside 5'-monophosphate)n CHEBI:140395 EC:2.7.7.6;EC:2.7.7.48 465623 GO:0034062 5'-3' RNA polymerase activity 4946277;5289245;4501121 EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy... KEGG:R00444 NaN NaN
17782 RHEA:21248 RNA(n) + a ribonucleoside 5'-triphosphate = RN... a ribonucleoside 5'-triphosphate CHEBI:61557 EC:2.7.7.6;EC:2.7.7.48 465623 GO:0034062 5'-3' RNA polymerase activity 4946277;5289245;4501121 EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy... KEGG:R00444 NaN NaN
17782 RHEA:21248 RNA(n) + a ribonucleoside 5'-triphosphate = RN... diphosphate CHEBI:33019 EC:2.7.7.6;EC:2.7.7.48 465623 GO:0034062 5'-3' RNA polymerase activity 4946277;5289245;4501121 EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy... KEGG:R00444 NaN NaN

83885 rows × 13 columns

# Build the ChEBI -> reaction edge list: one row per (reaction, participant)
# pair of the exploded table.
df_edges_rhea_reactiontochebi = df_rhea_exploded.copy()
# Keep only the two identifier columns (the ChEBI name is left out of the edges).
df_edges_rhea_reactiontochebi = df_edges_rhea_reactiontochebi[['Reaction identifier', 'ChEBI identifier']] #,'ChEBI name'
# Order matters here: 'Reaction identifier' becomes target_id and
# 'ChEBI identifier' becomes source_id.
df_edges_rhea_reactiontochebi.columns = ['target_id', 'source_id']
df_edges_rhea_reactiontochebi['source_layer'] = 'rhea_chebiid'
df_edges_rhea_reactiontochebi['target_layer'] = 'rhea_reactionid'
# NOTE(review): interlayer is False although source_layer and target_layer
# differ - confirm this is the intended meaning of the flag.
df_edges_rhea_reactiontochebi['interlayer'] = False
df_edges_rhea_reactiontochebi
target_id source_id source_layer target_layer interlayer
0 RHEA:21252 CHEBI:16782 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:13193 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:16810 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:17499 rhea_chebiid rhea_reactionid False
1 RHEA:21256 CHEBI:145989 rhea_chebiid rhea_reactionid False
... ... ... ... ... ...
17781 RHEA:21244 CHEBI:57856 rhea_chebiid rhea_reactionid False
17781 RHEA:21244 CHEBI:15378 rhea_chebiid rhea_reactionid False
17782 RHEA:21248 CHEBI:140395 rhea_chebiid rhea_reactionid False
17782 RHEA:21248 CHEBI:61557 rhea_chebiid rhea_reactionid False
17782 RHEA:21248 CHEBI:33019 rhea_chebiid rhea_reactionid False

83885 rows × 5 columns

df_rhea.columns
Index(['Reaction identifier', 'Equation', 'ChEBI name', 'ChEBI identifier',
       'EC number', 'Enzymes', 'Gene Ontology', 'PubMed',
       'Cross-reference (EcoCyc)', 'Cross-reference (MetaCyc)',
       'Cross-reference (KEGG)', 'Cross-reference (Reactome)',
       'Cross-reference (M-CSA)'],
      dtype='object')
# ChEBI participant nodes: one row per distinct (identifier, name) pair taken
# from the exploded table, tagged with the 'rhea_chebiid' layer.
df_nodes_rhea_chebiid = df_rhea_exploded[['ChEBI identifier', 'ChEBI name']].rename(
    columns={'ChEBI identifier': 'node_id', 'ChEBI name': 'chebi_name'}
).assign(layer='rhea_chebiid').drop_duplicates()
# df_nodes_rhea_chebiid

# Reaction nodes: built from the un-exploded table, so the ';'-separated
# participant and EC strings are kept as node attributes.
# previously also included 'Participant identifier','Enzyme class' 
df_nodes_rhea_reactionid = df_rhea[['Reaction identifier','Equation','ChEBI identifier', 'ChEBI name','EC number','Enzymes','Gene Ontology','Cross-reference (Reactome)',
                                    'PubMed','Cross-reference (KEGG)']].rename(
    columns={'Reaction identifier': 'node_id', 'ChEBI name': 'chebi_name'}
).assign(layer='rhea_reactionid').drop_duplicates()
# df_nodes_rhea_reactionid

# df_nodes_rhea_reactiontochebi = pd.concat([df_nodes_rhea_chebiid, df_nodes_rhea_reactionid])
# df_nodes_rhea_reactiontochebi

# One row per (reaction, EC number) pair; reactions with several EC numbers
# are repeated once per EC number.
df_rhea_exploded_enzymeclass = explode_columns(df_rhea, ["EC number"], delimiter=';')
df_rhea_exploded_enzymeclass

# Reaction -> EC number edges (source is the reaction, target the EC number).
# NOTE(review): reactions without an EC number keep a NaN 'EC number' after
# the explode and drop_duplicates() does not remove them, so this table
# appears to contain edges whose target_id is NaN. Consider adding
# .dropna(subset=['EC number']) before building the edges - confirm first
# whether downstream code relies on those rows.
df_edges_rhea_reactiontoenzymeclass = df_rhea_exploded_enzymeclass[['Reaction identifier', 'EC number']].rename(
    columns={'Reaction identifier': 'source_id', 'EC number': 'target_id'}
).assign(source_layer='rhea_reactionid', target_layer='rhea_ec', interlayer=False).drop_duplicates()
df_edges_rhea_reactiontoenzymeclass

# Stack both edge tables. Their source_id/target_id columns are listed in a
# different order; pd.concat aligns them by column name.
df_edges_rhea_reactiontochebi_comb = pd.concat([df_edges_rhea_reactiontochebi, df_edges_rhea_reactiontoenzymeclass,])
df_edges_rhea_reactiontochebi_comb

# Remake nodes with enzyme class included
# (attributes that a node type does not have are filled with NaN by concat).
df_nodes_rhea_reactiontochebi_comb = pd.concat([df_nodes_rhea_chebiid, 
                                           df_nodes_rhea_reactionid, 
                                           df_nodes_rhea_ec])
df_nodes_rhea_reactiontochebi_comb
node_id chebi_name layer Equation ChEBI identifier EC number Enzymes Gene Ontology Cross-reference (Reactome) PubMed Cross-reference (KEGG) ec_level
0 CHEBI:16782 (S)-2-hydroxyglutarate rhea_chebiid NaN NaN NaN NaN NaN NaN NaN NaN NaN
0 CHEBI:13193 A rhea_chebiid NaN NaN NaN NaN NaN NaN NaN NaN NaN
0 CHEBI:16810 2-oxoglutarate rhea_chebiid NaN NaN NaN NaN NaN NaN NaN NaN NaN
0 CHEBI:17499 AH2 rhea_chebiid NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 CHEBI:145989 3-phosphoshikimate rhea_chebiid NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
6484 EC:1.14.13.35 NaN rhea_ec NaN NaN NaN NaN NaN NaN NaN NaN full_ec
6485 EC:2.5.1.59 NaN rhea_ec NaN NaN NaN NaN NaN NaN NaN NaN full_ec
6486 EC:2.5.1.60 NaN rhea_ec NaN NaN NaN NaN NaN NaN NaN NaN full_ec
6487 EC:2.7.7.6 NaN rhea_ec NaN NaN NaN NaN NaN NaN NaN NaN full_ec
6488 EC:2.7.7.48 NaN rhea_ec NaN NaN NaN NaN NaN NaN NaN NaN full_ec

37995 rows × 12 columns

df_edges_rhea_reactiontochebi_comb
target_id source_id source_layer target_layer interlayer
0 RHEA:21252 CHEBI:16782 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:13193 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:16810 rhea_chebiid rhea_reactionid False
0 RHEA:21252 CHEBI:17499 rhea_chebiid rhea_reactionid False
1 RHEA:21256 CHEBI:145989 rhea_chebiid rhea_reactionid False
... ... ... ... ... ...
17780 EC:2.5.1.59 RHEA:21240 rhea_reactionid rhea_ec False
17780 EC:2.5.1.60 RHEA:21240 rhea_reactionid rhea_ec False
17781 EC:2.1.1.70 RHEA:21244 rhea_reactionid rhea_ec False
17782 EC:2.7.7.6 RHEA:21248 rhea_reactionid rhea_ec False
17782 EC:2.7.7.48 RHEA:21248 rhea_reactionid rhea_ec False

101957 rows × 5 columns

Get all chebi ids

df_nodes_rhea_reactiontochebi_comb[df_nodes_rhea_reactiontochebi_comb['layer']=='rhea_chebiid']['node_id'].drop_duplicates()
0         CHEBI:16782
0         CHEBI:13193
0         CHEBI:16810
0         CHEBI:17499
1        CHEBI:145989
             ...     
17773    CHEBI:144586
17775     CHEBI:57639
17777     CHEBI:18167
17781     CHEBI:52025
17781     CHEBI:52028
Name: node_id, Length: 13723, dtype: object

Using the LipiNet parse_rhea function#

The LipiNet parse_rhea function automatically runs through all of the same steps as we have just covered.

from lipinet.parse_rhea import parse_rhea_data

rhea_results = parse_rhea_data(verbose=False)
df_rhea_nodes = rhea_results['df_nodes']
df_rhea_edges = rhea_results['df_edges']
df_rhea_nodes
node_id Equation ChEBI identifier chebi_name EC number Enzymes Gene Ontology Cross-reference (Reactome) layer ec_level
0 RHEA:21252 (S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2 CHEBI:16782;CHEBI:13193;CHEBI:16810;CHEBI:17499 (S)-2-hydroxyglutarate;A;2-oxoglutarate;AH2 EC:1.1.99.2 4258.0 GO:0047545 2-hydroxyglutarate dehydrogenase ac... NaN rhea_reactionid NaN
1 RHEA:21256 3-phosphoshikimate + phosphoenolpyruvate = 5-O... CHEBI:145989;CHEBI:58702;CHEBI:57701;CHEBI:43474 3-phosphoshikimate;phosphoenolpyruvate;5-O-(1-... EC:2.5.1.19 44340.0 GO:0003866 3-phosphoshikimate 1-carboxyvinyltr... NaN rhea_reactionid NaN
2 RHEA:21260 [thioredoxin]-disulfide + L-methionine + H2O =... CHEBI:50058;CHEBI:57844;CHEBI:15377;CHEBI:5877... L-cystine residue;L-methionine;H2O;L-methionin... EC:1.8.4.14 3112.0 GO:0033745 L-methionine-(R)-S-oxide reductase ... NaN rhea_reactionid NaN
3 RHEA:21264 glycolate + A = glyoxylate + AH2 CHEBI:29805;CHEBI:13193;CHEBI:36655;CHEBI:17499 glycolate;A;glyoxylate;AH2 EC:1.1.99.14 14321.0 GO:0019154 glycolate dehydrogenase activity NaN rhea_reactionid NaN
4 RHEA:21268 (R)-canadine + 2 NADP(+) = berberine + 2 NADPH... CHEBI:18146;CHEBI:58349;CHEBI:16118;CHEBI:5778... (R)-canadine;NADP(+);berberine;NADPH;H(+) EC:1.5.1.31 0.0 GO:0050623 berberine reductase activity NaN rhea_reactionid NaN
... ... ... ... ... ... ... ... ... ... ...
37990 EC:1.14.13.35 NaN NaN NaN NaN NaN NaN NaN rhea_ec full_ec
37991 EC:2.5.1.59 NaN NaN NaN NaN NaN NaN NaN rhea_ec full_ec
37992 EC:2.5.1.60 NaN NaN NaN NaN NaN NaN NaN rhea_ec full_ec
37993 EC:2.7.7.6 NaN NaN NaN NaN NaN NaN NaN rhea_ec full_ec
37994 EC:2.7.7.48 NaN NaN NaN NaN NaN NaN NaN rhea_ec full_ec

37995 rows × 10 columns

df_rhea_edges
source_id target_id ec_level source_layer target_layer interlayer edge_type
0 EC:1 EC:1.1 main_class->subclass rhea_ec rhea_ec False ec_hierarchy
1 EC:2 EC:2.5 main_class->subclass rhea_ec rhea_ec False ec_hierarchy
2 EC:1 EC:1.8 main_class->subclass rhea_ec rhea_ec False ec_hierarchy
3 EC:1 EC:1.5 main_class->subclass rhea_ec rhea_ec False ec_hierarchy
4 EC:6 EC:6.3 main_class->subclass rhea_ec rhea_ec False ec_hierarchy
... ... ... ... ... ... ... ...
108434 RHEA:21240 EC:2.5.1.59 NaN rhea_reactionid rhea_ec False reaction_ec
108435 RHEA:21240 EC:2.5.1.60 NaN rhea_reactionid rhea_ec False reaction_ec
108436 RHEA:21244 EC:2.1.1.70 NaN rhea_reactionid rhea_ec False reaction_ec
108437 RHEA:21248 EC:2.7.7.6 NaN rhea_reactionid rhea_ec False reaction_ec
108438 RHEA:21248 EC:2.7.7.48 NaN rhea_reactionid rhea_ec False reaction_ec

108439 rows × 7 columns

df_rhea_edges['ec_level'].value_counts(dropna=False)
ec_level
NaN                       101957
subsubclass->EC_number      6153
subclass->subsubclass        253
main_class->subclass          76
Name: count, dtype: int64
df_rhea_nodes['ec_level'].value_counts(dropna=False)
ec_level
NaN            31506
full_ec         6153
subsubclass      253
subclass          76
main_class         7
Name: count, dtype: int64