Parse Rhea

Parse Rhea#

Parsing Rhea into a network for LipiNet

import importlib

# Reload the module to ensure changes are picked up
#importlib.reload(lipinet)

# Now can use the functions after reloading the module
from lipinet.databases import get_prior_knowledge 
from lipinet.utils import split_and_expand_large, create_nodedf_from_edgedf, check_for_split_characters

import pandas as pd

import plotly.express as px
import seaborn as sns 
import missingno as msno 
import matplotlib.pyplot as plt

Parsing the manual way#

# Use the get_prior_knowledge function for df_rhea
df_rhea = get_prior_knowledge('rhea', verbose=True)
df_rhea

File found locally at /Users/agjanyunlu/Documents/Metabolomics/lipinet/lipinet/.data/downloaded/rhea.tsv. Loading data...

	Reaction identifier	Equation	ChEBI name	ChEBI identifier	EC number	Enzymes	Gene Ontology	PubMed	Cross-reference (EcoCyc)	Cross-reference (MetaCyc)	Cross-reference (KEGG)	Cross-reference (Reactome)	Cross-reference (M-CSA)
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	(S)-2-hydroxyglutarate;A;2-oxoglutarate;AH2	CHEBI:16782;CHEBI:13193;CHEBI:16810;CHEBI:17499	EC:1.1.99.2	4258	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	16746551;16005139;34555022;19586914	NaN	MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN	KEGG:R00298	NaN	NaN
1	RHEA:21256	3-phosphoshikimate + phosphoenolpyruvate = 5-O...	3-phosphoshikimate;phosphoenolpyruvate;5-O-(1-...	CHEBI:145989;CHEBI:58702;CHEBI:57701;CHEBI:43474	EC:2.5.1.19	44340	GO:0003866 3-phosphoshikimate 1-carboxyvinyltr...	4289188;1344882;11300775	EcoCyc:2.5.1.19-RXN	MetaCyc:2.5.1.19-RXN	KEGG:R03460	NaN	NaN
2	RHEA:21260	[thioredoxin]-disulfide + L-methionine + H2O =...	L-cystine residue;L-methionine;H2O;L-methionin...	CHEBI:50058;CHEBI:57844;CHEBI:15377;CHEBI:5877...	EC:1.8.4.14	3112	GO:0033745 L-methionine-(R)-S-oxide reductase ...	20382110;20504774;20851894;19049972;12504094;2...	EcoCyc:1.8.4.14-RXN	MetaCyc:1.8.4.14-RXN	KEGG:R07608	NaN	NaN
3	RHEA:21264	glycolate + A = glyoxylate + AH2	glycolate;A;glyoxylate;AH2	CHEBI:29805;CHEBI:13193;CHEBI:36655;CHEBI:17499	EC:1.1.99.14	14321	GO:0019154 glycolate dehydrogenase activity	4557653	EcoCyc:GLYCOLATEDEHYDRO-RXN	MetaCyc:GLYCOLATEDEHYDRO-RXN	KEGG:R00476	NaN	NaN
4	RHEA:21268	(R)-canadine + 2 NADP(+) = berberine + 2 NADPH...	(R)-canadine;NADP(+);berberine;NADPH;H(+)	CHEBI:18146;CHEBI:58349;CHEBI:16118;CHEBI:5778...	EC:1.5.1.31	0	GO:0050623 berberine reductase activity	NaN	NaN	MetaCyc:1.5.1.31-RXN	KEGG:R07169	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
17778	RHEA:21232	L-histidine = trans-urocanate + NH4(+)	L-histidine;trans-urocanate;NH4(+)	CHEBI:57595;CHEBI:17771;CHEBI:28938	EC:4.3.1.3	21113	GO:0004397 histidine ammonia-lyase activity	15906398;11732994;39337646	NaN	MetaCyc:HISTIDINE-AMMONIA-LYASE-RXN	KEGG:R01168	Reactome:R-HSA-70899.6	NaN
17779	RHEA:21236	anthranilate + NADPH + O2 + 2 H(+) = 2,3-dihyd...	anthranilate;NADPH;O2;H(+);2,3-dihydroxybenzoa...	CHEBI:16567;CHEBI:57783;CHEBI:15379;CHEBI:1537...	EC:1.14.13.35	0	GO:0018672 anthranilate 3-monooxygenase (deami...	6501219;3793735	NaN	MetaCyc:1.14.13.35-RXN	KEGG:R00980	NaN	NaN
17780	RHEA:21240	geranylgeranyl diphosphate + L-cysteinyl-[prot...	geranylgeranyl diphosphate;L-cysteine residue;...	CHEBI:57533;CHEBI:29950;CHEBI:86021;CHEBI:33019	EC:2.5.1.59;EC:2.5.1.60	10991	GO:0004661 protein geranylgeranyltransferase a...	12036349;9121446;19894725;8106351;10745007;126...	NaN	MetaCyc:RXN-3701	NaN	NaN	NaN
17781	RHEA:21244	an 8-hydroxyfurocoumarin + S-adenosyl-L-methio...	an 8-hydroxyfurocoumarin;S-adenosyl-L-methioni...	CHEBI:52025;CHEBI:59789;CHEBI:52028;CHEBI:5785...	EC:2.1.1.70	0	NaN	156999;15009205;28084	NaN	MetaCyc:2.1.1.70-RXN	KEGG:R02982	NaN	NaN
17782	RHEA:21248	RNA(n) + a ribonucleoside 5'-triphosphate = RN...	(nucleoside 5'-monophosphate)n;a ribonucleosid...	CHEBI:140395;CHEBI:61557;CHEBI:33019	EC:2.7.7.6;EC:2.7.7.48	465623	GO:0034062 5'-3' RNA polymerase activity	4946277;5289245;4501121	EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN	MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy...	KEGG:R00444	NaN	NaN

17783 rows × 13 columns

# Create a figure with a specific size
fig, ax = plt.subplots(figsize=(15, 8))  # Adjust the figsize parameter (width, height)

# Create the heatmap
sns.heatmap(df_rhea.isnull(), ax=ax) #sns.heatmap(df, ax=ax, cmap="YlGnBu")

<Axes: >

../_images/b703604d0639541000b5b865cd2cab7f1fe76989234f3d7c11b4d03443a97d42.png

What is the most common participant identifier across all entries?

df_rhea['ChEBI identifier'].str.split(';').explode().value_counts()

ChEBI identifier
CHEBI:15378     9932
CHEBI:15377     6485
CHEBI:15379     2851
CHEBI:57287     1567
CHEBI:58349     1335
                ... 
CHEBI:138211       1
CHEBI:138191       1
CHEBI:18534        1
CHEBI:18533        1
CHEBI:52028        1
Name: count, Length: 13723, dtype: int64

What are the names of these molecules?

df_rhea['ChEBI name'].str.split(';').explode().value_counts()

ChEBI name
H(+)                                                      9932
H2O                                                       6485
O2                                                        2851
CoA                                                       1567
NADP(+)                                                   1335
                                                          ... 
1-octadecanoyl-2-pentanoyl-sn-glycero-3-phosphocholine       1
3'-(N-acyl-L-alpha-aminoacyl)adenylyl 3'-end residue         1
campest-4-en-3-one                                           1
5alpha-campestan-3-one                                       1
an 8-methoxyfurocoumarin                                     1
Name: count, Length: 13723, dtype: int64

So it seems that Hydrogen ions, water, and O2 are the most common ChEBI molecules in Rhea

Are there any commonly represented GO terms?

df_rhea['Gene Ontology'].value_counts(dropna=False)

Gene Ontology
NaN                                                                                            13398
GO:0047545 2-hydroxyglutarate dehydrogenase activity                                               1
GO:0015046 rubredoxin-NADP+ reductase activity                                                     1
GO:0004413 homoserine kinase activity                                                              1
GO:0008721 D-serine ammonia-lyase activity                                                         1
                                                                                               ...  
GO:0008176 tRNA (guanine(46)-N7)-methyltransferase activity                                        1
GO:0030697 tRNA (uracil(54)-C5)-methyltransferase activity, S-adenosyl methionine-dependent        1
GO:0043828 tRNA 2-selenouridine synthase activity                                                  1
GO:0052914 16S rRNA (guanine(1207)-N(2))-methyltransferase activity                                1
GO:0034062 5'-3' RNA polymerase activity                                                           1
Name: count, Length: 4386, dtype: int64

It seems that all GO terms are unique, besides the ones that are NaN.

What about common enzyme classes?

# df_rhea['Enzyme class'].value_counts()

How many don’t have any ec class?

df_rhea['EC number'].notna().value_counts()

EC number
False    10214
True      7569
Name: count, dtype: int64

What about EC number, taking into account the possibility for multiple enzyme class numbers to be assigned per reaction?

df_rhea['EC number'].str.split(';').explode().value_counts()

EC number
EC:3.6.1.9        14
EC:1.14.13.105    13
EC:1.14.12.23     12
EC:2.7.4.6        12
EC:3.1.3.62       12
                  ..
EC:2.5.1.122       1
EC:2.5.1.121       1
EC:2.4.1.328       1
EC:2.4.1.326       1
EC:2.7.7.48        1
Name: count, Length: 6153, dtype: int64

How common is it actually to have multiple EC numbers?

df_rhea['EC number'].str.split(';').str.len().value_counts()

EC number
0     7343
0      202
0       18
0       2
0        2
0        1
0        1
Name: count, dtype: int64

Note that the IUBMB number actually is determined by a hierarchy. Because of this, let’s turn it into a nice df ready to be parsed as a hierarchy.

Preparing the Rhea enzyme classification (EC) info#

def process_ec_numbers(df):
    """
    Process the 'EC number' column of the input DataFrame.

    Parameters:
        df (pd.DataFrame): A DataFrame containing an 'EC number' column.

    Returns:
        pd.DataFrame: A new DataFrame with the following columns:
            - 'EC_number': The reassembled EC number in the format 'EC:Main_Class.Subclass.Subsubclass.Serial_Number'
            - 'Main_Class': The first part of the EC number.
            - 'Subclass': The second part of the EC number.
            - 'Subsubclass': The third part of the EC number.
            - 'Serial_Number': The fourth part of the EC number.
    """
    # Split, explode, deduplicate, clean and split by '.' into a Series of lists.
    ec_num_series = df['EC number'].str.split(';')\
                        .explode()\
                        .drop_duplicates()\
                        .dropna()\
                        .str.strip('EC:')\
                        .str.split('.')
    
    # Convert the Series of lists into a DataFrame with named columns.
    df_ec = pd.DataFrame(ec_num_series.tolist(), 
                         columns=["Main_Class", "Subclass", "Subsubclass", "Serial_Number"])
    
    # Create a new column with the reassembled EC number in the original format.
    df_ec.insert(0, 'EC_number', 
                 'EC:' + df_ec["Main_Class"].astype(str) + '.' +
                 df_ec["Subclass"].astype(str) + '.' +
                 df_ec["Subsubclass"].astype(str) + '.' +
                 df_ec["Serial_Number"].astype(str))
    
    return df_ec

# Assuming df_rhea is your original DataFrame with the 'EC number' column.
df_ec = process_ec_numbers(df_rhea)
df_ec

	EC_number	Main_Class	Subclass	Subsubclass	Serial_Number
0	EC:1.1.99.2	1	1	99	2
1	EC:2.5.1.19	2	5	1	19
2	EC:1.8.4.14	1	8	4	14
3	EC:1.1.99.14	1	1	99	14
4	EC:1.5.1.31	1	5	1	31
...	...	...	...	...	...
6148	EC:1.14.13.35	1	14	13	35
6149	EC:2.5.1.59	2	5	1	59
6150	EC:2.5.1.60	2	5	1	60
6151	EC:2.7.7.6	2	7	7	6
6152	EC:2.7.7.48	2	7	7	48

6153 rows × 5 columns

def build_rhea_ec_edges_and_nodes(df_ec: pd.DataFrame):
    """
    Given a DataFrame with EC hierarchy columns:
        Main_Class, Subclass, Subsubclass, EC_number,
    this function creates:
      - A DataFrame of edges linking each hierarchical level.
      - A DataFrame of unique nodes with a 'ec_level' column 
        indicating the node's level in the hierarchy.
    """

    # -- Make a copy so we don't modify the original df in-place
    df = df_ec.copy()

    # 1) Build the node representations for each hierarchy level
    #    (convert to string just in case they're numeric)
    df["main_node"] = "EC:" + df["Main_Class"].astype(str)
    df["subclass_node"] = (
        "EC:" + df["Main_Class"].astype(str) 
        + "." + df["Subclass"].astype(str)
    )
    df["subsubclass_node"] = (
        "EC:" + df["Main_Class"].astype(str) 
        + "." + df["Subclass"].astype(str)
        + "." + df["Subsubclass"].astype(str)
    )

    # 2) Create edges for each hierarchical link and label them
    edges1 = df[["main_node", "subclass_node"]].rename(
        columns={"main_node": "source_id", "subclass_node": "target_id"}
    )
    edges1["ec_level"] = "main_class->subclass"

    edges2 = df[["subclass_node", "subsubclass_node"]].rename(
        columns={"subclass_node": "source_id", "subsubclass_node": "target_id"}
    )
    edges2["ec_level"] = "subclass->subsubclass"

    edges3 = df[["subsubclass_node", "EC_number"]].rename(
        columns={"subsubclass_node": "source_id", "EC_number": "target_id"}
    )
    edges3["ec_level"] = "subsubclass->EC_number"

    # Concatenate edges and drop duplicates
    edges_df = pd.concat([edges1, edges2, edges3], ignore_index=True).drop_duplicates()

    # Add the additional columns for your graph structure
    edges_df["source_layer"] = "rhea_ec"
    edges_df["target_layer"] = "rhea_ec"
    edges_df["interlayer"] = False

    # 3) Build a node DataFrame from all unique source/target IDs
    nodes_df = pd.DataFrame(
        pd.concat([edges_df["source_id"], edges_df["target_id"]]).unique(),
        columns=["node_id"]
    )
    nodes_df["layer"] = "rhea_ec"  # same layer for all

    # 4) Build a small lookup table to label each node with ec_level
    #    We'll mark each node as either "main_class", "subclass", "subsubclass", or "full_ec".
    df_main = (
        df[["main_node"]].drop_duplicates().rename(columns={"main_node": "node_id"})
    )
    df_main["ec_level"] = "main_class"

    df_sub = (
        df[["subclass_node"]].drop_duplicates().rename(columns={"subclass_node": "node_id"})
    )
    df_sub["ec_level"] = "subclass"

    df_subsub = (
        df[["subsubclass_node"]].drop_duplicates().rename(columns={"subsubclass_node": "node_id"})
    )
    df_subsub["ec_level"] = "subsubclass"

    df_full = (
        df[["EC_number"]].drop_duplicates().rename(columns={"EC_number": "node_id"})
    )
    df_full["ec_level"] = "full_ec"

    df_nodes_level = pd.concat([df_main, df_sub, df_subsub, df_full], ignore_index=True)
    df_nodes_level.drop_duplicates(subset="node_id", inplace=True)

    # 5) Merge the node-level info into the nodes_df
    nodes_df = nodes_df.merge(df_nodes_level, on="node_id", how="left")

    # Return the final DataFrames
    return edges_df, nodes_df

# Example usage:
# df_edges_rhea_ec, df_nodes_rhea_ec = build_rhea_ec_edges_and_nodes(df_ec)

df_edges_rhea_ec, df_nodes_rhea_ec = build_rhea_ec_edges_and_nodes(df_ec)

df_edges_rhea_ec

	source_id	target_id	ec_level	source_layer	target_layer	interlayer
0	EC:1	EC:1.1	main_class->subclass	rhea_ec	rhea_ec	False
1	EC:2	EC:2.5	main_class->subclass	rhea_ec	rhea_ec	False
2	EC:1	EC:1.8	main_class->subclass	rhea_ec	rhea_ec	False
4	EC:1	EC:1.5	main_class->subclass	rhea_ec	rhea_ec	False
5	EC:6	EC:6.3	main_class->subclass	rhea_ec	rhea_ec	False
...	...	...	...	...	...	...
18454	EC:1.14.13	EC:1.14.13.35	subsubclass->EC_number	rhea_ec	rhea_ec	False
18455	EC:2.5.1	EC:2.5.1.59	subsubclass->EC_number	rhea_ec	rhea_ec	False
18456	EC:2.5.1	EC:2.5.1.60	subsubclass->EC_number	rhea_ec	rhea_ec	False
18457	EC:2.7.7	EC:2.7.7.6	subsubclass->EC_number	rhea_ec	rhea_ec	False
18458	EC:2.7.7	EC:2.7.7.48	subsubclass->EC_number	rhea_ec	rhea_ec	False

6482 rows × 6 columns

df_nodes_rhea_ec

	node_id	layer	ec_level
0	EC:1	rhea_ec	main_class
1	EC:2	rhea_ec	main_class
2	EC:6	rhea_ec	main_class
3	EC:3	rhea_ec	main_class
4	EC:5	rhea_ec	main_class
...	...	...	...
6484	EC:1.14.13.35	rhea_ec	full_ec
6485	EC:2.5.1.59	rhea_ec	full_ec
6486	EC:2.5.1.60	rhea_ec	full_ec
6487	EC:2.7.7.6	rhea_ec	full_ec
6488	EC:2.7.7.48	rhea_ec	full_ec

6489 rows × 3 columns

Preparing the remaining Rhea info#

Now we have created the df of nodes and edges for just the enzyme classification info. We want to also create nodes and edges for all relevant rhea info too.

def explode_columns(df, columns, delimiter=";"):
    """
    Split and explode the specified columns of a DataFrame.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        columns (list of str): List of column names to split by the delimiter.
        delimiter (str): The delimiter to use when splitting the column values.

    Returns:
        pd.DataFrame: A new DataFrame with the specified columns exploded.

    Note:
        Each row in the specified columns must produce lists of the same length.
    """
    df = df.copy()
    for col in columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' does not exist in the DataFrame.")
        df[col] = df[col].str.split(delimiter)
    return df.explode(columns)

df_rhea_exploded = explode_columns(df_rhea, ["ChEBI identifier", "ChEBI name"])
df_rhea_exploded

	Reaction identifier	Equation	ChEBI name	ChEBI identifier	EC number	Enzymes	Gene Ontology	PubMed	Cross-reference (EcoCyc)	Cross-reference (MetaCyc)	Cross-reference (KEGG)	Cross-reference (Reactome)	Cross-reference (M-CSA)
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	(S)-2-hydroxyglutarate	CHEBI:16782	EC:1.1.99.2	4258	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	16746551;16005139;34555022;19586914	NaN	MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN	KEGG:R00298	NaN	NaN
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	A	CHEBI:13193	EC:1.1.99.2	4258	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	16746551;16005139;34555022;19586914	NaN	MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN	KEGG:R00298	NaN	NaN
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	2-oxoglutarate	CHEBI:16810	EC:1.1.99.2	4258	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	16746551;16005139;34555022;19586914	NaN	MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN	KEGG:R00298	NaN	NaN
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	AH2	CHEBI:17499	EC:1.1.99.2	4258	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	16746551;16005139;34555022;19586914	NaN	MetaCyc:2-HYDROXYGLUTARATE-DEHYDROGENASE-RXN	KEGG:R00298	NaN	NaN
1	RHEA:21256	3-phosphoshikimate + phosphoenolpyruvate = 5-O...	3-phosphoshikimate	CHEBI:145989	EC:2.5.1.19	44340	GO:0003866 3-phosphoshikimate 1-carboxyvinyltr...	4289188;1344882;11300775	EcoCyc:2.5.1.19-RXN	MetaCyc:2.5.1.19-RXN	KEGG:R03460	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
17781	RHEA:21244	an 8-hydroxyfurocoumarin + S-adenosyl-L-methio...	S-adenosyl-L-homocysteine	CHEBI:57856	EC:2.1.1.70	0	NaN	156999;15009205;28084	NaN	MetaCyc:2.1.1.70-RXN	KEGG:R02982	NaN	NaN
17781	RHEA:21244	an 8-hydroxyfurocoumarin + S-adenosyl-L-methio...	H(+)	CHEBI:15378	EC:2.1.1.70	0	NaN	156999;15009205;28084	NaN	MetaCyc:2.1.1.70-RXN	KEGG:R02982	NaN	NaN
17782	RHEA:21248	RNA(n) + a ribonucleoside 5'-triphosphate = RN...	(nucleoside 5'-monophosphate)n	CHEBI:140395	EC:2.7.7.6;EC:2.7.7.48	465623	GO:0034062 5'-3' RNA polymerase activity	4946277;5289245;4501121	EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN	MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy...	KEGG:R00444	NaN	NaN
17782	RHEA:21248	RNA(n) + a ribonucleoside 5'-triphosphate = RN...	a ribonucleoside 5'-triphosphate	CHEBI:61557	EC:2.7.7.6;EC:2.7.7.48	465623	GO:0034062 5'-3' RNA polymerase activity	4946277;5289245;4501121	EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN	MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy...	KEGG:R00444	NaN	NaN
17782	RHEA:21248	RNA(n) + a ribonucleoside 5'-triphosphate = RN...	diphosphate	CHEBI:33019	EC:2.7.7.6;EC:2.7.7.48	465623	GO:0034062 5'-3' RNA polymerase activity	4946277;5289245;4501121	EcoCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN	MetaCyc:DNA-DIRECTED-RNA-POLYMERASE-RXN,MetaCy...	KEGG:R00444	NaN	NaN

83885 rows × 13 columns

df_edges_rhea_reactiontochebi = df_rhea_exploded.copy()
df_edges_rhea_reactiontochebi = df_edges_rhea_reactiontochebi[['Reaction identifier', 'ChEBI identifier']] #,'ChEBI name'
df_edges_rhea_reactiontochebi.columns = ['target_id', 'source_id']
df_edges_rhea_reactiontochebi['source_layer'] = 'rhea_chebiid'
df_edges_rhea_reactiontochebi['target_layer'] = 'rhea_reactionid'
df_edges_rhea_reactiontochebi['interlayer'] = False
df_edges_rhea_reactiontochebi

	target_id	source_id	source_layer	target_layer	interlayer
0	RHEA:21252	CHEBI:16782	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:13193	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:16810	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:17499	rhea_chebiid	rhea_reactionid	False
1	RHEA:21256	CHEBI:145989	rhea_chebiid	rhea_reactionid	False
...	...	...	...	...	...
17781	RHEA:21244	CHEBI:57856	rhea_chebiid	rhea_reactionid	False
17781	RHEA:21244	CHEBI:15378	rhea_chebiid	rhea_reactionid	False
17782	RHEA:21248	CHEBI:140395	rhea_chebiid	rhea_reactionid	False
17782	RHEA:21248	CHEBI:61557	rhea_chebiid	rhea_reactionid	False
17782	RHEA:21248	CHEBI:33019	rhea_chebiid	rhea_reactionid	False

83885 rows × 5 columns

df_rhea.columns

Index(['Reaction identifier', 'Equation', 'ChEBI name', 'ChEBI identifier',
       'EC number', 'Enzymes', 'Gene Ontology', 'PubMed',
       'Cross-reference (EcoCyc)', 'Cross-reference (MetaCyc)',
       'Cross-reference (KEGG)', 'Cross-reference (Reactome)',
       'Cross-reference (M-CSA)'],
      dtype='object')

df_nodes_rhea_chebiid = df_rhea_exploded[['ChEBI identifier', 'ChEBI name']].rename(
    columns={'ChEBI identifier': 'node_id', 'ChEBI name': 'chebi_name'}
).assign(layer='rhea_chebiid').drop_duplicates()
# df_nodes_rhea_chebiid

# previously also included 'Participant identifier','Enzyme class' 
df_nodes_rhea_reactionid = df_rhea[['Reaction identifier','Equation','ChEBI identifier', 'ChEBI name','EC number','Enzymes','Gene Ontology','Cross-reference (Reactome)',
                                    'PubMed','Cross-reference (KEGG)']].rename(
    columns={'Reaction identifier': 'node_id', 'ChEBI name': 'chebi_name'}
).assign(layer='rhea_reactionid').drop_duplicates()
# df_nodes_rhea_reactionid

# df_nodes_rhea_reactiontochebi = pd.concat([df_nodes_rhea_chebiid, df_nodes_rhea_reactionid])
# df_nodes_rhea_reactiontochebi

df_rhea_exploded_enzymeclass = explode_columns(df_rhea, ["EC number"], delimiter=';')
df_rhea_exploded_enzymeclass

df_edges_rhea_reactiontoenzymeclass = df_rhea_exploded_enzymeclass[['Reaction identifier', 'EC number']].rename(
    columns={'Reaction identifier': 'source_id', 'EC number': 'target_id'}
).assign(source_layer='rhea_reactionid', target_layer='rhea_ec', interlayer=False).drop_duplicates()
df_edges_rhea_reactiontoenzymeclass

df_edges_rhea_reactiontochebi_comb = pd.concat([df_edges_rhea_reactiontochebi, df_edges_rhea_reactiontoenzymeclass,])
df_edges_rhea_reactiontochebi_comb

# Remake nodes with enzyme class included
df_nodes_rhea_reactiontochebi_comb = pd.concat([df_nodes_rhea_chebiid, 
                                           df_nodes_rhea_reactionid, 
                                           df_nodes_rhea_ec])

df_nodes_rhea_reactiontochebi_comb

	node_id	chebi_name	layer	Equation	ChEBI identifier	EC number	Enzymes	Gene Ontology	Cross-reference (Reactome)	PubMed	Cross-reference (KEGG)	ec_level
0	CHEBI:16782	(S)-2-hydroxyglutarate	rhea_chebiid	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
0	CHEBI:13193	A	rhea_chebiid	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
0	CHEBI:16810	2-oxoglutarate	rhea_chebiid	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
0	CHEBI:17499	AH2	rhea_chebiid	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	CHEBI:145989	3-phosphoshikimate	rhea_chebiid	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...
6484	EC:1.14.13.35	NaN	rhea_ec	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	full_ec
6485	EC:2.5.1.59	NaN	rhea_ec	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	full_ec
6486	EC:2.5.1.60	NaN	rhea_ec	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	full_ec
6487	EC:2.7.7.6	NaN	rhea_ec	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	full_ec
6488	EC:2.7.7.48	NaN	rhea_ec	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	full_ec

37995 rows × 12 columns

df_edges_rhea_reactiontochebi_comb

	target_id	source_id	source_layer	target_layer	interlayer
0	RHEA:21252	CHEBI:16782	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:13193	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:16810	rhea_chebiid	rhea_reactionid	False
0	RHEA:21252	CHEBI:17499	rhea_chebiid	rhea_reactionid	False
1	RHEA:21256	CHEBI:145989	rhea_chebiid	rhea_reactionid	False
...	...	...	...	...	...
17780	EC:2.5.1.59	RHEA:21240	rhea_reactionid	rhea_ec	False
17780	EC:2.5.1.60	RHEA:21240	rhea_reactionid	rhea_ec	False
17781	EC:2.1.1.70	RHEA:21244	rhea_reactionid	rhea_ec	False
17782	EC:2.7.7.6	RHEA:21248	rhea_reactionid	rhea_ec	False
17782	EC:2.7.7.48	RHEA:21248	rhea_reactionid	rhea_ec	False

101957 rows × 5 columns

Get all chebi ids

df_nodes_rhea_reactiontochebi_comb[df_nodes_rhea_reactiontochebi_comb['layer']=='rhea_chebiid']['node_id'].drop_duplicates()

       CHEBI:16782
       CHEBI:13193
       CHEBI:16810
       CHEBI:17499
      CHEBI:145989
             ...     
  CHEBI:144586
   CHEBI:57639
   CHEBI:18167
   CHEBI:52025
   CHEBI:52028
Name: node_id, Length: 13723, dtype: object

Using the LipiNet `parse_rhea` function#

The LipiNet parse_rhea function automatically runs through all of the same steps as we have just covered.

from lipinet.parse_rhea import parse_rhea_data

rhea_results = parse_rhea_data(verbose=False)
df_rhea_nodes = rhea_results['df_nodes']
df_rhea_edges = rhea_results['df_edges']

df_rhea_nodes

	node_id	Equation	ChEBI identifier	chebi_name	EC number	Enzymes	Gene Ontology	Cross-reference (Reactome)	layer	ec_level
0	RHEA:21252	(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2	CHEBI:16782;CHEBI:13193;CHEBI:16810;CHEBI:17499	(S)-2-hydroxyglutarate;A;2-oxoglutarate;AH2	EC:1.1.99.2	4258.0	GO:0047545 2-hydroxyglutarate dehydrogenase ac...	NaN	rhea_reactionid	NaN
1	RHEA:21256	3-phosphoshikimate + phosphoenolpyruvate = 5-O...	CHEBI:145989;CHEBI:58702;CHEBI:57701;CHEBI:43474	3-phosphoshikimate;phosphoenolpyruvate;5-O-(1-...	EC:2.5.1.19	44340.0	GO:0003866 3-phosphoshikimate 1-carboxyvinyltr...	NaN	rhea_reactionid	NaN
2	RHEA:21260	[thioredoxin]-disulfide + L-methionine + H2O =...	CHEBI:50058;CHEBI:57844;CHEBI:15377;CHEBI:5877...	L-cystine residue;L-methionine;H2O;L-methionin...	EC:1.8.4.14	3112.0	GO:0033745 L-methionine-(R)-S-oxide reductase ...	NaN	rhea_reactionid	NaN
3	RHEA:21264	glycolate + A = glyoxylate + AH2	CHEBI:29805;CHEBI:13193;CHEBI:36655;CHEBI:17499	glycolate;A;glyoxylate;AH2	EC:1.1.99.14	14321.0	GO:0019154 glycolate dehydrogenase activity	NaN	rhea_reactionid	NaN
4	RHEA:21268	(R)-canadine + 2 NADP(+) = berberine + 2 NADPH...	CHEBI:18146;CHEBI:58349;CHEBI:16118;CHEBI:5778...	(R)-canadine;NADP(+);berberine;NADPH;H(+)	EC:1.5.1.31	0.0	GO:0050623 berberine reductase activity	NaN	rhea_reactionid	NaN
...	...	...	...	...	...	...	...	...	...	...
37990	EC:1.14.13.35	NaN	NaN	NaN	NaN	NaN	NaN	NaN	rhea_ec	full_ec
37991	EC:2.5.1.59	NaN	NaN	NaN	NaN	NaN	NaN	NaN	rhea_ec	full_ec
37992	EC:2.5.1.60	NaN	NaN	NaN	NaN	NaN	NaN	NaN	rhea_ec	full_ec
37993	EC:2.7.7.6	NaN	NaN	NaN	NaN	NaN	NaN	NaN	rhea_ec	full_ec
37994	EC:2.7.7.48	NaN	NaN	NaN	NaN	NaN	NaN	NaN	rhea_ec	full_ec

37995 rows × 10 columns

df_rhea_edges

	source_id	target_id	ec_level	source_layer	target_layer	interlayer	edge_type
0	EC:1	EC:1.1	main_class->subclass	rhea_ec	rhea_ec	False	ec_hierarchy
1	EC:2	EC:2.5	main_class->subclass	rhea_ec	rhea_ec	False	ec_hierarchy
2	EC:1	EC:1.8	main_class->subclass	rhea_ec	rhea_ec	False	ec_hierarchy
3	EC:1	EC:1.5	main_class->subclass	rhea_ec	rhea_ec	False	ec_hierarchy
4	EC:6	EC:6.3	main_class->subclass	rhea_ec	rhea_ec	False	ec_hierarchy
...	...	...	...	...	...	...	...
108434	RHEA:21240	EC:2.5.1.59	NaN	rhea_reactionid	rhea_ec	False	reaction_ec
108435	RHEA:21240	EC:2.5.1.60	NaN	rhea_reactionid	rhea_ec	False	reaction_ec
108436	RHEA:21244	EC:2.1.1.70	NaN	rhea_reactionid	rhea_ec	False	reaction_ec
108437	RHEA:21248	EC:2.7.7.6	NaN	rhea_reactionid	rhea_ec	False	reaction_ec
108438	RHEA:21248	EC:2.7.7.48	NaN	rhea_reactionid	rhea_ec	False	reaction_ec

108439 rows × 7 columns

df_rhea_edges['ec_level'].value_counts(dropna=False)

ec_level
NaN                       101957
subsubclass->EC_number      6153
subclass->subsubclass        253
main_class->subclass          76
Name: count, dtype: int64

df_rhea_nodes['ec_level'].value_counts(dropna=False)

ec_level
NaN            31506
full_ec         6153
subsubclass      253
subclass          76
main_class         7
Name: count, dtype: int64