如何将 KEGG 模块 (md:) 平面文件解析为 Python 中的 Pandas DataFrame? (即,逐节解析文本,逐行阅读一次)

How to parse KEGG module (md:) flat files into Pandas DataFrame in Python? (i.e., parse text by section reading line-by-line only once)

我正在尝试为这种类型的文件编写解析器,但事实证明它比我想象的要复杂一些。如果这是几年前,我会多次阅读文件,但现在我考虑效率,这不是什么好事,因为必须有更好的方法。

如何在逐行读取文件的同时解析每个部分?

我的尝试是设置一个根据部分更改的变量。这开始起作用,但正如您所看到的,它在第二个分组(例如反应)中变得混乱。我觉得 while 循环可能是答案,但我不确定如何在这种情况下实现它。

from io import StringIO
f = StringIO(
"""
ENTRY       M00001            Pathway   Module
NAME        Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
DEFINITION  (K00844,K12407,K00845,K00886,K08074,K00918) (K01810,K06859,K13810,K15916) (K00850,K16370,K21071,K00918) (K01623,K01624,K11645,K16305,K16306) K01803 ((K00134,K00150) K00927,K11389) (K01834,K15633,K15634,K15635) K01689 (K00873,K12406)
ORTHOLOGY   K00844,K12407,K00845  hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786]
            K00886  polyphosphate glucokinase [EC:2.7.1.63] [RN:R02189]
            K08074,K00918  ADP-dependent glucokinase [EC:2.7.1.147] [RN:R09085]
            K01810,K06859,K13810,K15916  glucose-6-phosphate isomerase [EC:5.3.1.9] [RN:R02740]
            K00850,K16370,K21071  6-phosphofructokinase [EC:2.7.1.11] [RN:R04779]
            K00918  ADP-dependent phosphofructokinase [EC:2.7.1.146] [RN:R09084]
            K01623,K01624,K11645,K16305,K16306  fructose-bisphosphate aldolase [EC:4.1.2.13] [RN:R01070]
            K01803  triosephosphate isomerase [EC:5.3.1.1] [RN:R01015]
            K00134,K00150  glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12 1.2.1.59] [RN:R01061 R01063]
            K00927  phosphoglycerate kinase [EC:2.7.2.3] [RN:R01512]
            K11389  glyceraldehyde-3-phosphate dehydrogenase (ferredoxin) [EC:1.2.7.6] [RN:R07159]
            K01834,K15633,K15634,K15635  phosphoglycerate mutase [EC:5.4.2.11 5.4.2.12] [RN:R01518]
            K01689  enolase [EC:4.2.1.11] [RN:R00658]
            K00873,K12406  pyruvate kinase [EC:2.7.1.40] [RN:R00200]
CLASS       Pathway modules; Carbohydrate metabolism; Central carbohydrate metabolism
PATHWAY     map00010  Glycolysis / Gluconeogenesis
            map01200  Carbon metabolism
            map01100  Metabolic pathways
REACTION    R01786,R02189,R09085  C00267 -> C00668
            R02740  C00668 -> C05345
            R04779,R09084  C05345 -> C05378
            R01070  C05378 -> C00111 + C00118
            R01015  C00111 -> C00118
            R01061,R01063  C00118 -> C00236
            R01512  C00236 -> C00197
            R07159  C00118 -> C00197
            R01518  C00197 -> C00631
            R00658  C00631 -> C00074
            R00200  C00074 -> C00022
COMPOUND    C00267  alpha-D-Glucose
            C00668  alpha-D-Glucose 6-phosphate
            C05345  beta-D-Fructose 6-phosphate
            C05378  beta-D-Fructose 1,6-bisphosphate
            C00111  Glycerone phosphate
            C00118  D-Glyceraldehyde 3-phosphate
            C00236  3-Phospho-D-glyceroyl phosphate
            C00197  3-Phospho-D-glycerate
            C00631  2-Phospho-D-glycerate
            C00074  Phosphoenolpyruvate
            C00022  Pyruvate
///
"""
)


kegg_definition = None
kegg_orthology = list()
kegg_ortholog_set = set()
kegg_class = None
kegg_pathways = list()
kegg_pathway_set = set()
kegg_reactions = list()
kegg_reaction_set = set()
kegg_compounds = list()
kegg_compound_set = set()

# Read KEGG module text
parsing = None
for line_level_1 in f:
    line_level_1 = line_level_1.strip()
    if not line_level_1.startswith("/"):
        # Get orthologs
        if line_level_1.startswith("DEFINITION"):
            kegg_definition = line_level_1.replace("DEFINITION","").strip()
            kegg_ortholog_set = str(kegg_definition)
            for character in list("(+ -)"):
                kegg_ortholog_set.replace(character, ",")
            kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
            parsing = "ORTHOLOGY"
        if parsing == "ORTHOLOGY":
            ko_annot = line_level_1.replace("DEFINITION","").strip()
            kegg_orthology.append(ko_annot)
        # Get class
        if line_level_1.startswith("CLASS"):
    #         parsing = None
            kegg_class = line_level_1.replace("CLASS","").strip().split("; ")
        # Get pathways
        if line_level_1.startswith("PATHWAY"):
            parsing = "PATHWAY"
        if parsing == "PATHWAY":
            kegg_pathway = line_level_1.replace("PATHWAY","").strip().split("; ")
            kegg_pathways.append(kegg_pathway)
        # Get reactions
        if line_level_1.startswith("REACTION"):
            parsing = "REACTION"
        if parsing == "REACTION":
            kegg_reaction = line_level_1.replace("REACTION","").strip().split("; ")
            kegg_reactions.append(kegg_reaction)
        # Get compounds
        if line_level_1.startswith("COMPOUND"):
            parsing = "COMPOUND"
        if parsing == "COMPOUND":
            kegg_compound = line_level_1.replace("COMPOUND","").strip().split("; ")
            kegg_compounds.append(kegg_compound)
            
kegg_pathways
# [['map00010  Glycolysis / Gluconeogenesis'],
#  ['map01200  Carbon metabolism'],
#  ['map01100  Metabolic pathways'],
#  ['REACTION    R01786,R02189,R09085  C00267 -> C00668']]
        
        

这是因为直到您已经将线路收集到 PATHWAY 之后才检查 REACTION。做你所有的“下一节”吗?在开始使用“解析”值之前进行检查:

# Read KEGG module text
parsing = None
for line_level_1 in f:
    line_level_1 = line_level_1.strip()
    if not line_level_1.startswith("/"):
        if line_level_1.startswith("DEFINITION"):
            kegg_definition = line_level_1.replace("DEFINITION","").strip()
            kegg_ortholog_set = str(kegg_definition)
            for character in list("(+ -)"):
                kegg_ortholog_set.replace(character, ",")
            kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
        elif line_level_1.startswith("CLASS"):
            kegg_class = line_level_1.replace("CLASS","").strip().split("; ")
        elif line_level_1.startswith("ORTHOLOGY"):
            parsing = "ORTHOLOGY"
        elif line_level_1.startswith("PATHWAY"):
            parsing = "PATHWAY"
        elif line_level_1.startswith("REACTION"):
            parsing = "REACTION"
        elif line_level_1.startswith("COMPOUND"):
            parsing = "COMPOUND"
        
        if parsing == "ORTHOLOGY":
            ko_annot = line_level_1.replace("DEFINITION","").strip()
            kegg_orthology.append(ko_annot)
        elif parsing == "PATHWAY":
        # Get pathways
            kegg_pathway = line_level_1.replace("PATHWAY","").strip().split("; ")
            kegg_pathways.append(kegg_pathway)
        # Get reactions
        elif parsing == "REACTION":
            kegg_reaction = line_level_1.replace("REACTION","").strip().split("; ")
            kegg_reactions.append(kegg_reaction)
        # Get compounds
        elif parsing == "COMPOUND":
            kegg_compound = line_level_1.replace("COMPOUND","").strip().split("; ")
            kegg_compounds.append(kegg_compound)
            
print(kegg_pathways)

以防万一这对将来的任何人都有帮助。这是我制作的解析器。但是,如果您希望从一组直系同源物(例如宏基因组组装基因组或 bin)中计算模块完成率,那么您需要考虑 logicals in the KEGG definition and the fact that some modules are bifurcated...in which you should probably use some of the code in MicrobeAnnotator

import datetime
from Bio.KEGG.REST import kegg_list, kegg_get

# Parse KEGG Module
def parse_kegg_module(module_file):
    """
    
    Example of a KEGG REST module file:
    
    ENTRY       M00001            Pathway   Module
    NAME        Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
    DEFINITION  (K00844,K12407,K00845,K00886,K08074,K00918) (K01810,K06859,K13810,K15916) (K00850,K16370,K21071,K00918) (K01623,K01624,K11645,K16305,K16306) K01803 ((K00134,K00150) K00927,K11389) (K01834,K15633,K15634,K15635) K01689 (K00873,K12406)
    ORTHOLOGY   K00844,K12407,K00845  hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786]
                K00886  polyphosphate glucokinase [EC:2.7.1.63] [RN:R02189]
                K08074,K00918  ADP-dependent glucokinase [EC:2.7.1.147] [RN:R09085]
                K01810,K06859,K13810,K15916  glucose-6-phosphate isomerase [EC:5.3.1.9] [RN:R02740]
                K00850,K16370,K21071  6-phosphofructokinase [EC:2.7.1.11] [RN:R04779]
                K00918  ADP-dependent phosphofructokinase [EC:2.7.1.146] [RN:R09084]
                K01623,K01624,K11645,K16305,K16306  fructose-bisphosphate aldolase [EC:4.1.2.13] [RN:R01070]
                K01803  triosephosphate isomerase [EC:5.3.1.1] [RN:R01015]
                K00134,K00150  glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12 1.2.1.59] [RN:R01061 R01063]
                K00927  phosphoglycerate kinase [EC:2.7.2.3] [RN:R01512]
                K11389  glyceraldehyde-3-phosphate dehydrogenase (ferredoxin) [EC:1.2.7.6] [RN:R07159]
                K01834,K15633,K15634,K15635  phosphoglycerate mutase [EC:5.4.2.11 5.4.2.12] [RN:R01518]
                K01689  enolase [EC:4.2.1.11] [RN:R00658]
                K00873,K12406  pyruvate kinase [EC:2.7.1.40] [RN:R00200]
    CLASS       Pathway modules; Carbohydrate metabolism; Central carbohydrate metabolism
    PATHWAY     map00010  Glycolysis / Gluconeogenesis
                map01200  Carbon metabolism
                map01100  Metabolic pathways
    REACTION    R01786,R02189,R09085  C00267 -> C00668
                R02740  C00668 -> C05345
                R04779,R09084  C05345 -> C05378
                R01070  C05378 -> C00111 + C00118
                R01015  C00111 -> C00118
                R01061,R01063  C00118 -> C00236
                R01512  C00236 -> C00197
                R07159  C00118 -> C00197
                R01518  C00197 -> C00631
                R00658  C00631 -> C00074
                R00200  C00074 -> C00022
    COMPOUND    C00267  alpha-D-Glucose
                C00668  alpha-D-Glucose 6-phosphate
                C05345  beta-D-Fructose 6-phosphate
                C05378  beta-D-Fructose 1,6-bisphosphate
                C00111  Glycerone phosphate
                C00118  D-Glyceraldehyde 3-phosphate
                C00236  3-Phospho-D-glyceroyl phosphate
                C00197  3-Phospho-D-glycerate
                C00631  2-Phospho-D-glycerate
                C00074  Phosphoenolpyruvate
                C00022  Pyruvate
    ///
    """

    kegg_module = None
    kegg_name = None
    kegg_definition = None
    kegg_orthology = list()
    kegg_ortholog_set = set()
    kegg_classes = list()
    kegg_pathways = list()
    kegg_pathway_set = set()
    kegg_reactions = list()
    kegg_reaction_set = set()
    kegg_compounds = list()
    kegg_compound_set = set()

    # Read KEGG module text
    parsing = None
    for line in module_file:
        line = line.strip()
        if not line.startswith("/"):
            if not line.startswith(" "):
                first_word = line.split(" ")[0]
                if first_word.isupper() and first_word.isalpha():
                    parsing = first_word
            if parsing == "ENTRY":
                kegg_module = list(filter(bool, line.split(" ")))[1]
            if parsing == "NAME":
                kegg_name = line.replace(parsing, "").strip()
                parsing = None
            if parsing == "DEFINITION":
                kegg_definition = line.replace(parsing,"").strip()
                kegg_ortholog_set = str(kegg_definition)
                for character in list("(+ -)"):
                    kegg_ortholog_set = kegg_ortholog_set.replace(character, ",")
                kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
            if parsing == "ORTHOLOGY":
                kegg_orthology.append(line.replace(parsing,"").strip())
            if parsing == "CLASS":
                kegg_classes = line.replace(parsing,"").strip().split("; ")
            if parsing == "PATHWAY":
                kegg_pathway = line.replace(parsing,"").strip()
                kegg_pathways.append(kegg_pathway)
                id_pathway = kegg_pathway.split(" ")[0]
                kegg_pathway_set.add(id_pathway)
            if parsing == "REACTION":
                kegg_reaction = line.replace(parsing,"").strip()
                kegg_reactions.append(kegg_reaction)
                for id_reaction in kegg_reaction.split(" ")[0].split(","):
                    kegg_reaction_set.add(id_reaction)
            if parsing == "COMPOUND":
                kegg_compound = line.replace(parsing,"").strip()
                id_compound = kegg_compound.split(" ")[0]
                kegg_compounds.append(kegg_compound)
                kegg_compound_set.add(id_compound)

    module_info = pd.Series(
        data = OrderedDict([
            ("NAME",kegg_name),
            ("DEFINITION",kegg_definition),
            ("ORTHOLOGY",kegg_orthology),
            ("ORTHOLOGY_SET",kegg_ortholog_set),
            ("CLASS",kegg_classes),
            ("PATHWAY",kegg_pathways),
            ("PATHWAY_SET",kegg_pathway_set),
            ("REACTION",kegg_reactions),
            ("REACTION_SET",kegg_reaction_set),
            ("COMPOUND",kegg_compounds),
            ("COMPOUND_SET",kegg_compound_set),
        ]),
        name=kegg_module,
    )
            
    return module_info


def get_kegg_modules(expand_nested_modules=True):
    results = list()
    for line in list(kegg_list("module")):
        line = line.strip()
        module, name = line.split("\t")
        prefix, id_module = module.split(":")
        module_file = kegg_get(module)
        module_info = parse_kegg_module(module_file)
        results.append(module_info)
    df = pd.DataFrame(results)
    df.index.name = datetime.datetime.now().strftime("Accessed: %Y-%m-%d @ %H:%M")
    
    # Expand nested modules
    if expand_nested_modules:
        for id_module, row in df.iterrows():
            kegg_orthology_set = row["ORTHOLOGY_SET"]
            expanded = set()
            for x in kegg_orthology_set:
                if x.startswith("K"):
                    expanded.add(x)
                if x.startswith("M"):
                    for id_ko in df.loc[x,"ORTHOLOGY_SET"]:
                        expanded.add(id_ko)
            df.loc[id_module, "ORTHOLOGY_SET"] = expanded
    return df