全局变量减慢 PyParsing

Global variables slowing down PyParsing

使用 Pyparsing 解析文件时,解析 Ex1 大约需要一分钟,解析 Ex2 大约需要 15 秒。唯一的区别是,在 Ex1 中,PyParsing 使用的变量在 class 之前声明为全局变量。在 Ex2 中,变量在每个方法中单独声明(例如 def parse_components(self))。这是预期的吗?如果是,解释是什么?

Ex1(解析时间:~60s):

import modules
import pyparsing as pp
from collections import defaultdict
from multiprocessing import (Process, Manager, Event)

# Module-level pyparsing building blocks referenced by the grammars below.
EOL = pp.LineEnd().suppress()
# A statement terminator: ';' immediately followed by the end of the line.
linebreak = pp.Suppress(";" + pp.LineEnd())
# The backslash is written as '\\' because '\]' is not a valid escape
# sequence in a normal string literal; the resulting character set is the
# same (it still contains both '\' and ']').
# NOTE(review): the curly quotes in this set look like a copy/paste artifact
# of straight quotes -- confirm against the original grammar.
identifier = pp.Word(pp.alphanums + '._“!<>/[]$#$%&‘*+,/:<=>?@[\\]^_`{|}~')
number = pp.pyparsing_common.number
word = pp.Word(pp.alphas)
LPAR = pp.Suppress('(')
RPAR = pp.Suppress(')')
# One of the eight orientation keywords.
ORIENT = (pp.Keyword('N')
        | pp.Keyword('S')
        | pp.Keyword('E')
        | pp.Keyword('W')
        | pp.Keyword('FN')
        | pp.Keyword('FS')
        | pp.Keyword('FE')
        | pp.Keyword('FW'))
# Parenthesised run of numbers and/or '*' placeholders.
pt = LPAR + pp.OneOrMore(number | pp.Keyword('*')) + RPAR  # pair of x,y

class DEF():
    """Parse sections of .DEF files with pyparsing, one process per group.

    Every ``parse_<section>`` method returns a pyparsing expression for one
    section. ``parse_all`` starts one ``multiprocessing.Process`` per entry
    of ``sections_grp`` and then calls the matching ``handle_<section>``
    methods with the collected results.
    """

    def __init__(self, Base):
        """Initialise parsing options and the section groups.

        Args:
            Base: not referenced in the visible part of this constructor
                (part of the body is elided).
        """
        # Factory for arbitrarily nested defaultdicts.
        self.mydict = lambda: defaultdict(self.mydict)
        ...
        ...
        self.ignore_nets = True
        self.ignore_nets_route = False
        # Each list is a new process. Careful with dependencies.
        # eg. 'dbuPerMicron' must be executed before the others, but it can be after 'design'
        self.sections_grp = [['design', 'dbuPerMicron', 'diearea'],
                             ['components'],
                            ]
        if not self.ignore_nets:
            self.sections_grp.append(['nets'])
        # Total number of sections across all groups.
        self.n_elems_sections_grp = sum(len(x) for x in self.sections_grp)
        self.counter = modules.SharedCounter(0)
        self.events = [Event()]
        self.design = ''
        modules.debug_log.print_repr([self.__repr__()])

    def run(self):
        """Read every file in ``self.def_file_design`` and parse its text."""
        for curr_file in self.def_file_design:
            # 'with' guarantees the handle is closed even if read() raises.
            with open(curr_file, 'r') as ifile:
                file_string = ifile.read()
            self.parse_all(file_string)

    # Create a process for each section to parse
    def parse_all(self, file_string):
        """Parse ``file_string`` using one worker process per section group.

        Args:
            file_string: full text of the file to parse.
        """
        manager = Manager()
        # Proxy dict the workers write their parse results into.
        shared_dict = manager.dict()
        jobs = []
        for sections in self.sections_grp:
            p = Process(target=self.parse_sections, args=(sections, file_string, shared_dict))
            jobs.append(p)
            p.start()

        # Wait for the workers to finish
        for job in jobs:
            job.join()

        # Dispatch the collected results to the per-section handlers.
        for sections in self.sections_grp:
            for section in sections:
                getattr(self, 'handle_' + section)(shared_dict)


    # Spawn the processes from each group of self.sections_grp
    def parse_sections(self, sections, file_string, shared_dict):
        """Scan ``file_string`` for each section and store the first match.

        Args:
            sections: section names; each maps to a ``parse_<name>`` method.
            file_string: full text of the file to scan.
            shared_dict: mapping updated with the first match's ``asDict()``.
        """
        for section in sections:
            to_parse = getattr(self, 'parse_' + section)
            for t, s, e in to_parse().scanString(file_string):
                shared_dict.update(t.asDict())
                # Only the first occurrence of a section is used.
                break

    # Parse the DESIGN section of a .DEF file
    def parse_design(self):
        """Return the expression for the DESIGN section (body elided)."""
        ...
        return design

    # Parse the UNITS DISTANCE MICRONS section of a .DEF file
    def parse_dbuPerMicron(self):
        """Return the expression for UNITS DISTANCE MICRONS (body elided)."""
        ...
        return dbuPerMicron

    # Parse the DIEAREA section of a .DEF file
    def parse_diearea(self):
        """Return the expression for the DIEAREA section (body elided)."""
        ...
        return diearea

    # Parse the COMPONENTS section of a .DEF file
    def parse_components(self):
        """Return the expression for the COMPONENTS section.

        Blocks on ``self.events[0]`` before building the grammar, and uses
        the module-level ``identifier``, ``number`` and ``linebreak``
        expressions.
        """
        # NOTE(review): nothing visible here sets this event -- confirm that
        # another section's worker does, otherwise this waits forever.
        self.events[0].wait()  # Wait for event[0] to finish
        components_id = pp.Keyword('COMPONENTS')
        end_components_id = pp.Keyword("END COMPONENTS").suppress()
        begin_comp = pp.Suppress(pp.Keyword('-'))
        ws_comp = pp.Suppress(pp.Keyword('+'))  # parameter division in components

        # compName
        compName = (identifier('comp_name') + identifier('cell')
                   ).setResultsName('compName')

        ...
        ...
        ...

        # NOTE(review): the original comment on SOURCE said "& because it can
        # be in any order", but '+' is used, which enforces the order listed
        # here -- confirm the intent.
        subcomponent = pp.Group(begin_comp
                                + compName
                                + pp.Optional(EEQMASTER)
                                + pp.Optional(SOURCE)
                                + pp.Optional(PLACEMENT)
                                + pp.Optional(MASKSHIFT)
                                + pp.Optional(HALO)
                                + pp.Optional(ROUTEHALO)
                                + pp.Optional(WEIGHT)
                                + pp.Optional(REGION)
                                + pp.ZeroOrMore(PROPERTY)
                                + pp.Suppress(linebreak)
                               ).setResultsName('subcomponents', listAllMatches=True)

        components = pp.Group(pp.Suppress(components_id)
                                          + number('numComps')
                                          + pp.Suppress(linebreak)
                                          + pp.OneOrMore(subcomponent)
                                          + pp.Suppress(end_components_id)
                             ).setResultsName('COMPONENTS')

        return components

Ex2(解析时间:~15s):

import modules
import pyparsing as pp
from collections import defaultdict
from multiprocessing import (Process, Manager, Event)

class DEF():
    """Parse sections of .DEF files with pyparsing, one process per group.

    Every ``parse_<section>`` method returns a pyparsing expression for one
    section. ``parse_all`` starts one ``multiprocessing.Process`` per entry
    of ``sections_grp`` and then calls the matching ``handle_<section>``
    methods with the collected results.
    """

    def __init__(self, Base):
        """Initialise parsing options and the section groups.

        Args:
            Base: not referenced in the visible part of this constructor
                (part of the body is elided).
        """
        # Factory for arbitrarily nested defaultdicts.
        self.mydict = lambda: defaultdict(self.mydict)
        ...
        ...
        self.ignore_nets = True
        self.ignore_nets_route = False
        # Each list is a new process. Careful with dependencies.
        # eg. 'dbuPerMicron' must be executed before the others, but it can be after 'design'
        self.sections_grp = [['design', 'dbuPerMicron', 'diearea'],
                             ['components'],
                            ]
        if not self.ignore_nets:
            self.sections_grp.append(['nets'])
        # Total number of sections across all groups.
        self.n_elems_sections_grp = sum(len(x) for x in self.sections_grp)
        self.counter = modules.SharedCounter(0)
        self.events = [Event()]
        self.design = ''
        modules.debug_log.print_repr([self.__repr__()])

    def run(self):
        """Read every file in ``self.def_file_design`` and parse its text."""
        for curr_file in self.def_file_design:
            # 'with' guarantees the handle is closed even if read() raises.
            with open(curr_file, 'r') as ifile:
                file_string = ifile.read()
            self.parse_all(file_string)

    # Create a process for each section to parse
    def parse_all(self, file_string):
        """Parse ``file_string`` using one worker process per section group.

        Args:
            file_string: full text of the file to parse.
        """
        manager = Manager()
        # Proxy dict the workers write their parse results into.
        shared_dict = manager.dict()
        jobs = []
        for sections in self.sections_grp:
            p = Process(target=self.parse_sections, args=(sections, file_string, shared_dict))
            jobs.append(p)
            p.start()

        # Wait for the workers to finish
        for job in jobs:
            job.join()

        # Dispatch the collected results to the per-section handlers.
        for sections in self.sections_grp:
            for section in sections:
                getattr(self, 'handle_' + section)(shared_dict)


    # Spawn the processes from each group of self.sections_grp
    def parse_sections(self, sections, file_string, shared_dict):
        """Scan ``file_string`` for each section and store the first match.

        Args:
            sections: section names; each maps to a ``parse_<name>`` method.
            file_string: full text of the file to scan.
            shared_dict: mapping updated with the first match's ``asDict()``.
        """
        for section in sections:
            to_parse = getattr(self, 'parse_' + section)
            for t, s, e in to_parse().scanString(file_string):
                shared_dict.update(t.asDict())
                # Only the first occurrence of a section is used.
                break


    # Parse the DESIGN section of a .DEF file
    def parse_design(self):
        """Return the expression for the DESIGN section (body elided)."""
        ...
        return design

    # Parse the UNITS DISTANCE MICRONS section of a .DEF file
    def parse_dbuPerMicron(self):
        """Return the expression for UNITS DISTANCE MICRONS (body elided)."""
        ...
        return dbuPerMicron

    # Parse the DIEAREA section of a .DEF file
    def parse_diearea(self):
        """Return the expression for the DIEAREA section (body elided)."""
        ...
        return diearea

    # Parse the COMPONENTS section of a .DEF file
    def parse_components(self):
        """Return the expression for the COMPONENTS section.

        Blocks on ``self.events[0]`` and then builds every pyparsing
        element locally, so each call creates fresh expression objects.
        """
        # NOTE(review): nothing visible here sets this event -- confirm that
        # another section's worker does, otherwise this waits forever.
        self.events[0].wait()  # Wait for event[0] to finish

        EOL = pp.LineEnd().suppress()
        # A statement terminator: ';' immediately followed by end of line.
        linebreak = pp.Suppress(";" + pp.LineEnd())
        # The backslash is written as '\\' because '\]' is not a valid escape
        # sequence in a normal string literal; the character set is unchanged.
        # NOTE(review): the curly quotes in this set look like a copy/paste
        # artifact of straight quotes -- confirm against the original grammar.
        identifier = pp.Word(pp.alphanums + '._“!<>/[]$#$%&‘*+,/:<=>?@[\\]^_`{|}~')
        number = pp.pyparsing_common.number
        word = pp.Word(pp.alphas)
        LPAR = pp.Suppress('(')
        RPAR = pp.Suppress(')')
        # One of the eight orientation keywords.
        ORIENT = (pp.Keyword('N')
                | pp.Keyword('S')
                | pp.Keyword('E')
                | pp.Keyword('W')
                | pp.Keyword('FN')
                | pp.Keyword('FS')
                | pp.Keyword('FE')
                | pp.Keyword('FW'))
        pt = LPAR + pp.OneOrMore(number | pp.Keyword('*')) + RPAR  # pair of x,y

        components_id = pp.Keyword('COMPONENTS')
        end_components_id = pp.Keyword("END COMPONENTS").suppress()
        begin_comp = pp.Suppress(pp.Keyword('-'))
        ws_comp = pp.Suppress(pp.Keyword('+'))  # parameter division in components

        # compName
        compName = (identifier('comp_name') + identifier('cell')
                   ).setResultsName('compName')

        ...
        ...
        ...

        # NOTE(review): the original comment on SOURCE said "& because it can
        # be in any order", but '+' is used, which enforces the order listed
        # here -- confirm the intent.
        subcomponent = pp.Group(begin_comp
                                + compName
                                + pp.Optional(EEQMASTER)
                                + pp.Optional(SOURCE)
                                + pp.Optional(PLACEMENT)
                                + pp.Optional(MASKSHIFT)
                                + pp.Optional(HALO)
                                + pp.Optional(ROUTEHALO)
                                + pp.Optional(WEIGHT)
                                + pp.Optional(REGION)
                                + pp.ZeroOrMore(PROPERTY)
                                + pp.Suppress(linebreak)
                               ).setResultsName('subcomponents', listAllMatches=True)

        components = pp.Group(pp.Suppress(components_id)
                                          + number('numComps')
                                          + pp.Suppress(linebreak)
                                          + pp.OneOrMore(subcomponent)
                                          + pp.Suppress(end_components_id)
                             ).setResultsName('COMPONENTS')

        return components

罪魁祸首很可能就明摆在眼前:

... in Ex1 the variables used by PyParsing are declared as globals before the class.

from multiprocessing import (Process, Manager, Event)

Multiprocessing 可能会以某种奇怪的方式重新加载这些全局变量,或与它们产生交互。在您的计时测试中,DEF.sections_grp 是否始终只包含一个列表(即只有 1 个进程)?