Python: 减少大文件读取-存储-操作-写入程序中的 RAM 使用

Python: Reducing the RAM usage in a large file read-store-operate-write program

我这里有一个 Python 程序,可以将 DIMACS CNF 格式文件转换为 PLA 格式。我从文件中读取 CNF 子句并将它们存储在列表中,然后对列表元素进行操作。该程序适用于最多 15,000 行(子句)的较小文件,但是当我尝试在较大的文件上运行该程序时,系统会耗尽内存。我需要处理大约 90,000 到 120,000 行的文件。有人可以建议一些修改来优化内存使用吗?下面是我的程序:

import sys

#Reads a CNF file in one streaming pass and separates the problem line
#from the clause lines.
#Input: CNF file path
#Output: dictionary of params and clauses from cnf file:
#        'params'  -> the stripped line starting with "p" (key is absent if
#                     the file has no such line; the last one wins if several)
#        'clauses' -> list of the remaining stripped, non-blank lines
def readCNFFile(name):
    inputs_dict = {}
    clause_list = []
    with open(name, "r") as f:
        # Iterate the file object directly so only the current line is held,
        # instead of copying the whole file into intermediate lists first.
        for line in f:
            li = line.strip()
            if not li:
                # A blank line holds no clause; storing it would later yield
                # a spurious product row made only of "-".
                continue
            if li.startswith(("c", "C")):
                # Comment line: nothing to keep.
                continue
            if li.startswith("p"):
                inputs_dict['params'] = li
            else:
                clause_list.append(li)
    # The with-statement has already closed the file at this point.
    inputs_dict['clauses'] = clause_list
    return inputs_dict

#Converts raw clause strings into lists of integers, in place.
#Input: inputs_dict["clauses"] (list of whitespace-separated number strings)
#Output: list of clauses split into integers (the same list object that was
#        passed in, with every entry replaced)
def getClauses(clause_list):
    # Single pass: split and convert each clause immediately, so the token
    # strings of one clause are released before the next clause is handled
    # rather than being kept for every clause until a second pass.
    for position, raw_clause in enumerate(clause_list):
        clause_list[position] = [int(token) for token in raw_clause.split()]
    return clause_list

#Input: inputs_dict["params"] (the "p ..." line as one string)
#Output: number of inputs in PLA, i.e. the third whitespace-separated
#        field of that line converted to int
def getNumInputs(param_list):
    fields = param_list.split()
    return int(fields[2])

#Input: inputs_dict["params"] (the "p ..." line as one string)
#Output: number of products in PLA, i.e. the fourth whitespace-separated
#        field of that line converted to int
def getNumProducts(param_list):
    fields = param_list.split()
    return int(fields[3])

#Builds one PLA product row per clause.
#Inputs: 1. list of clauses split into integers
#        2. number of inputs in PLA
#Output: list of products for PLA file; each product is a list of
#        num_inputs one-character strings: "1" where the clause contains the
#        variable negated, "0" where it contains it positively, "-" otherwise
def getPLAlist(clause_list, num_inputs):
    temp_list_total = []
    for clause in clause_list:
        row = ["-"] * num_inputs
        for literal in clause:
            variable = abs(literal)
            # Index the column directly instead of scanning every input
            # number for a match. Literals outside 1..num_inputs (including
            # a 0 entry) are ignored, exactly as the scan never matched them.
            if 1 <= variable <= num_inputs:
                row[variable - 1] = "1" if literal < 0 else "0"
        temp_list_total.append(row)
    return temp_list_total

#Writes the PLA file next to the input file.
#Inputs: 1. input CNF file path
#        2. list of products for PLA file (any iterable of sequences of
#           one-character strings)
#        3. number of inputs in PLA
#        4. number of products in PLA
#Output: print PLA file named <input name up to its first "."> + "_pla2.pla"
def printPLAfile(inputFile, PLA_list, num_inputs, num_prod):
    import os  # local import so this function does not rely on file-level imports

    # Cut at the first "." of the file name only. Splitting the whole path
    # would truncate paths such as "./x.cnf" or "v1.0/x.cnf" at the
    # directory part.
    directory, filename = os.path.split(inputFile)
    outputfile = os.path.join(directory, filename.split(".")[0] + "_pla2.pla")
    with open(outputfile, "w") as file_out:
        file_out.write(".i " + str(num_inputs) + "\n.o 1\n.p " + str(num_prod) + "\n")
        for product in PLA_list:
            # One write per product row instead of one per character.
            file_out.write("".join(product) + " 1 \n")
        file_out.write(".e")

#Get .pla file from .cnf
#Runs the whole pipeline: read the CNF file, convert the clauses to integers,
#take both counts from the params line, build the products, write the file.
#input: CNF File path
def convert_CNF_2_PLA(name):
    parsed = readCNFFile(name)
    literals = getClauses(parsed['clauses'])
    header = parsed['params']
    input_count = getNumInputs(header)
    product_count = getNumProducts(header)
    products = getPLAlist(literals, input_count)
    printPLAfile(name, products, input_count, product_count)

if __name__ == '__main__':
    # Exit with a readable message instead of a bare IndexError when the
    # CNF path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("error: expected the path of a CNF file as the first argument")
    name = sys.argv[1]
    convert_CNF_2_PLA(name)

一个词:生成器(generator)。这里的基本思想是,每个函数只做获取下一行所需的工作,仅此而已,因此内存占用很小。这里肯定还可以做更多优化,但我对这个问题的理解还不够深入,手头也没有可用的测试数据。这段代码完全未经测试,但可能有效!

import sys

#Input: CNF file path
#Output: the first line of the file that starts with "p", stripped of
#        surrounding whitespace; None if the file has no such line
def readCNFParams(name):
    with open(name, "r") as f:
        for line in f:
            li = line.strip()
            # startswith is safe on an empty string, whereas li[0] raises
            # IndexError on a blank line.
            if li.startswith("p"):
                # Stop reading as soon as the line is found.
                return li
    return None

#Lazily reads the clause lines of a CNF file.
#Input: CNF file path
#Output: generator yielding each stripped line that is not blank and does
#        not start with "c", "C" or "p"; the file is read only as the
#        generator is consumed
def readCNFClauses(name):
    with open(name, "r") as f:
        for line in f:
            li = line.strip()
            # Test for the empty string first: a blank line has no first
            # character to inspect and carries no clause.
            if not li or li.startswith(("c", "C", "p")):
                continue
            yield li

#Input: iterable of clause strings (whitespace-separated numbers)
#Output: lazy iterator producing, for each clause string, the list of its
#        numbers converted to int; nothing is converted until it is consumed
def getClauses(clause_list):
    return ([int(token) for token in raw.split()] for raw in clause_list)

#Input: the "p ..." params line as one string
#Output: number of inputs in PLA, i.e. the third whitespace-separated
#        field of that line converted to int
def getNumInputs(param_list):
    return int(param_list.split()[2])

#Input: the "p ..." params line as one string
#Output: number of products in PLA, i.e. the fourth whitespace-separated
#        field of that line converted to int
def getNumProducts(param_list):
    return int(param_list.split()[3])

#Lazily builds one PLA product row per clause.
#Inputs: 1. iterable of clauses split into integers
#        2. number of inputs in PLA
#Output: generator of products for PLA file; each product is a list of
#        num_inputs one-character strings: "1" where the clause contains the
#        variable negated, "0" where it contains it positively, "-" otherwise
def getPLAlist(clause_list, num_inputs):
    for clause in clause_list:
        row = ["-"] * num_inputs
        for literal in clause:
            variable = abs(literal)
            # Index the column directly instead of scanning every input
            # number for a match. Literals outside 1..num_inputs (including
            # a 0 entry) are ignored, exactly as the scan never matched them.
            if 1 <= variable <= num_inputs:
                row[variable - 1] = "1" if literal < 0 else "0"
        # Hand over one row at a time so the rows are never all in memory.
        yield row

#Writes the PLA file next to the input file.
#Inputs: 1. input CNF file path
#        2. products for PLA file (any iterable, including a generator, of
#           sequences of one-character strings)
#        3. number of inputs in PLA
#        4. number of products in PLA
#Output: print PLA file named <input name up to its first "."> + "_pla2.pla"
def printPLAfile(inputFile, PLA_list, num_inputs, num_prod):
    import os  # local import so this function does not rely on file-level imports

    # Cut at the first "." of the file name only. Splitting the whole path
    # would truncate paths such as "./x.cnf" or "v1.0/x.cnf" at the
    # directory part.
    directory, filename = os.path.split(inputFile)
    outputfile = os.path.join(directory, filename.split(".")[0] + "_pla2.pla")
    with open(outputfile, "w") as file_out:
        file_out.write(".i " + str(num_inputs) + "\n.o 1\n.p " + str(num_prod) + "\n")
        for product in PLA_list:
            # One write per product row instead of one per character.
            file_out.write("".join(product) + " 1 \n")
        file_out.write(".e")

#Get .pla file from .cnf
#Chains the readers and converters and writes the PLA file. The file is
#opened twice: once by readCNFParams for the params line and once by
#readCNFClauses for the clause lines.
#input: CNF File path
def convert_CNF_2_PLA(name):
    clauses = readCNFClauses(name)
    clause_list = getClauses(clauses)
    params = readCNFParams(name)
    num_vars = getNumInputs(params)
    # Both counts come from the same params line; there is no `inputs`
    # dictionary in this version of the script.
    num_clause = getNumProducts(params)
    PLA_list = getPLAlist(clause_list, num_vars)
    printPLAfile(name, PLA_list, num_vars, num_clause)

if __name__ == '__main__':
    # Exit with a readable message instead of a bare IndexError when the
    # CNF path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("error: expected the path of a CNF file as the first argument")
    name = sys.argv[1]
    convert_CNF_2_PLA(name)