Python PostgreSQL 查询脚本中的内存错误
Memory error in Python script for PostgreSQL queries
我用于更新 PostgreSQL 数据库表中某一列的 Python 脚本因内存错误而中断。该脚本从大约 28 GB 的 XML 文件 Posts 中读取数据,并尝试更新数据库 forum 中 posts 表的 parentid 列。
错误的示例如下:
Traceback (most recent call last):
File "dbUpdate.py", line 43, in <module>
update_table('C:\dataset\Posts')
File "dbUpdate.py", line 23, in update_table
for event, elem in iterparse(xml_file):
File "C:\Python34\lib\xml\etree\ElementTree.py", line 1304, in __next__
self._parser.feed(data)
File "C:\Python34\lib\xml\etree\ElementTree.py", line 1235, in feed
self._parser.feed(data)
接下来是Python脚本:
import psycopg2
import gc
import sys
import os
from xml.etree.cElementTree import iterparse
import logging
def update_table(file,
                 dump_path=r'C:\dataset',
                 update_query='UPDATE posts SET parentid={parentIdValue} WHERE id={idValue};',
                 log_filename='parser.log'):
    """Stream an XML dump and write each row's ParentId into the database.

    The file ``<dump_path>/<file>.xml`` is parsed incrementally.  For every
    ``<row>`` element that carries both an ``Id`` and a ``ParentId``
    attribute, ``update_query`` is formatted with those two values, executed
    on the module-level ``conn`` connection and committed.

    Args:
        file: Name of the dump without the ``.xml`` extension.  It is joined
            onto ``dump_path`` with ``os.path.join``.
        dump_path: Directory holding the dump; the log file is written here.
        update_query: ``str.format`` template with the ``parentIdValue`` and
            ``idValue`` placeholders.
        log_filename: Name of the log file created inside ``dump_path``.
    """
    logging.basicConfig(filename=os.path.join(dump_path, log_filename), level=logging.INFO)
    with open(os.path.join(dump_path, file + '.xml'), encoding='utf8') as xml_file:
        cur = conn.cursor()
        # Ask for 'start' events as well so the document element can be
        # captured before any row has been read.
        context = iterparse(xml_file, events=('start', 'end'))
        _, root = next(context)
        for event, elem in context:
            if event != 'end' or elem.tag != "row":
                continue
            logging.debug(elem.attrib.keys())
            parent_id = elem.attrib.get('ParentId')
            row_id = elem.attrib.get('Id')
            if parent_id is not None and row_id is not None:
                # NOTE(review): the values are interpolated straight into the
                # SQL text; they come from the dump file, so the dump must be
                # trusted.
                cur.execute(update_query.format(parentIdValue=parent_id, idValue=row_id))
                conn.commit()
            # Release the row whether or not it was used.  Clearing the root
            # as well drops its reference to the finished row, so memory stays
            # flat instead of growing with the size of the file.
            elem.clear()
            root.clear()
        conn.commit()
if __name__ == '__main__':
    # Script entry point: open the connection used by update_table() through
    # the module-level name ``conn``, run the update, then close it.
    conn = psycopg2.connect(database="forum", user="postgres", password="password", port="5432")
    print ("Opened database successfully")
    try:
        # Raw string: the backslashes are path separators, not escapes.
        update_table(r'C:\dataset\Posts')
    finally:
        # Close the connection even when the update fails part-way.
        conn.close()
就 Python 中的内存处理而言,我肯定遗漏了什么。任何帮助都不胜感激。
我的 xml 文件的片段如下:
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="4" PostTypeId="1" AcceptedAnswerId="7" CreationDate="2008-07-31T21:42:52.667" Score="305" ViewCount="20324" Body="<p>I want to use a track-bar to change a form's opacity.</p>

<p>This is my code:</p>

<pre><code>decimal trans = trackBar1.Value / 5000;
this.Opacity = trans;
</code></pre>

<p>When I try to build it, I get this error:</p>

<blockquote>
 <p>Cannot implicitly convert type 'decimal' to 'double'.</p>
</blockquote>

<p>I tried making <code>trans</code> a <code>double</code>, but then the control doesn't work. This code has worked fine for me in VB.NET in the past. </p>
" OwnerUserId="8" LastEditorUserId="451518" LastEditorDisplayName="Rich B" LastEditDate="2014-07-28T10:02:50.557" LastActivityDate="2014-07-28T10:02:50.557" Title="When setting a form's opacity should I use a decimal or double?" Tags="<c#><winforms><type-conversion><opacity>" AnswerCount="13" CommentCount="1" FavoriteCount="28" CommunityOwnedDate="2012-10-31T16:42:47.213" />
<row Id="7" PostTypeId="2" ParentId="4" CreationDate="2008-07-31T22:17:57.883" Score="234" Body="<p>An explicit cast to double isn't necessary.</p>

<pre><code>double trans = (double)trackBar1.Value / 5000.0;
</code></pre>

<p>Identifying the constant as <code>5000.0</code> (or as <code>5000d</code>) is sufficient:</p>

<pre><code>double trans = trackBar1.Value / 5000.0;
double trans = trackBar1.Value / 5000d;
</code></pre>
" OwnerUserId="9" LastEditorUserId="967315" LastEditDate="2012-10-14T11:50:16.703" LastActivityDate="2012-10-14T11:50:16.703" CommentCount="0" />
如评论中所述,考虑在定义函数的循环中使用 lxml 库中的 xpath:
import lxml.etree as et # TO REPLACE: from xml.etree.cElementTree import iterparse
...
# Fragment meant to replace the parsing loop inside update_table(); it relies
# on that function's dump_path, file and update_query and on the global conn.
with open(os.path.join(dump_path, file + '.xml'), encoding='utf8') as xml_file:
    cur = conn.cursor()
    # NOTE(review): et.parse() builds the whole document tree before any row
    # is processed - confirm this fits in memory for the real dump.
    tree = et.parse(xml_file)
    # Select only rows that have BOTH attributes and read them from the same
    # element.  Pulling //row/@ParentId and //row/@Id into two separate lists
    # and zipping them misaligns the pairs as soon as one row has no ParentId.
    for row in tree.xpath("//row[@ParentId and @Id]"):
        query = update_query.format(parentIdValue=row.get('ParentId'), idValue=row.get('Id'))
        cur.execute(query)
        conn.commit()
将大文件分成小块:
import itertools

# Remove pieces left over from an earlier run, because the pieces below are
# written in append mode.
for ftimes in range(1, 6):
    fname = os.path.join(dump_path, 'Posts{0}.xml'.format(ftimes))
    if os.path.isfile(fname):
        os.remove(fname)

# Copy the first lines of the large file into Posts1.xml ... Posts5.xml,
# 100 lines per piece.
with open(os.path.join(dump_path, 'Posts.xml')) as f:  # OPENING LARGE FILE
    for ftimes in range(1, 6):  # NUMBER OF SMALLER FILES (HERE 5)
        fname = os.path.join(dump_path, 'Posts{0}.xml'.format(ftimes))
        # islice() takes exactly 100 lines.  zip(f, range(1, 101)) read a
        # 101st line from f and threw it away each time the range ran out,
        # silently dropping one line at every piece boundary.
        chunk = list(itertools.islice(f, 100))  # NUMBER OF LINES PER FILE (HERE 100)
        if not chunk:
            # Source exhausted: no further pieces are created.
            break
        with open(fname, 'a', newline='') as w:
            w.writelines(chunk)
# PASS EACH POST1.XML, POST2.XML, POST3.XML... IN DEFINED FUNCTION update_table()
XSLT:
<!-- Reduces a <posts> document to the Id and ParentId of every row. -->
<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:strip-space elements="*" />
<!-- Rebuild the <posts> root with one slimmed-down <row> per input row. -->
<xsl:template match="posts">
<posts>
<xsl:for-each select="row">
<row>
<!-- The two attributes become child elements; a row without a ParentId
     attribute produces an empty ParentId element. -->
<Id><xsl:value-of select="@Id"/></Id>
<ParentId><xsl:value-of select="@ParentId"/></ParentId>
</row>
</xsl:for-each>
</posts>
</xsl:template>
</xsl:transform>
Python
import lxml.etree as ET
...
# Apply Posts.xsl to Posts.xml and save the reduced document as Posts_py.xml.
# ``cd`` is the working directory holding the three files; it is defined in
# the part of the script elided above.
dom = ET.parse(os.path.join(cd, 'Posts.xml'))
xslt = ET.parse(os.path.join(cd, 'Posts.xsl'))
transform = ET.XSLT(xslt)
newdom = transform(dom)
tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
# The context manager closes the file even if the write fails.
with open(os.path.join(cd, 'Posts_py.xml'), 'wb') as xmlfile:  # SMALLER FILE TO WORK WITH
    xmlfile.write(tree_out)
输出
<?xml version='1.0' encoding='UTF-8'?>
<posts>
<row>
<Id>4</Id>
<ParentId/>
</row>
<row>
<Id>7</Id>
<ParentId>4</ParentId>
</row>
</posts>
或(不使用 pretty print):
<?xml version='1.0' encoding='UTF-8'?>
<posts><row><Id>4</Id><ParentId/></row><row><Id>7</Id><ParentId>4</ParentId></row></posts>
我用于更新 PostgreSQL 数据库表中某一列的 Python 脚本因内存错误而中断。该脚本从大约 28 GB 的 XML 文件 Posts 中读取数据,并尝试更新数据库 forum 中 posts 表的 parentid 列。
错误的示例如下:
Traceback (most recent call last):
File "dbUpdate.py", line 43, in <module>
update_table('C:\dataset\Posts')
File "dbUpdate.py", line 23, in update_table
for event, elem in iterparse(xml_file):
File "C:\Python34\lib\xml\etree\ElementTree.py", line 1304, in __next__
self._parser.feed(data)
File "C:\Python34\lib\xml\etree\ElementTree.py", line 1235, in feed
self._parser.feed(data)
接下来是Python脚本:
import psycopg2
import gc
import sys
import os
from xml.etree.cElementTree import iterparse
import logging
def update_table(file,
                 dump_path=r'C:\dataset',
                 update_query='UPDATE posts SET parentid={parentIdValue} WHERE id={idValue};',
                 log_filename='parser.log'):
    """Stream an XML dump and write each row's ParentId into the database.

    The file ``<dump_path>/<file>.xml`` is parsed incrementally.  For every
    ``<row>`` element that carries both an ``Id`` and a ``ParentId``
    attribute, ``update_query`` is formatted with those two values, executed
    on the module-level ``conn`` connection and committed.

    Args:
        file: Name of the dump without the ``.xml`` extension.  It is joined
            onto ``dump_path`` with ``os.path.join``.
        dump_path: Directory holding the dump; the log file is written here.
        update_query: ``str.format`` template with the ``parentIdValue`` and
            ``idValue`` placeholders.
        log_filename: Name of the log file created inside ``dump_path``.
    """
    logging.basicConfig(filename=os.path.join(dump_path, log_filename), level=logging.INFO)
    with open(os.path.join(dump_path, file + '.xml'), encoding='utf8') as xml_file:
        cur = conn.cursor()
        # Ask for 'start' events as well so the document element can be
        # captured before any row has been read.
        context = iterparse(xml_file, events=('start', 'end'))
        _, root = next(context)
        for event, elem in context:
            if event != 'end' or elem.tag != "row":
                continue
            logging.debug(elem.attrib.keys())
            parent_id = elem.attrib.get('ParentId')
            row_id = elem.attrib.get('Id')
            if parent_id is not None and row_id is not None:
                # NOTE(review): the values are interpolated straight into the
                # SQL text; they come from the dump file, so the dump must be
                # trusted.
                cur.execute(update_query.format(parentIdValue=parent_id, idValue=row_id))
                conn.commit()
            # Release the row whether or not it was used.  Clearing the root
            # as well drops its reference to the finished row, so memory stays
            # flat instead of growing with the size of the file.
            elem.clear()
            root.clear()
        conn.commit()
if __name__ == '__main__':
    # Script entry point: open the connection used by update_table() through
    # the module-level name ``conn``, run the update, then close it.
    conn = psycopg2.connect(database="forum", user="postgres", password="password", port="5432")
    print ("Opened database successfully")
    try:
        # Raw string: the backslashes are path separators, not escapes.
        update_table(r'C:\dataset\Posts')
    finally:
        # Close the connection even when the update fails part-way.
        conn.close()
就 Python 中的内存处理而言,我肯定遗漏了什么。任何帮助都不胜感激。
我的 xml 文件的片段如下:
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="4" PostTypeId="1" AcceptedAnswerId="7" CreationDate="2008-07-31T21:42:52.667" Score="305" ViewCount="20324" Body="<p>I want to use a track-bar to change a form's opacity.</p>

<p>This is my code:</p>

<pre><code>decimal trans = trackBar1.Value / 5000;
this.Opacity = trans;
</code></pre>

<p>When I try to build it, I get this error:</p>

<blockquote>
 <p>Cannot implicitly convert type 'decimal' to 'double'.</p>
</blockquote>

<p>I tried making <code>trans</code> a <code>double</code>, but then the control doesn't work. This code has worked fine for me in VB.NET in the past. </p>
" OwnerUserId="8" LastEditorUserId="451518" LastEditorDisplayName="Rich B" LastEditDate="2014-07-28T10:02:50.557" LastActivityDate="2014-07-28T10:02:50.557" Title="When setting a form's opacity should I use a decimal or double?" Tags="<c#><winforms><type-conversion><opacity>" AnswerCount="13" CommentCount="1" FavoriteCount="28" CommunityOwnedDate="2012-10-31T16:42:47.213" />
<row Id="7" PostTypeId="2" ParentId="4" CreationDate="2008-07-31T22:17:57.883" Score="234" Body="<p>An explicit cast to double isn't necessary.</p>

<pre><code>double trans = (double)trackBar1.Value / 5000.0;
</code></pre>

<p>Identifying the constant as <code>5000.0</code> (or as <code>5000d</code>) is sufficient:</p>

<pre><code>double trans = trackBar1.Value / 5000.0;
double trans = trackBar1.Value / 5000d;
</code></pre>
" OwnerUserId="9" LastEditorUserId="967315" LastEditDate="2012-10-14T11:50:16.703" LastActivityDate="2012-10-14T11:50:16.703" CommentCount="0" />
如评论中所述,考虑在定义函数的循环中使用 lxml 库中的 xpath:
import lxml.etree as et # TO REPLACE: from xml.etree.cElementTree import iterparse
...
# Fragment meant to replace the parsing loop inside update_table(); it relies
# on that function's dump_path, file and update_query and on the global conn.
with open(os.path.join(dump_path, file + '.xml'), encoding='utf8') as xml_file:
    cur = conn.cursor()
    # NOTE(review): et.parse() builds the whole document tree before any row
    # is processed - confirm this fits in memory for the real dump.
    tree = et.parse(xml_file)
    # Select only rows that have BOTH attributes and read them from the same
    # element.  Pulling //row/@ParentId and //row/@Id into two separate lists
    # and zipping them misaligns the pairs as soon as one row has no ParentId.
    for row in tree.xpath("//row[@ParentId and @Id]"):
        query = update_query.format(parentIdValue=row.get('ParentId'), idValue=row.get('Id'))
        cur.execute(query)
        conn.commit()
将大文件分成小块:
import itertools

# Remove pieces left over from an earlier run, because the pieces below are
# written in append mode.
for ftimes in range(1, 6):
    fname = os.path.join(dump_path, 'Posts{0}.xml'.format(ftimes))
    if os.path.isfile(fname):
        os.remove(fname)

# Copy the first lines of the large file into Posts1.xml ... Posts5.xml,
# 100 lines per piece.
with open(os.path.join(dump_path, 'Posts.xml')) as f:  # OPENING LARGE FILE
    for ftimes in range(1, 6):  # NUMBER OF SMALLER FILES (HERE 5)
        fname = os.path.join(dump_path, 'Posts{0}.xml'.format(ftimes))
        # islice() takes exactly 100 lines.  zip(f, range(1, 101)) read a
        # 101st line from f and threw it away each time the range ran out,
        # silently dropping one line at every piece boundary.
        chunk = list(itertools.islice(f, 100))  # NUMBER OF LINES PER FILE (HERE 100)
        if not chunk:
            # Source exhausted: no further pieces are created.
            break
        with open(fname, 'a', newline='') as w:
            w.writelines(chunk)
# PASS EACH POST1.XML, POST2.XML, POST3.XML... IN DEFINED FUNCTION update_table()
XSLT:
<!-- Reduces a <posts> document to the Id and ParentId of every row. -->
<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:strip-space elements="*" />
<!-- Rebuild the <posts> root with one slimmed-down <row> per input row. -->
<xsl:template match="posts">
<posts>
<xsl:for-each select="row">
<row>
<!-- The two attributes become child elements; a row without a ParentId
     attribute produces an empty ParentId element. -->
<Id><xsl:value-of select="@Id"/></Id>
<ParentId><xsl:value-of select="@ParentId"/></ParentId>
</row>
</xsl:for-each>
</posts>
</xsl:template>
</xsl:transform>
Python
import lxml.etree as ET
...
# Apply Posts.xsl to Posts.xml and save the reduced document as Posts_py.xml.
# ``cd`` is the working directory holding the three files; it is defined in
# the part of the script elided above.
dom = ET.parse(os.path.join(cd, 'Posts.xml'))
xslt = ET.parse(os.path.join(cd, 'Posts.xsl'))
transform = ET.XSLT(xslt)
newdom = transform(dom)
tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
# The context manager closes the file even if the write fails.
with open(os.path.join(cd, 'Posts_py.xml'), 'wb') as xmlfile:  # SMALLER FILE TO WORK WITH
    xmlfile.write(tree_out)
输出
<?xml version='1.0' encoding='UTF-8'?>
<posts>
<row>
<Id>4</Id>
<ParentId/>
</row>
<row>
<Id>7</Id>
<ParentId>4</ParentId>
</row>
</posts>
或(不使用 pretty print):
<?xml version='1.0' encoding='UTF-8'?>
<posts><row><Id>4</Id><ParentId/></row><row><Id>7</Id><ParentId>4</ParentId></row></posts>