Python MemoryError - large loop, XML to MongoDB
I downloaded a zip file from https://clinicaltrials.gov/AllPublicXML.zip, containing over 200k XML files (most under 10 kB each), into a directory (see 'dirpath_zip' in the code) that I created on Ubuntu 16.04 (on DigitalOcean). What I am trying to accomplish is loading all of these into MongoDB (installed in the same location as the zip file).
I have run the CODE below twice, and it keeps failing when processing the 15988th file.
I have googled and tried reading other posts about this particular error, but could not find a solution to this specific issue. Actually, I am not really sure what the problem is... any help is much appreciated!!
CODE:
import re
import sys
import json
import zipfile
import pymongo
import datetime
import xmltodict
from bs4 import BeautifulSoup
from pprint import pprint as ppt

def timestamper(stamp_type="regular"):
    if stamp_type == "regular":
        timestamp = str(datetime.datetime.now())
    elif stamp_type == "filename":
        timestamp = str(datetime.datetime.now()).replace("-", "").replace(":", "").replace(" ", "_")[:15]
    else:
        sys.exit("ERROR [timestamper()]: unexpected 'stamp_type' (parameter) encountered")
    return timestamp

client = pymongo.MongoClient()
db = client['ctgov']
coll_name = "ts_" + timestamper(stamp_type="filename")
coll = db[coll_name]

dirpath_zip = '/glbdat/ctgov/all/alltrials_20180402.zip'
z = zipfile.ZipFile(dirpath_zip, 'r')

i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        soup = BeautifulSoup(z.read(xmlfile), 'lxml')
        json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
        coll.insert_one(json_study)
    i += 1
Error message:
Traceback (most recent call last):
  File "zip_to_mongo_alltrials.py", line 38, in <module>
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
  File "/usr/local/lib/python3.5/dist-packages/bs4/__init__.py", line 225, in __init__
    markup, from_encoding, exclude_encodings=exclude_encodings)):
  File "/usr/local/lib/python3.5/dist-packages/bs4/builder/_lxml.py", line 118, in prepare_markup
    for encoding in detector.encodings:
  File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 264, in encodings
    self.chardet_encoding = chardet_dammit(self.markup)
  File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 34, in chardet_dammit
    return chardet.detect(s)['encoding']
  File "/usr/lib/python3/dist-packages/chardet/__init__.py", line 30, in detect
    u.feed(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/universaldetector.py", line 128, in feed
    if prober.feed(aBuf) == constants.eFoundIt:
  File "/usr/lib/python3/dist-packages/chardet/charsetgroupprober.py", line 64, in feed
    st = prober.feed(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/hebrewprober.py", line 224, in feed
    aBuf = self.filter_high_bit_only(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/charsetprober.py", line 53, in filter_high_bit_only
    aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
  File "/usr/lib/python3.5/re.py", line 182, in sub
    return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Try reading from the file and inserting into the database in a separate method. Also add gc.collect() for garbage collection.
import gc

def read_xml_insert(xmlfile):
    # Doing the parse and insert inside a function lets the large
    # intermediate objects go out of scope as soon as it returns.
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
    json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)

i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        read_xml_insert(xmlfile)
    i += 1
    gc.collect()  # explicitly reclaim memory after each file
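As a side note (an additional sketch, not part of the answer above): the traceback shows the MemoryError being raised inside chardet, which BeautifulSoup calls to guess the encoding of the raw bytes. Decoding each entry explicitly before parsing skips that detection step entirely. The snippet below assumes the ClinicalTrials.gov XML entries are UTF-8 encoded and reuses z, coll, json, re, xmltodict and BeautifulSoup from the question's setup.

def read_xml_insert(xmlfile):
    # Decoding up front hands BeautifulSoup a str, so it does not need to
    # run chardet's encoding detection on the raw bytes.
    # Assumption: the archive's XML entries are UTF-8 encoded.
    raw = z.read(xmlfile).decode('utf-8')
    soup = BeautifulSoup(raw, 'lxml')
    study = xmltodict.parse(str(soup.find('clinical_study')))
    coll.insert_one(json.loads(re.sub(r'\s', ' ', json.dumps(study)).strip()))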