Extract Wikipedia Data From XML Data Dumps
I have been trying to extract text from some Wikipedia dumps. I need to get the text from the id, title, ns, timestamp, username, ip, and text tags in a full-history English Wikipedia dump.
I read and modified the code from https://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html.
I was able to write the following code:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

PATH_WIKI_XML = '/home/wikipedia'
FILENAME_WIKI = 'enwiki-latest-pages-meta-history1.xml-p24706p25444'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
ENCODING = "utf-8"


def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

totalCount = 0
articleCount = 0
templateCount = 0
title = None
timestamp = None
username = None
ip = None
text = None
start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)
    articlesWriter.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)
        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                inrevision = False
                incontributor = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
            elif tname == 'contributor':
                incontributor = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision and not incontributor:
                id = int(elem.text)
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'timestamp':
                timestamp = int(elem.text)
            elif tname == 'username':
                username = elem.text
            elif tname == 'ip':
                ip = elem.text
            elif tname == 'text':
                text = elem.text
            elif tname == 'page':
                totalCount += 1
                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id, title])
                elif len(title) > 0:
                    articleCount += 1
                    articlesWriter.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])

                # if totalCount > 100000:
                #     break
                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))
            elem.clear()

elapsed_time = time.time() - start_time
print("Total pages: {:,}".format(totalCount))
print("Template pages: {:,}".format(templateCount))
print("Article pages: {:,}".format(articleCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))
However, the resulting CSV file only has the id and title. The other columns are empty, I guess because the timestamp tag is nested inside the revision tag, while my events only handle the start and end of the page tag. The username and ip are likewise nested inside the contributor tag, as in the sample XML file below. Can someone tell me how to fix this? Should I loop over events within events to extract the text from the nested tags I need? Is there code here that would work for me? The desired output is a single file with the desired tags as headers and their text in rows. Keep in mind that a page can have multiple revisions, each with different metadata. I want to get all the text from the desired tags along with the page's metadata, and once I have everything from one page, move on to the next page. Thanks.
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/
                               http://www.mediawiki.org/xml/export-0.10.xsd"
           version="0.10" xml:lang="sco">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <dbname>scowiki</dbname>
    <base>http://sco.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.25wmf12</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="first-letter">Wikipedia</namespace>
      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
      <namespace key="6" case="first-letter">File</namespace>
      <namespace key="7" case="first-letter">File talk</namespace>
      <namespace key="8" case="first-letter">MediaWiki</namespace>
      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
      <namespace key="10" case="first-letter">Template</namespace>
      <namespace key="11" case="first-letter">Template talk</namespace>
      <namespace key="12" case="first-letter">Help</namespace>
      <namespace key="13" case="first-letter">Help talk</namespace>
      <namespace key="14" case="first-letter">Category</namespace>
      <namespace key="15" case="first-letter">Category talk</namespace>
      <namespace key="100" case="first-letter">Portal</namespace>
      <namespace key="101" case="first-letter">Portal talk</namespace>
      <namespace key="828" case="first-letter">Module</namespace>
      <namespace key="829" case="first-letter">Module talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Inglis leid</title>
    <ns>0</ns>
    <id>2</id>
    <revision>
      <id>7</id>
      <timestamp>2005-06-22T10:17:05Z</timestamp>
      <contributor>
        <ip>24.251.198.251</ip>
      </contributor>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">Tha '''Inglis''' (English) leid is a west [[Gairmanic leid]] at cam frae Ingland an thats forebear wis [[auld Inglis]]. Tha name "English" cams frae tha pairt o [[Gairmanie]] caw'd "Angeln". Inglis is tha waruld's seicont maist widelie spaken first leid, an his aboot 340 million hameborn speikers waruldwide.
[[en:English language]]</text>
      <sha1>6m5yxiaalrm6te7e3x3fiw1aq7wk9ir</sha1>
    </revision>
  </page>
</mediawiki>
Simplify the attempted script down to the bare minimum you need, e.g., remove the timing. For very large XML files, the usual process here is to use iterparse to parse the document iteratively, tag by tag, wherever each tag sits in the document: as root, parent, child, descendant, and so on. So handle the logic tag by tag, and on the last needed tag write the row to the CSV using the currently assigned variables, which are reset on every <page> tag.
import codecs
import csv
import xml.etree.ElementTree as etree

pathWikiXML = "Input.xml"
pathWikiCSV = "Output.csv"


def strip_tag_name(t):
    return t.split("}")[1] if "}" in t else t


with codecs.open(pathWikiCSV, "w", "utf-8") as f:
    cw = csv.writer(f)
    cw.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = None
                timestamp = None
                username = None
                ip = None
                text = None
        # the remaining branches fire only on 'end' events,
        # when the element's text has been fully parsed
        elif tname == 'title':
            title = elem.text
        elif tname == 'id':
            id = int(elem.text)
        elif tname == 'ns':
            ns = int(elem.text)
        elif tname == 'timestamp':
            timestamp = elem.text
        elif tname == 'username':
            username = elem.text
        elif tname == 'ip':
            ip = elem.text
        elif tname == 'text':
            text = elem.text
            # <text> is the last needed tag: write the row now
            cw.writerow([id, title, timestamp, username, ip, text])

        elem.clear()
CSV Output

| id | title | timestamp | username | ip | text |
|----|-------|-----------|----------|----|------|
| 7 | Inglis leid | 2005-06-22T10:17:05Z | | 24.251.198.251 | "Tha '''Inglis''' (English)... |
Updated
After reading the comments below: you definitely want to break the really big thing down into the smallest possible things, then work through the smallest things in turn:
In the first stage, take any number of big, multi-page XML files and produce many small, single-page XML files.
In the second stage, iterate over each page file, extract the data, and write your CSV.
That takes longer, but:
- it is more fault tolerant: if you hit an error and things fall over, you can fix the error and pick up where you left off
- it is much easier to diagnose errors when you can cleanly split them into "a problem making the big things small" and "a problem extracting data from a small thing".
This is especially true with terabytes of data: you never want to try taking it all in one go.
Stage 1
#!/usr/bin/env python3
import xml.etree.ElementTree as ET

NS = {'xmlns': 'http://www.mediawiki.org/xml/export-0.10/'}

strip_ns = lambda t: t.split('}')[1] if '}' in t else t

# Open XML for input, and iterate
xml_f = open('big.xml')
for (event, elem) in ET.iterparse(xml_f, events=['start', 'end']):
    tag_name = strip_ns(elem.tag)
    # wait for the 'end' event: only then is the <page> subtree fully
    # parsed and its children visible to find()
    if event == 'end' and tag_name == 'page':
        pg_title = elem.find('xmlns:title', NS).text.replace(' ', '_')
        pg_ns = elem.find('xmlns:ns', NS).text
        pg_id = elem.find('xmlns:id', NS).text

        xml_pg_fname = f'{pg_title}_{pg_ns}_{pg_id}.xml'  # e.g., Inglis_leid_0_2.xml
        xml_byte_str = ET.tostring(elem, encoding='utf-8', default_namespace=NS['xmlns'])  # definitely use default_namespace
        with open(xml_pg_fname, 'wb') as f_out:
            f_out.write(xml_byte_str)
        elem.clear()  # free the finished page subtree before moving on

# Close big
xml_f.close()
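One caveat with this sketch: page titles can contain characters that are not legal in file names (subpage titles contain /, for instance), so the open() call can fail or write somewhere unintended. A small, hypothetical helper that sanitizes the name before writing:

import re

def safe_page_fname(title: str, ns: str, pg_id: str) -> str:
    """Hypothetical helper: map a page title to a filesystem-safe name."""
    safe_title = re.sub(r'[^A-Za-z0-9_.-]', '_', title)
    return f'{safe_title}_{ns}_{pg_id}.xml'

# e.g., safe_page_fname('Talk:Inglis leid/Archive 1', '1', '42')
# -> 'Talk_Inglis_leid_Archive_1_1_42.xml'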
Stage 2
Now you can easily and safely use XPath to get at the data.
#!/usr/bin/env python3
import csv
import xml.etree.ElementTree as ET

# Set up the namespace that needs to be a part of every XPath query
ns_dict = {'xmlns': 'http://www.mediawiki.org/xml/export-0.10/'}

# Open a file and parse it
page = ET.parse('Inglis_leid_0_2.xml')

# With the page "element", find its direct children (always using `xmlns:` and passing `ns_dict`)
# `./xmlns:<element>` means "find, from here (at page), the element that's only 1 level (directly) under page"
pg_title = page.find('./xmlns:title', ns_dict)
pg_ns = page.find('./xmlns:ns', ns_dict)
pg_id = page.find('./xmlns:id', ns_dict)

# Get the revision element
revision = page.find('./xmlns:revision', ns_dict)
rev_id = revision.find('./xmlns:id', ns_dict)
rev_ts = revision.find('./xmlns:timestamp', ns_dict)

# Find ip under contributor
contrib_ip = revision.find('./xmlns:contributor/xmlns:ip', ns_dict)

print('page title:', pg_title.text)
print('page id:', pg_id.text)
print('rev id:', rev_id.text)
print('rev timestamp:', rev_ts.text)
print('contributor ip:', contrib_ip.text)

# From here, write out to a CSV
with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Page title', 'Page id', 'Rev id', 'Rev timestamp', 'Contributor ip'])
    writer.writerow([pg_title.text, pg_id.text, rev_id.text, rev_ts.text, contrib_ip.text])
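Note that find() returns only the first <revision>, so the Stage 2 script extracts a single revision per page. For a full-history dump, where the question needs every revision, you would iterate them with findall() instead. A sketch under the same namespace assumptions (the output file name is assumed), which also allows for a <contributor> holding either a <username> or an <ip>:

#!/usr/bin/env python3
import csv
import xml.etree.ElementTree as ET

ns_dict = {'xmlns': 'http://www.mediawiki.org/xml/export-0.10/'}

page = ET.parse('Inglis_leid_0_2.xml')
pg_title = page.find('./xmlns:title', ns_dict).text
pg_id = page.find('./xmlns:id', ns_dict).text

with open('out_all_revisions.csv', 'w', newline='') as f:  # assumed output name
    writer = csv.writer(f)
    writer.writerow(['Page id', 'Page title', 'Rev id', 'Rev timestamp',
                     'Username', 'IP', 'Text'])

    # findall() returns every direct <revision> child, in document order
    for revision in page.findall('./xmlns:revision', ns_dict):
        rev_id = revision.find('./xmlns:id', ns_dict)
        rev_ts = revision.find('./xmlns:timestamp', ns_dict)
        # a contributor carries <username> for registered users, <ip> for anonymous ones
        username = revision.find('./xmlns:contributor/xmlns:username', ns_dict)
        ip = revision.find('./xmlns:contributor/xmlns:ip', ns_dict)
        text = revision.find('./xmlns:text', ns_dict)
        writer.writerow([
            pg_id, pg_title,
            rev_id.text if rev_id is not None else None,
            rev_ts.text if rev_ts is not None else None,
            username.text if username is not None else None,
            ip.text if ip is not None else None,
            text.text if text is not None else None,
        ])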