使用 python 解析不同的 xml 文件
parsing different xml files with python
我有 2 个 xml 文件:
单词(words)文件和主题(topic)文件。
我需要根据主题文件来解析单词(words)文件。
文件如下
文件 1:主题(topic)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.topic"
xmlns:nite="http://nite.sourceforge.net/">
<topic nite:id="ES2002a.topic.vkaraisk.1" other_description="introduction of participants and their roles">
<nite:pointer role="scenario_topic_type" href="default-topics.xml#id(top.4)"/>
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
<nite:child href="ES2002a.D.words.xml#id(ES2002a.D.words0)..id(ES2002a.D.words3)"/>
<nite:child
文件 2:单词 (ES2002a.B.words.xml)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.B.words" xmlns:nite="http://nite.sourceforge.net/">
<w nite:id="ES2002a.B.words0" starttime="50.42" endtime="50.99">Okay</w>
<w nite:id="ES2002a.B.words1" starttime="50.99" endtime="50.99" punc="true">.</w>
<w nite:id="ES2002a.B.words2" starttime="53.56" endtime="53.96">Right</w>
<w nite:id="ES2002a.B.words3" starttime="53.96" endtime="53.96" punc="true">.</w>
<vocalsound nite:id="ES2002a.B.words4" starttime="55.415" endtime="55.415" type="other"/>
<w nite:id="ES2002a.B.words5" starttime="55.98" endtime="56.53">Um</w>
文件 2:单词 (ES2002a.D.words.xml)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.D.words" xmlns:nite="http://nite.sourceforge.net/">
<w nite:id="ES2002a.D.words0" starttime="67.21" endtime="67.45">Mm-hmm</w>
<w nite:id="ES2002a.D.words1" starttime="67.45" endtime="67.45" punc="true">.</w>
<w nite:id="ES2002a.D.words2" starttime="74.89" endtime="75.24">Great</w>
<w nite:id="ES2002a.D.words3" starttime="75.24" endtime="75.24" punc="true">.</w>
<w nite:id="ES2002a.D.words4" starttime="82.08" endtime="82.25">And</w>
<w nite:id="ES2002a.D.words5" starttime="82.25" endtime="82.43">I'm</w>
有多个单词(words)文件需要根据主题文件进行解析。
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
可以看到,主题文件要求从文件 ES2002a.B.words.xml
中获取 words0 到 words5 的单词
期望的输出是
Okay . Right . Um
Mm-hmm . Great
主题文件我已经解析过了,虽然代码很笨重
from lxml import etree
import numpy as np
import pandas as pd

# Flatten the topic file into one DataFrame row per element (walking three
# levels below the root), then extract the referenced words file and the
# first/last word index from each element's stringified attributes.
tree = etree.parse("./ES2013a.topic.xml")
root = tree.getroot()

childA = []    # str(attrib) of the enclosing top-level child, repeated per row
elementT = []  # str(tag) of the element the row describes
ElementA = []  # str(attrib) of the element the row describes

for child in root:
    elementT.append(str(child.tag))
    ElementA.append(str(child.attrib))
    childA.append(str(child.attrib))
    for element in child:
        elementT.append(str(element.tag))
        ElementA.append(str(element.attrib))
        # Rows for nested elements carry the top-level child's attributes.
        childA.append(str(child.attrib))
        for sub in element:
            elementT.append(str(sub.tag))
            ElementA.append(str(sub.attrib))
            childA.append(str(child.attrib))

# NOTE: column 't' holds the attribute strings and column 'a' holds the tag
# strings; both helper columns are dropped again at the end of the script.
df = pd.DataFrame()
df['c'] = np.array(childA)
df['t'] = np.array(ElementA)
df['a'] = np.array(elementT)

# Name of the referenced words file: everything from the first pair of
# upper-case letters up to the literal "words.xml" that precedes '#'.
file = df['t'].str.extract(r'([A-Z][A-Z].*words\.xml)#')
# Without a leading '.*' the first "words<N>" in the string is captured.
start = df['t'].str.extract(r'words([0-9]+)')
# The greedy '.*' makes this capture the last "words<N>" in the string.
stop = df['t'].str.extract(r'.*words([0-9]+)')
# The bracketed groups are character classes, so after the greedy '.*' the
# capture is the last matching letter of the tag string:
#   c = topic
#   r = pointer
#   d = child
tags = df['a'].str.extract(r'.*([topic]|[pointer]|[child])')
rootTopic = df['c'].str.extract(r'ES2013a\.topic\.rdhillon\.(\d+)')

df['f'] = file
df['start'] = start
df['stop'] = stop
df['tags'] = tags
df['topicID'] = rootTopic

# Drop the three raw helper columns ('c', 't', 'a'); keep the extracted ones.
df = df.iloc[:, 3:]
我正在考虑读取各个单词(words)文件,然后根据起始和结束条件遍历其中的单词
我曾经用 jxmlease 来解析 xml。它只是将 XML 字符串转换为 Python 字典(dictionary)。
topic.xml
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.topic"
xmlns:nite="http://nite.sourceforge.net/">
<topic nite:id="ES2002a.topic.vkaraisk.1" other_description="introduction of participants and their roles">
<nite:pointer role="scenario_topic_type" href="default-topics.xml#id(top.4)"/>
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
<nite:child href="ES2002a.D.words.xml#id(ES2002a.D.words0)..id(ES2002a.D.words3)"/>
</topic>
</nite:root>
import jxmlease

# The XML declaration in topic.xml names ISO-8859-1, so decode the file with
# that encoding instead of relying on the platform default.
with open('topic.xml', encoding='ISO-8859-1') as topic:
    topic_content = topic.read()

# Parse the XML text, then read the href attribute of the first <nite:child>
# entry under <topic>.
root = jxmlease.parse(topic_content)
first_word_selection = root['nite:root']['topic']['nite:child'][0].get_xml_attr("href")
print(first_word_selection)
output : ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)
我有 2 个 xml 文件:单词(words)文件和主题(topic)文件。
我需要根据主题文件来解析单词(words)文件。文件如下
文件 1:主题(topic)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.topic"
xmlns:nite="http://nite.sourceforge.net/">
<topic nite:id="ES2002a.topic.vkaraisk.1" other_description="introduction of participants and their roles">
<nite:pointer role="scenario_topic_type" href="default-topics.xml#id(top.4)"/>
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
<nite:child href="ES2002a.D.words.xml#id(ES2002a.D.words0)..id(ES2002a.D.words3)"/>
<nite:child
文件 2:单词 (ES2002a.B.words.xml)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.B.words" xmlns:nite="http://nite.sourceforge.net/">
<w nite:id="ES2002a.B.words0" starttime="50.42" endtime="50.99">Okay</w>
<w nite:id="ES2002a.B.words1" starttime="50.99" endtime="50.99" punc="true">.</w>
<w nite:id="ES2002a.B.words2" starttime="53.56" endtime="53.96">Right</w>
<w nite:id="ES2002a.B.words3" starttime="53.96" endtime="53.96" punc="true">.</w>
<vocalsound nite:id="ES2002a.B.words4" starttime="55.415" endtime="55.415" type="other"/>
<w nite:id="ES2002a.B.words5" starttime="55.98" endtime="56.53">Um</w>
文件 2:单词 (ES2002a.D.words.xml)
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.D.words" xmlns:nite="http://nite.sourceforge.net/">
<w nite:id="ES2002a.D.words0" starttime="67.21" endtime="67.45">Mm-hmm</w>
<w nite:id="ES2002a.D.words1" starttime="67.45" endtime="67.45" punc="true">.</w>
<w nite:id="ES2002a.D.words2" starttime="74.89" endtime="75.24">Great</w>
<w nite:id="ES2002a.D.words3" starttime="75.24" endtime="75.24" punc="true">.</w>
<w nite:id="ES2002a.D.words4" starttime="82.08" endtime="82.25">And</w>
<w nite:id="ES2002a.D.words5" starttime="82.25" endtime="82.43">I'm</w>
有多个单词(words)文件需要根据主题文件进行解析。
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
可以看到,主题文件要求从文件 ES2002a.B.words.xml
中获取 words0 到 words5 的单词。期望的输出是:Okay . Right . Um 以及 Mm-hmm . Great
主题文件我已经解析过了,虽然代码很笨重
from lxml import etree
import numpy as np
import pandas as pd

# Flatten the topic file into one DataFrame row per element (walking three
# levels below the root), then extract the referenced words file and the
# first/last word index from each element's stringified attributes.
tree = etree.parse("./ES2013a.topic.xml")
root = tree.getroot()

childA = []    # str(attrib) of the enclosing top-level child, repeated per row
elementT = []  # str(tag) of the element the row describes
ElementA = []  # str(attrib) of the element the row describes

for child in root:
    elementT.append(str(child.tag))
    ElementA.append(str(child.attrib))
    childA.append(str(child.attrib))
    for element in child:
        elementT.append(str(element.tag))
        ElementA.append(str(element.attrib))
        # Rows for nested elements carry the top-level child's attributes.
        childA.append(str(child.attrib))
        for sub in element:
            elementT.append(str(sub.tag))
            ElementA.append(str(sub.attrib))
            childA.append(str(child.attrib))

# NOTE: column 't' holds the attribute strings and column 'a' holds the tag
# strings; both helper columns are dropped again at the end of the script.
df = pd.DataFrame()
df['c'] = np.array(childA)
df['t'] = np.array(ElementA)
df['a'] = np.array(elementT)

# Name of the referenced words file: everything from the first pair of
# upper-case letters up to the literal "words.xml" that precedes '#'.
file = df['t'].str.extract(r'([A-Z][A-Z].*words\.xml)#')
# Without a leading '.*' the first "words<N>" in the string is captured.
start = df['t'].str.extract(r'words([0-9]+)')
# The greedy '.*' makes this capture the last "words<N>" in the string.
stop = df['t'].str.extract(r'.*words([0-9]+)')
# The bracketed groups are character classes, so after the greedy '.*' the
# capture is the last matching letter of the tag string:
#   c = topic
#   r = pointer
#   d = child
tags = df['a'].str.extract(r'.*([topic]|[pointer]|[child])')
rootTopic = df['c'].str.extract(r'ES2013a\.topic\.rdhillon\.(\d+)')

df['f'] = file
df['start'] = start
df['stop'] = stop
df['tags'] = tags
df['topicID'] = rootTopic

# Drop the three raw helper columns ('c', 't', 'a'); keep the extracted ones.
df = df.iloc[:, 3:]
我正在考虑读取各个单词(words)文件,然后根据起始和结束条件遍历其中的单词
我曾经用 jxmlease 来解析 xml。它只是将 XML 字符串转换为 Python 字典(dictionary)。
topic.xml
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<nite:root nite:id="ES2002a.topic"
xmlns:nite="http://nite.sourceforge.net/">
<topic nite:id="ES2002a.topic.vkaraisk.1" other_description="introduction of participants and their roles">
<nite:pointer role="scenario_topic_type" href="default-topics.xml#id(top.4)"/>
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)"/>
<nite:child href="ES2002a.D.words.xml#id(ES2002a.D.words0)..id(ES2002a.D.words3)"/>
</topic>
</nite:root>
import jxmlease

# The XML declaration in topic.xml names ISO-8859-1, so decode the file with
# that encoding instead of relying on the platform default.
with open('topic.xml', encoding='ISO-8859-1') as topic:
    topic_content = topic.read()

# Parse the XML text, then read the href attribute of the first <nite:child>
# entry under <topic>.
root = jxmlease.parse(topic_content)
first_word_selection = root['nite:root']['topic']['nite:child'][0].get_xml_attr("href")
print(first_word_selection)
output : ES2002a.B.words.xml#id(ES2002a.B.words0)..id(ES2002a.B.words5)