将动态 XML 文件转换为 CSV 文件 - Python
Convert dynamic XML file to CSV file - Python
我想转换这个 XML 文件:
<record id="idOne">
<ts date="2019-07-03" time="15:28:41.720440">5</ts>
<ts date="2019-07-03" time="15:28:42.629959">10</ts>
<ts date="2019-07-03" time="15:28:43.552677">15</ts>
<ts date="2019-07-03" time="15:28:43.855345">20</ts>
</record>
<record id="idOne">
<ts date="2019-07-03" time="15:28:45.072922">30</ts>
<ts date="2019-07-03" time="15:28:45.377087">35</ts>
<ts date="2019-07-03" time="15:28:46.316321">40</ts>
<ts date="2019-07-03" time="15:28:47.527960">45</ts>
</record>
转换为如下 CSV 文件:
ID, date, time, value
idOne, 2019-07-03, 15:28:41.720440, 5
idOne, 2019-07-03, 15:28:42.629959, 10
idOne, 2019-07-03, 15:28:43.552677, 15
idOne, 2019-07-03, 15:28:43.855345, 20
idOne, 2019-07-03, 15:28:45.072922, 30
idOne, 2019-07-03, 15:28:45.377087, 35
idOne, 2019-07-03, 15:28:46.316321, 40
idOne, 2019-07-03, 15:28:47.527960, 45
文件中可能包含多个带有 ID 的 record 结构。
我使用 lxml 库。
我尝试了 xpath 方法和 for 循环,但只能得到 ID,得不到其余字段。问题出在第二个 for 循环,但我不知道如何处理 "date" 和 "time" 的值……
# Author's original attempt: read <record> fragments from a file and flatten them into a CSV via pandas.
# NOTE(review): indentation was lost in this listing, so the loop nesting below is not visible here.
with open(args.input, "r") as f:
# wrap the file's lines in a synthetic <root> element so the sibling <record> fragments parse as one XML document
records = itertools.chain('<root>', f, '</root>')
root = etree.fromstringlist(records)
#root = etree.fromstring(records)
# count the number of records
NumberRecords = int(root.xpath('count(//record)'))
# one (initially empty) row list per <record> element
RecordsGrid = [[] for __ in range(NumberRecords)]
# column names: the first is read as an attribute of <record>, the rest are looked up under its <ts> children
tss = ["id","date", "time", "value"]
paths = root.xpath('//record')
#print(paths)
# index of the row in RecordsGrid that is currently being filled
Counter = 0
for path in paths:
# tss[:1] is just "id": read it as an attribute of the <record> element itself
for ts in tss[:1]:
target = f'(./@{ts})' # using f-strings to populate the full path
if path.xpath(target):
# we start populating our current sublist with the relevant info
RecordsGrid[Counter].append(path.xpath(target)[0])
else:
RecordsGrid[Counter].append('NA')
# tss[1:] is "date", "time", "value"
for ts in tss[1:]:
# NOTE(review): this selects the text of <ts> children whose "name" attribute equals the column name.
# The sample <ts> elements only carry "date" and "time" attributes (no "name"), so the query
# matches nothing and 'NA' is appended, which is consistent with the NA columns in the reported output.
target = f'(./ts[@name="{ts}"]/text())'
if path.xpath(target):
RecordsGrid[Counter].append(path.xpath(target)[0])
else:
RecordsGrid[Counter].append('NA')
Counter += 1
# now that we have our lists, create a df
df = pd.DataFrame(RecordsGrid, columns=tss)
df.to_csv(args.output, sep=',', encoding='utf-8', index=False)
结果如下:
id,date,time,value
idOne,NA,NA,NA
感谢您的宝贵时间。
尝试以下方法
from bs4 import BeautifulSoup as bs
# Parse the input once; BeautifulSoup reads the whole file when it is constructed.
with open("data.xml") as source:
    soup = bs(source, "html.parser")

# One CSV line per <ts> element, prefixed with the id of its enclosing <record>.
data = [
    ", ".join([record.get("id"), ts.get("date"), ts.get("time"), ts.text]) + "\n"
    for record in soup.find_all("record")
    for ts in record.find_all("ts")
]

# Header first, then every collected line.
with open("data.csv", "w") as target:
    target.write("ID, date, time, value\n")
    target.writelines(data)
如果想用 lxml,只需把字符串传给 etree.HTML() 即可。使用 XPath //record/ts(以双斜杠 // 开头)可以取到所有 ts 节点。主 ID 可以通过先调用 .getparent() 再读取其属性来获得。
要将 XML 转换为 CSV,我建议使用 Python 标准库中的 csv 模块。你也可以用普通的文件写入,但 csv 写入器会替你处理很多细节问题,代码也更整洁。
总体来看,你现在用一段代码处理了所有事情。我建议把逻辑拆分成函数——想想单一职责原则(Single Responsibility)。在下面的解决方案中,我先把 XML 节点转换为 namedtuple,再把 namedtuple 写入 CSV。这样维护和阅读都容易得多(即表头文本只在一处设置,数据只在一处填充)。
from lxml import etree
import csv  # part of the Python standard library; no pip install needed
import collections
from collections import namedtuple
Record = namedtuple('Record', ['id', 'date', 'time', 'value'])  # Model to store records.


def CreateCsvFile(results, path='results.csv'):
    """Write records to a CSV file, preceded by a header row.

    Args:
        results: Iterable of ``Record`` instances (any object whose
            ``_asdict()`` yields exactly the ``Record`` field names works).
        path: Destination file. Defaults to ``results.csv`` in the current
            working directory, which is where the file was always written
            before this parameter existed.
    """
    # newline='' lets the csv module control line endings itself; UTF-8 is
    # pinned so the output does not depend on the platform's locale encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        # The header is derived from the namedtuple fields so the column
        # names are defined in exactly one place.
        writer = csv.DictWriter(csvfile, fieldnames=list(Record._fields))
        writer.writeheader()
        # DictWriter needs mappings, so each namedtuple is converted on the fly.
        writer.writerows(record._asdict() for record in results)
def FormatRecord(xmlNode):
    """Build a ``Record`` from an element node.

    The id is read from the parent element's ``id`` attribute, date and time
    from the node's own ``date`` / ``time`` attributes, and the value from
    the node's text.
    """
    parent_id = xmlNode.getparent().attrib['id']
    own_attributes = xmlNode.attrib
    return Record(
        parent_id,
        own_attributes["date"],
        own_attributes["time"],
        xmlNode.text,
    )
def Main(html):
    """Parse *html*, turn every ``record/ts`` node into a ``Record`` and write them to CSV."""
    parsed_tree = etree.HTML(html)
    # The leading double slash (//) searches the whole tree, so this selects
    # every <ts> element that is a direct child of a <record>.
    ts_nodes = parsed_tree.xpath('//record/ts')
    results = list(map(FormatRecord, ts_nodes))
    CreateCsvFile(results)
if __name__ == '__main__':
    # Sample input: two <record> fragments with four <ts> entries each.
    sample_markup = """<record id="idOne">
<ts date="2019-07-03" time="15:28:41.720440">5</ts>
<ts date="2019-07-03" time="15:28:42.629959">10</ts>
<ts date="2019-07-03" time="15:28:43.552677">15</ts>
<ts date="2019-07-03" time="15:28:43.855345">20</ts>
</record>
<record id="idTwo">
<ts date="2019-07-03" time="15:28:45.072922">30</ts>
<ts date="2019-07-03" time="15:28:45.377087">35</ts>
<ts date="2019-07-03" time="15:28:46.316321">40</ts>
<ts date="2019-07-03" time="15:28:47.527960">45</ts>
</record>"""
    Main(sample_markup)
我想转换这个 XML 文件:
<record id="idOne">
<ts date="2019-07-03" time="15:28:41.720440">5</ts>
<ts date="2019-07-03" time="15:28:42.629959">10</ts>
<ts date="2019-07-03" time="15:28:43.552677">15</ts>
<ts date="2019-07-03" time="15:28:43.855345">20</ts>
</record>
<record id="idOne">
<ts date="2019-07-03" time="15:28:45.072922">30</ts>
<ts date="2019-07-03" time="15:28:45.377087">35</ts>
<ts date="2019-07-03" time="15:28:46.316321">40</ts>
<ts date="2019-07-03" time="15:28:47.527960">45</ts>
</record>
转换为如下 CSV 文件:
ID, date, time, value
idOne, 2019-07-03, 15:28:41.720440, 5
idOne, 2019-07-03, 15:28:42.629959, 10
idOne, 2019-07-03, 15:28:43.552677, 15
idOne, 2019-07-03, 15:28:43.855345, 20
idOne, 2019-07-03, 15:28:45.072922, 30
idOne, 2019-07-03, 15:28:45.377087, 35
idOne, 2019-07-03, 15:28:46.316321, 40
idOne, 2019-07-03, 15:28:47.527960, 45
文件中可能包含多个带有 ID 的 record 结构。
我使用 lxml 库。
我尝试了 xpath 方法和 for 循环,但只能得到 ID,得不到其余字段。问题出在第二个 for 循环,但我不知道如何处理 "date" 和 "time" 的值……
with open(args.input, "r") as f:
# Author's original attempt (continued): parse the <record> fragments and flatten them into a CSV via pandas.
# NOTE(review): indentation was lost in this listing, so the loop nesting below is not visible here.
# wrap the file's lines in a synthetic <root> element so the sibling <record> fragments parse as one XML document
records = itertools.chain('<root>', f, '</root>')
root = etree.fromstringlist(records)
#root = etree.fromstring(records)
# count the number of records
NumberRecords = int(root.xpath('count(//record)'))
# one (initially empty) row list per <record> element
RecordsGrid = [[] for __ in range(NumberRecords)]
# column names: the first is read as an attribute of <record>, the rest are looked up under its <ts> children
tss = ["id","date", "time", "value"]
paths = root.xpath('//record')
#print(paths)
# index of the row in RecordsGrid that is currently being filled
Counter = 0
for path in paths:
# tss[:1] is just "id": read it as an attribute of the <record> element itself
for ts in tss[:1]:
target = f'(./@{ts})' # using f-strings to populate the full path
if path.xpath(target):
# we start populating our current sublist with the relevant info
RecordsGrid[Counter].append(path.xpath(target)[0])
else:
RecordsGrid[Counter].append('NA')
# tss[1:] is "date", "time", "value"
for ts in tss[1:]:
# NOTE(review): this selects the text of <ts> children whose "name" attribute equals the column name.
# The sample <ts> elements only carry "date" and "time" attributes (no "name"), so the query
# matches nothing and 'NA' is appended, which is consistent with the NA columns in the reported output.
target = f'(./ts[@name="{ts}"]/text())'
if path.xpath(target):
RecordsGrid[Counter].append(path.xpath(target)[0])
else:
RecordsGrid[Counter].append('NA')
Counter += 1
# now that we have our lists, create a df
df = pd.DataFrame(RecordsGrid, columns=tss)
df.to_csv(args.output, sep=',', encoding='utf-8', index=False)
结果如下:
id,date,time,value
idOne,NA,NA,NA
感谢您的宝贵时间。
尝试以下方法
from bs4 import BeautifulSoup as bs
# Parse the input once; BeautifulSoup reads the whole file when it is constructed.
with open("data.xml") as source:
    soup = bs(source, "html.parser")

# One CSV line per <ts> element, prefixed with the id of its enclosing <record>.
data = [
    ", ".join([record.get("id"), ts.get("date"), ts.get("time"), ts.text]) + "\n"
    for record in soup.find_all("record")
    for ts in record.find_all("ts")
]

# Header first, then every collected line.
with open("data.csv", "w") as target:
    target.write("ID, date, time, value\n")
    target.writelines(data)
如果想用 lxml,只需把字符串传给 etree.HTML() 即可。使用 XPath //record/ts(以双斜杠 // 开头)可以取到所有 ts 节点。主 ID 可以通过先调用 .getparent() 再读取其属性来获得。
要将 XML 转换为 CSV,我建议使用 Python 标准库中的 csv 模块。你也可以用普通的文件写入,但 csv 写入器会替你处理很多细节问题,代码也更整洁。
总体来看,你现在用一段代码处理了所有事情。我建议把逻辑拆分成函数——想想单一职责原则(Single Responsibility)。在下面的解决方案中,我先把 XML 节点转换为 namedtuple,再把 namedtuple 写入 CSV。这样维护和阅读都容易得多(即表头文本只在一处设置,数据只在一处填充)。
from lxml import etree
import csv  # part of the Python standard library; no pip install needed
import collections
from collections import namedtuple
Record = namedtuple('Record', ['id', 'date', 'time', 'value'])  # Model to store records.


def CreateCsvFile(results, path='results.csv'):
    """Write records to a CSV file, preceded by a header row.

    Args:
        results: Iterable of ``Record`` instances (any object whose
            ``_asdict()`` yields exactly the ``Record`` field names works).
        path: Destination file. Defaults to ``results.csv`` in the current
            working directory, which is where the file was always written
            before this parameter existed.
    """
    # newline='' lets the csv module control line endings itself; UTF-8 is
    # pinned so the output does not depend on the platform's locale encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        # The header is derived from the namedtuple fields so the column
        # names are defined in exactly one place.
        writer = csv.DictWriter(csvfile, fieldnames=list(Record._fields))
        writer.writeheader()
        # DictWriter needs mappings, so each namedtuple is converted on the fly.
        writer.writerows(record._asdict() for record in results)
def FormatRecord(xmlNode):
    """Build a ``Record`` from an element node.

    The id is read from the parent element's ``id`` attribute, date and time
    from the node's own ``date`` / ``time`` attributes, and the value from
    the node's text.
    """
    parent_id = xmlNode.getparent().attrib['id']
    own_attributes = xmlNode.attrib
    return Record(
        parent_id,
        own_attributes["date"],
        own_attributes["time"],
        xmlNode.text,
    )
def Main(html):
    """Parse *html*, turn every ``record/ts`` node into a ``Record`` and write them to CSV."""
    parsed_tree = etree.HTML(html)
    # The leading double slash (//) searches the whole tree, so this selects
    # every <ts> element that is a direct child of a <record>.
    ts_nodes = parsed_tree.xpath('//record/ts')
    results = list(map(FormatRecord, ts_nodes))
    CreateCsvFile(results)
if __name__ == '__main__':
    # Sample input: two <record> fragments with four <ts> entries each.
    sample_markup = """<record id="idOne">
<ts date="2019-07-03" time="15:28:41.720440">5</ts>
<ts date="2019-07-03" time="15:28:42.629959">10</ts>
<ts date="2019-07-03" time="15:28:43.552677">15</ts>
<ts date="2019-07-03" time="15:28:43.855345">20</ts>
</record>
<record id="idTwo">
<ts date="2019-07-03" time="15:28:45.072922">30</ts>
<ts date="2019-07-03" time="15:28:45.377087">35</ts>
<ts date="2019-07-03" time="15:28:46.316321">40</ts>
<ts date="2019-07-03" time="15:28:47.527960">45</ts>
</record>"""
    Main(sample_markup)