将 XML 文件转换为 Python 中的 CSV 文件
Converting an XML file to a CSV file in Python
我正在将 XML 文件转换为 CSV 文件,但在转换过程中遇到问题。
我收到一个 AttributeError:'NoneType' 对象没有属性 'tag'。
我已经为解决这个问题做了很多研究,不幸的是我没有成功地得出合适的结果,我看不出哪里错了。
XML 文件的片段
<?xml version="1.0" encoding="UTF-8"?>
<filebooks xmlns="http://www.something.com/xml/xxx/filebook/2006-10-31">
<filebook>
<header filebook-id="Some Title">
<currency>GBP</currency>
<display-name xml:lang="x-default">Some Name</display-name>
</header>
<file-tables>
<file_table product-id="111">
<amount quantity="1">21.5000</amount>
<file-info>xxx 01/06/2020:Test</file-info>
<date-from>2020-06-01</date-from>
<date-to>2020-06-02</date-to>
</file_table>
<file_table product-id="222">
<amount quantity="1">18.3000</amount>
<file-info>xxx 01/07/2020: Txt</file-info>
<date-from>2020-07-02</date-from>
<date-to>2020-07-02</date-to>
</file_table>
</file-tables>
</filebook>
</filebooks>
我希望 CSV 文件中的输出格式如下:
product_id; currency; amount; quantity; file_info; date_from; date_to
111;GBP;21.500;1;xxx 01/06/2020:Test;2020-06-01;2020-06-02
222;GBP;18.300;1;xxx 01/07/2020: Txt;2020-07-02;2020-07-02
这是我目前所做的。
# Attempt from the question: parse sample.xml with ElementTree and collect a
# header list (element tags) and a data list (element text) per <filebook>.
# NOTE(review): indentation was lost in this listing, so the exact extent of
# the `for` and `if` bodies below cannot be recovered from the text alone.
import xml.etree.ElementTree as ET
import csv
tree = ET.parse("sample.xml")
root = tree.getroot()
# Prefix -> URI map matching the default namespace declared on <filebooks>.
ns = {'nspace': 'http://www.something.com/xml/xxx/filebook/2006-10-31'}
# NOTE(review): opened without a `with` block; if anything below raises, the
# close() call at the end of the script is never reached.
price_data = open('data.csv', 'w')
# NOTE(review): no writerow()/writerows() call appears anywhere in this
# listing, so nothing is ever written to data.csv. csv.writer also defaults to
# ',' as delimiter while the desired output uses ';'.
csvwriter = csv.writer(price_data)
price_head = []
count = 0
print (root.findall('nspace:filebook', ns)) #debugging
for member in root.findall('nspace:filebook', ns):
price = []
# Header names are only collected on the first iteration (count == 0).
if count == 0:
# In the sample XML, <currency> sits inside <header>, not directly under
# <filebook>. find() with a bare tag name only looks at direct children, so it
# returns None here and `.tag` raises the reported
# AttributeError: 'NoneType' object has no attribute 'tag'.
currency = member.find('nspace:currency', ns).tag
price_head.append(currency)
# NOTE(review): from here on find() is called without the `ns` map, so the
# `nspace:` prefix cannot be resolved to the namespace URI. The sample XML
# also has no <product_id> or <quantity> elements: `product-id` is an
# attribute of <file_table> and `quantity` is an attribute of <amount>.
# <amount>, <file-info>, <date-from> and <date-to> are children of
# <file_table>, which is nested below <file-tables>, not directly under
# <filebook>.
product_id = member.find('nspace:product_id').tag
price_head.append(product_id)
amount = member.find('nspace:amount').tag
price_head.append(amount)
quantity = member.find('nspace:quantity').tag
price_head.append(quantity)
file_info = member.find('nspace:file-info').tag
price_head.append(file_info)
date_from = member.find('nspace:date-from').tag
price_head.append(date_from)
date_to = member.find('nspace:date-to').tag
price_head.append(date_to)
count = count + 1
# Same lookups again, this time reading `.text` for the data row; they share
# the problems noted on the header lookups above.
currency = member.find('nspace:currency', ns).text
price.append(currency)
product_id = member.find('nspace:product_id').text
price.append(product_id)
amount = member.find('nspace:amount').text
price.append(amount)
quantity = member.find('nspace:quantity').text
price.append(quantity)
file_info = member.find('nspace:file-info').text
price.append(file_info)
date_from = member.find('nspace:date-from').text
price.append(date_from)
date_to = member.find('nspace:date-to').text
price.append(date_to)
price_data.close()
另一种方法。
from simplified_scrapy import SimplifiedDoc, utils, req

# Sample document inlined as a string so the example is self-contained.
# The root start tag ends with a plain `>`: writing `\>` is not a valid Python
# escape sequence and would leave a stray backslash inside the markup.
html = '''<?xml version="1.0" encoding="UTF-8"?>
<filebooks xmlns="http://www.something.com/xml/xxx/filebook/2006-10-31">
<filebook>
<header filebook-id="Some Title">
<currency>GBP</currency>
<display-name xml:lang="x-default">Some Name</display-name>
</header>
<file-tables>
<file_table product-id="111">
<amount quantity="1">21.5000</amount>
<file-info>xxx 01/06/2020:Test</file-info>
<date-from>2020-06-01</date-from>
<date-to>2020-06-02</date-to>
</file_table>
<file_table product-id="222">
<amount quantity="1">18.3000</amount>
<file-info>xxx 01/07/2020: Txt</file-info>
<date-from>2020-07-02</date-from>
<date-to>2020-07-02</date-to>
</file_table>
</file-tables>
</filebook>
</filebooks>'''
doc = SimplifiedDoc(html)

# `rows` holds the CSV content: the header first, then one row per <file_table>.
rows = []
header = ['product_id', 'currency', 'amount', 'quantity', 'file_info', 'date_from','date_to']
rows.append(header)

filebooks = doc.selects('filebook')
for filebook in filebooks:
    # <currency> is read once per <filebook> and repeated on each of its rows.
    currency = filebook.currency.text
    file_tables = filebook.selects('file_table')
    for file_table in file_tables:
        amount = file_table.amount
        # Column order matches `header`: attributes are read with [...] and
        # element text with .text / the 'tag>text()' selector.
        row = [
            file_table['product-id'], currency, amount.text,
            amount['quantity'],
            file_table.select('file-info>text()'),
            file_table.select('date-from>text()'),
            file_table.select('date-to>text()')
        ]
        rows.append(row)

# Write header and data rows to data.csv.
utils.save2csv('data.csv', rows)
结果:
product_id,currency,amount,quantity,file_info,date_from,date_to
111,GBP,21.5000,1,xxx 01/06/2020:Test,2020-06-01,2020-06-02
222,GBP,18.3000,1,xxx 01/07/2020: Txt,2020-07-02,2020-07-02
这里有更多例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
另一种方法是使用 XML 解析器,因为数据本身就是 XML 格式。下面的代码略过了写入 CSV 的部分,因为那显然不是这里的问题所在:
# Placeholder for the XML text; replace it with the (validated) document.
cash = """[your xml above - validated as xml]"""
from lxml import etree

doc = etree.XML(cash.encode())
# Prefix -> URI map used to resolve the `nspace:` prefix in every lookup.
namespaces = {'nspace': 'http://www.something.com/xml/xxx/filebook/2006-10-31'}

rows = []
for filebook in doc.findall('nspace:filebook', namespaces):
    # './/' searches all descendants, so <currency> is found inside <header>.
    currency = filebook.find('.//nspace:currency', namespaces).text
    for table in filebook.findall('.//nspace:file_table', namespaces):
        # product-id is an attribute of <file_table>, not a child element.
        product_id = table.attrib['product-id']
        # Text of the direct children of <file_table>, in output order.
        texts = [
            table.find(path, namespaces).text
            for path in (
                './nspace:amount',
                './nspace:file-info',
                './nspace:date-from',
                './nspace:date-to',
            )
        ]
        rows.append([currency, product_id, *texts])

# Print one list per <file_table>.
for row in rows:
    print(row)
输出:
['GBP', '111', '21.5000', 'xxx 01/06/2020:Test', '2020-06-01', '2020-06-02']
['GBP', '222', '18.3000', 'xxx 01/07/2020: Txt', '2020-07-02', '2020-07-02']
显然,您可以根据需要调整字段顺序、添加表头名称等。
我正在将 XML 文件转换为 CSV 文件,但在转换过程中遇到问题。
我收到一个 AttributeError:'NoneType' 对象没有属性 'tag'。
我已经为解决这个问题做了很多研究,不幸的是我没有成功地得出合适的结果,我看不出哪里错了。
XML 文件的片段
<?xml version="1.0" encoding="UTF-8"?>
<filebooks xmlns="http://www.something.com/xml/xxx/filebook/2006-10-31">
<filebook>
<header filebook-id="Some Title">
<currency>GBP</currency>
<display-name xml:lang="x-default">Some Name</display-name>
</header>
<file-tables>
<file_table product-id="111">
<amount quantity="1">21.5000</amount>
<file-info>xxx 01/06/2020:Test</file-info>
<date-from>2020-06-01</date-from>
<date-to>2020-06-02</date-to>
</file_table>
<file_table product-id="222">
<amount quantity="1">18.3000</amount>
<file-info>xxx 01/07/2020: Txt</file-info>
<date-from>2020-07-02</date-from>
<date-to>2020-07-02</date-to>
</file_table>
</file-tables>
</filebook>
</filebooks>
我希望 CSV 文件中的输出格式如下:
product_id; currency; amount; quantity; file_info; date_from; date_to
111;GBP;21.500;1;xxx 01/06/2020:Test;2020-06-01;2020-06-02
222;GBP;18.300;1;xxx 01/07/2020: Txt;2020-07-02;2020-07-02
这是我目前所做的。
# Attempt from the question: parse sample.xml with ElementTree and collect a
# header list (element tags) and a data list (element text) per <filebook>.
# NOTE(review): indentation was lost in this listing, so the exact extent of
# the `for` and `if` bodies below cannot be recovered from the text alone.
import xml.etree.ElementTree as ET
import csv
tree = ET.parse("sample.xml")
root = tree.getroot()
# Prefix -> URI map matching the default namespace declared on <filebooks>.
ns = {'nspace': 'http://www.something.com/xml/xxx/filebook/2006-10-31'}
# NOTE(review): opened without a `with` block; if anything below raises, the
# close() call at the end of the script is never reached.
price_data = open('data.csv', 'w')
# NOTE(review): no writerow()/writerows() call appears anywhere in this
# listing, so nothing is ever written to data.csv. csv.writer also defaults to
# ',' as delimiter while the desired output uses ';'.
csvwriter = csv.writer(price_data)
price_head = []
count = 0
print (root.findall('nspace:filebook', ns)) #debugging
for member in root.findall('nspace:filebook', ns):
price = []
# Header names are only collected on the first iteration (count == 0).
if count == 0:
# In the sample XML, <currency> sits inside <header>, not directly under
# <filebook>. find() with a bare tag name only looks at direct children, so it
# returns None here and `.tag` raises the reported
# AttributeError: 'NoneType' object has no attribute 'tag'.
currency = member.find('nspace:currency', ns).tag
price_head.append(currency)
# NOTE(review): from here on find() is called without the `ns` map, so the
# `nspace:` prefix cannot be resolved to the namespace URI. The sample XML
# also has no <product_id> or <quantity> elements: `product-id` is an
# attribute of <file_table> and `quantity` is an attribute of <amount>.
# <amount>, <file-info>, <date-from> and <date-to> are children of
# <file_table>, which is nested below <file-tables>, not directly under
# <filebook>.
product_id = member.find('nspace:product_id').tag
price_head.append(product_id)
amount = member.find('nspace:amount').tag
price_head.append(amount)
quantity = member.find('nspace:quantity').tag
price_head.append(quantity)
file_info = member.find('nspace:file-info').tag
price_head.append(file_info)
date_from = member.find('nspace:date-from').tag
price_head.append(date_from)
date_to = member.find('nspace:date-to').tag
price_head.append(date_to)
count = count + 1
# Same lookups again, this time reading `.text` for the data row; they share
# the problems noted on the header lookups above.
currency = member.find('nspace:currency', ns).text
price.append(currency)
product_id = member.find('nspace:product_id').text
price.append(product_id)
amount = member.find('nspace:amount').text
price.append(amount)
quantity = member.find('nspace:quantity').text
price.append(quantity)
file_info = member.find('nspace:file-info').text
price.append(file_info)
date_from = member.find('nspace:date-from').text
price.append(date_from)
date_to = member.find('nspace:date-to').text
price.append(date_to)
price_data.close()
另一种方法。
from simplified_scrapy import SimplifiedDoc, utils, req

# Sample document inlined as a string so the example is self-contained.
# The root start tag ends with a plain `>`: writing `\>` is not a valid Python
# escape sequence and would leave a stray backslash inside the markup.
html = '''<?xml version="1.0" encoding="UTF-8"?>
<filebooks xmlns="http://www.something.com/xml/xxx/filebook/2006-10-31">
<filebook>
<header filebook-id="Some Title">
<currency>GBP</currency>
<display-name xml:lang="x-default">Some Name</display-name>
</header>
<file-tables>
<file_table product-id="111">
<amount quantity="1">21.5000</amount>
<file-info>xxx 01/06/2020:Test</file-info>
<date-from>2020-06-01</date-from>
<date-to>2020-06-02</date-to>
</file_table>
<file_table product-id="222">
<amount quantity="1">18.3000</amount>
<file-info>xxx 01/07/2020: Txt</file-info>
<date-from>2020-07-02</date-from>
<date-to>2020-07-02</date-to>
</file_table>
</file-tables>
</filebook>
</filebooks>'''
doc = SimplifiedDoc(html)

# `rows` holds the CSV content: the header first, then one row per <file_table>.
rows = []
header = ['product_id', 'currency', 'amount', 'quantity', 'file_info', 'date_from','date_to']
rows.append(header)

filebooks = doc.selects('filebook')
for filebook in filebooks:
    # <currency> is read once per <filebook> and repeated on each of its rows.
    currency = filebook.currency.text
    file_tables = filebook.selects('file_table')
    for file_table in file_tables:
        amount = file_table.amount
        # Column order matches `header`: attributes are read with [...] and
        # element text with .text / the 'tag>text()' selector.
        row = [
            file_table['product-id'], currency, amount.text,
            amount['quantity'],
            file_table.select('file-info>text()'),
            file_table.select('date-from>text()'),
            file_table.select('date-to>text()')
        ]
        rows.append(row)

# Write header and data rows to data.csv.
utils.save2csv('data.csv', rows)
结果:
product_id,currency,amount,quantity,file_info,date_from,date_to
111,GBP,21.5000,1,xxx 01/06/2020:Test,2020-06-01,2020-06-02
222,GBP,18.3000,1,xxx 01/07/2020: Txt,2020-07-02,2020-07-02
这里有更多例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
另一种方法是使用 XML 解析器,因为数据本身就是 XML 格式。下面的代码略过了写入 CSV 的部分,因为那显然不是这里的问题所在:
# Placeholder for the XML text; replace it with the (validated) document.
cash = """[your xml above - validated as xml]"""
from lxml import etree

doc = etree.XML(cash.encode())
# Prefix -> URI map used to resolve the `nspace:` prefix in every lookup.
namespaces = {'nspace': 'http://www.something.com/xml/xxx/filebook/2006-10-31'}

rows = []
for filebook in doc.findall('nspace:filebook', namespaces):
    # './/' searches all descendants, so <currency> is found inside <header>.
    currency = filebook.find('.//nspace:currency', namespaces).text
    for table in filebook.findall('.//nspace:file_table', namespaces):
        # product-id is an attribute of <file_table>, not a child element.
        product_id = table.attrib['product-id']
        # Text of the direct children of <file_table>, in output order.
        texts = [
            table.find(path, namespaces).text
            for path in (
                './nspace:amount',
                './nspace:file-info',
                './nspace:date-from',
                './nspace:date-to',
            )
        ]
        rows.append([currency, product_id, *texts])

# Print one list per <file_table>.
for row in rows:
    print(row)
输出:
['GBP', '111', '21.5000', 'xxx 01/06/2020:Test', '2020-06-01', '2020-06-02']
['GBP', '222', '18.3000', 'xxx 01/07/2020: Txt', '2020-07-02', '2020-07-02']
显然,您可以根据需要调整字段顺序、添加表头名称等。