XML 解析 Python 3
XML parsing in Python 3
我有一个像这样的 xml 文件,它是下面这个 DataFrame(DF)中的一列(Series):
userid | fid | response
-----------------------
1125 | 58940 | xml1
3344 | 47839 | xml2
3455 | 12335 | xml3
响应(response)列中包含如下所示的 xml 文件:
HTTP/1.1 100 Continue
HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT
<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>
这只是整个文档的一部分。我必须解析这个大 xml 并将其转换为 json。我遇到的第一个问题是解析这个文件。我当前的代码如下所示:
# Asker's original attempt: write the first response to a text file, overwrite
# list index 12 with <root>, append </root>, save the result and parse it.
import pandas as pd
import re
# NOTE(review): in a non-raw Python 3 string literal '\U' and '\x' start
# escape sequences, so this Windows path needs a raw string (r'...').
raw_data = pd.read_csv('C:\Users\Desktop\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file_list = file.split('\n')
# Overwrites the entry at index 12 with the opening root tag.
file_list[12] = '<root>'
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
# NOTE(review): this handle is not closed before ET.parse reads the file.
f = open("tocng.xml","w")
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
### Gives an error:XML or text declaration not at start of entity: line 1, column 6
我不明白这里有什么问题。
初始版本
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Initial version: drop the HTTP header lines by position, wrap the remaining
# XML in a <root> element and parse it.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first response to a plain text file (label-based lookup instead of
# the positional df.loc[0][2]).
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()
file_list = file.split('\n')
file_list[13] = '<root>'  ### 12 to 13 to overwrite deformed <creditBureau ...>
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]

# Converting to String
str1 = ''.join(new_list)
# The context manager closes the file handle so ET can read it.
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
几个问题:
- 您在 <root> 标记之后留下了一个 xml 声明元素。请删除该声明。
- 声明中带有 "",使其格式变形(不合法)。去掉声明即可正常工作。
- 写入 tocng.xml 之后没有关闭文件句柄,这会导致 ET 读取失败。
如果你想使用我在上一个帖子(previous post)中提供的正则表达式,那么可以试试下面的代码,因为它可以删除 HTTP 头(header),而不需要数行数。随后 <root> 被写入索引 1 的位置。
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Regex version: strip the HTTP header without counting lines, then wrap the
# XML body in a <root> element and parse it.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()

# Drop everything before the <?xml declaration. The \1 backreference keeps the
# XML itself; an empty replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
file_list = file.split('\n')
# After the substitution the text starts at <?xml, so the entry at index 1
# (the line after the declaration) is overwritten with the root tag.
file_list[1] = '<root>'
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]

# Converting to String
str1 = ''.join(new_list)
# The context manager closes the file handle so ET can read it.
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
基于新的单行xml
xml 文件中的更多变化可能需要对代码进行调整。
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Single-line xml version: split the document after every '>', drop the xml
# declaration and the <creditBureau ...> opening tag, wrap the rest in <root>.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()

# Replace up to <?xml tag. The \1 backreference keeps the XML itself; an empty
# replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Strip file and add \n at each instance of >.
file = file.strip()
file = file.replace('>', '>\n')

# Split file and make a list with no empty items.
file_list = file.split('\n')
file_list = [item for item in file_list if item != '']

# Remove known xml declarations (guarded so an empty list cannot raise).
if file_list and file_list[0][:5] == '<?xml':
    del file_list[0]
if file_list and file_list[0][:13] == '<creditBureau':
    del file_list[0]

# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')

# Converting to String
str1 = ''.join(file_list)
print(str1)  ## See output in my answer
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
输出:
<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>
编辑:如果需要保留 <creditBureau...> 标签,请删除或注释掉这些行:
if file_list[0][:13] == '<creditBureau':
del file_list[0]
最新发布的 xml 中的初始标签看起来没有格式错误,因此没有为处理它而做任何更改。
xml 字符串的正则表达式自定义
使用正则表达式而不是列表来处理 xml 字符串,因为 xml 可能是多行的,也可能是单行的。
在 customize_xml 函数中,正则表达式的注释标明了各个组号,pattern 的注释标明了各个模式(mode)编号;您可以把模式编号作为参数传递给 customize_xml 函数。
有效的 mode 参数是 None、-1、0、1、2、3、4 之一。
import pandas as pd
import re
def customize_xml(content, mode=0):
    '''Customize the xml tags in content and optionally insert a <root> tag.

    content is searched for an xml declaration immediately followed by a
    <creditBureau ...> element; that span is rewritten according to mode.
    Text that does not match is returned unchanged.

    mode values:
        None, -1  return content unmodified
        0         keep only <creditBureau>...</creditBureau>
        1         wrap <creditBureau>...</creditBureau> in <root>
        2         like 1, keeping the xml declaration in front
        3         replace the <creditBureau> tags with <root> tags
        4         like 3, keeping the xml declaration in front

    Other values are not validated; mode is used directly as an index into
    the tuple of replacement templates.
    '''
    # No modification.
    if mode in (-1, None):
        return content

    # Select a pattern (mode) that modifies the whole xml. The backreferences
    # refer to the groups of the regular expression below.
    pattern = (r'\2\3\4',                 # 0. <cB>...</cB>
               r'<root>\2\3\4</root>',    # 1. <root><cB>...</cB></root>
               r'\1<root>\2\3\4</root>',  # 2. <?xml?><root><cB>...</cB></root>
               r'<root>\3</root>',        # 3. <root>...</root>
               r'\1<root>\3</root>',      # 4. <?xml?><root>...</root>
               )

    # Groups are marked as \1 \2 \3 \4 to use for pattern above.
    content = re.sub(r'(<\?xml.+?\?>)'       # \1 xml declaration
                     r'(<creditBureau.*?>)'  # \2 opening creditBureau tag
                     r'(.+?)'                # \3 everything between the tags
                     r'(</creditBureau>)'    # \4 closing creditBureau tag
                     , pattern[mode], content, flags=re.S)
    return content
import xml.etree.ElementTree as ET
from xml.dom import minidom

# Driver: extract the first response, strip the HTTP header, rewrite the xml
# with customize_xml (mode 3) and parse / pretty-print the result.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

with open("testfile.txt") as f:
    file = f.read()

# Remove characters up to <?xml tag. The \1 backreference keeps the XML
# itself; an empty replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Make xml one single line if not already.
file = file.replace('\n', '')

file = customize_xml(file, 3)

# Write customized xml.
with open("tocng.xml", "w") as f:
    f.write(file)

# Parsing xml.
tree = ET.parse('tocng.xml')

# Print pretty xml from xml string.
pretty_xml = minidom.parseString(file).toprettyxml(indent=" ")
print(pretty_xml)
在最后添加了美化打印(pretty print)。这一步是可选的,仅用于查看结果。
我有一个像这样的 xml 文件,它是下面这个 DataFrame(DF)中的一列(Series):
userid | fid | response
-----------------------
1125 | 58940 | xml1
3344 | 47839 | xml2
3455 | 12335 | xml3
响应(response)列中包含如下所示的 xml 文件:
HTTP/1.1 100 Continue
HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT
<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>
这只是整个文档的一部分。我必须解析这个大 xml 并将其转换为 json。我遇到的第一个问题是解析这个文件。我当前的代码如下所示:
# Asker's original attempt: write the first response to a text file, overwrite
# list index 12 with <root>, append </root>, save the result and parse it.
import pandas as pd
import re
# NOTE(review): in a non-raw Python 3 string literal '\U' and '\x' start
# escape sequences, so this Windows path needs a raw string (r'...').
raw_data = pd.read_csv('C:\Users\Desktop\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file_list = file.split('\n')
# Overwrites the entry at index 12 with the opening root tag.
file_list[12] = '<root>'
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
# NOTE(review): this handle is not closed before ET.parse reads the file.
f = open("tocng.xml","w")
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
### Gives an error:XML or text declaration not at start of entity: line 1, column 6
我不明白这里有什么问题。
初始版本
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Initial version: drop the HTTP header lines by position, wrap the remaining
# XML in a <root> element and parse it.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first response to a plain text file (label-based lookup instead of
# the positional df.loc[0][2]).
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()
file_list = file.split('\n')
file_list[13] = '<root>'  ### 12 to 13 to overwrite deformed <creditBureau ...>
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]

# Converting to String
str1 = ''.join(new_list)
# The context manager closes the file handle so ET can read it.
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
几个问题:
- 您在 <root> 标记之后留下了一个 xml 声明元素。请删除该声明。
- 声明中带有 "",使其格式变形(不合法)。去掉声明即可正常工作。
- 写入 tocng.xml 之后没有关闭文件句柄,这会导致 ET 读取失败。
如果你想使用我在上一个帖子(previous post)中提供的正则表达式,那么可以试试下面的代码,因为它可以删除 HTTP 头(header),而不需要数行数。随后 <root> 被写入索引 1 的位置。
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Regex version: strip the HTTP header without counting lines, then wrap the
# XML body in a <root> element and parse it.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()

# Drop everything before the <?xml declaration. The \1 backreference keeps the
# XML itself; an empty replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
file_list = file.split('\n')
# After the substitution the text starts at <?xml, so the entry at index 1
# (the line after the declaration) is overwritten with the root tag.
file_list[1] = '<root>'
file_list.append('</root>')
# Keep only the entries from the <root> tag onwards.
start = file_list.index('<root>')
new_list = file_list[start:]

# Converting to String
str1 = ''.join(new_list)
# The context manager closes the file handle so ET can read it.
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
基于新的单行xml
xml 文件中的更多变化可能需要对代码进行调整。
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Single-line xml version: split the document after every '>', drop the xml
# declaration and the <creditBureau ...> opening tag, wrap the rest in <root>.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt") as f:
    file = f.read()

# Replace up to <?xml tag. The \1 backreference keeps the XML itself; an empty
# replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Strip file and add \n at each instance of >.
file = file.strip()
file = file.replace('>', '>\n')

# Split file and make a list with no empty items.
file_list = file.split('\n')
file_list = [item for item in file_list if item != '']

# Remove known xml declarations (guarded so an empty list cannot raise).
if file_list and file_list[0][:5] == '<?xml':
    del file_list[0]
if file_list and file_list[0][:13] == '<creditBureau':
    del file_list[0]

# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')

# Converting to String
str1 = ''.join(file_list)
print(str1)  ## See output in my answer
with open("tocng.xml", "w") as f:
    f.write(str1)

# parsing xml
tree = ET.parse('tocng.xml')
输出:
<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>
编辑:如果需要保留 <creditBureau...> 标签,请删除或注释掉这些行:
if file_list[0][:13] == '<creditBureau':
del file_list[0]
最新发布的 xml 中的初始标签看起来没有格式错误,因此没有为处理它而做任何更改。
xml 字符串的正则表达式自定义
使用正则表达式而不是列表来处理 xml 字符串,因为 xml 可能是多行的,也可能是单行的。
在 customize_xml 函数中,正则表达式的注释标明了各个组号,pattern 的注释标明了各个模式(mode)编号;您可以把模式编号作为参数传递给 customize_xml 函数。
有效的 mode 参数是 None、-1、0、1、2、3、4 之一。
import pandas as pd
import re
def customize_xml(content, mode=0):
    '''Customize the xml tags in content and optionally insert a <root> tag.

    content is searched for an xml declaration immediately followed by a
    <creditBureau ...> element; that span is rewritten according to mode.
    Text that does not match is returned unchanged.

    mode values:
        None, -1  return content unmodified
        0         keep only <creditBureau>...</creditBureau>
        1         wrap <creditBureau>...</creditBureau> in <root>
        2         like 1, keeping the xml declaration in front
        3         replace the <creditBureau> tags with <root> tags
        4         like 3, keeping the xml declaration in front

    Other values are not validated; mode is used directly as an index into
    the tuple of replacement templates.
    '''
    # No modification.
    if mode in (-1, None):
        return content

    # Select a pattern (mode) that modifies the whole xml. The backreferences
    # refer to the groups of the regular expression below.
    pattern = (r'\2\3\4',                 # 0. <cB>...</cB>
               r'<root>\2\3\4</root>',    # 1. <root><cB>...</cB></root>
               r'\1<root>\2\3\4</root>',  # 2. <?xml?><root><cB>...</cB></root>
               r'<root>\3</root>',        # 3. <root>...</root>
               r'\1<root>\3</root>',      # 4. <?xml?><root>...</root>
               )

    # Groups are marked as \1 \2 \3 \4 to use for pattern above.
    content = re.sub(r'(<\?xml.+?\?>)'       # \1 xml declaration
                     r'(<creditBureau.*?>)'  # \2 opening creditBureau tag
                     r'(.+?)'                # \3 everything between the tags
                     r'(</creditBureau>)'    # \4 closing creditBureau tag
                     , pattern[mode], content, flags=re.S)
    return content
import xml.etree.ElementTree as ET
from xml.dom import minidom

# Driver: extract the first response, strip the HTTP header, rewrite the xml
# with customize_xml (mode 3) and parse / pretty-print the result.

# Raw string: '\U' and '\x' are invalid escapes in a normal Python 3 literal.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Label-based lookup instead of the positional df.loc[0][2].
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

with open("testfile.txt") as f:
    file = f.read()

# Remove characters up to <?xml tag. The \1 backreference keeps the XML
# itself; an empty replacement would discard the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Make xml one single line if not already.
file = file.replace('\n', '')

file = customize_xml(file, 3)

# Write customized xml.
with open("tocng.xml", "w") as f:
    f.write(file)

# Parsing xml.
tree = ET.parse('tocng.xml')

# Print pretty xml from xml string.
pretty_xml = minidom.parseString(file).toprettyxml(indent=" ")
print(pretty_xml)
在最后添加了美化打印(pretty print)。这一步是可选的,仅用于查看结果。