XML 解析 Python 3

XML parsing in Python 3

我有一些像这样的 xml 文件,它们保存在下面这个 DataFrame 的一列(Series)中:

 userid | fid | response
 -----------------------
 1125 | 58940 | xml1
 3344 | 47839 | xml2
 3455 | 12335 | xml3

response 列中的每一项都是像下面这样的 xml 文件:

HTTP/1.1 100 Continue

HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT

<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>

这只是整个文档的一部分。我必须解析这个大 xml 并将其转换为 json。我遇到的第一个问题是解析这个文件。我当前的代码如下所示:

 import pandas as pd
 import re

 # Load the pipe-separated CSV and keep the three columns listed below.
 raw_data = pd.read_csv('C:\Users\Desktop\xml.csv', sep = '|')
 df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
 # Write row 0, position 2 (the 'response' column in the list above) to a
 # scratch text file.
 file = open("testfile.txt", "w")
 file.write(df.loc[0][2])
 file.close()

 #Adding Root Element
 with open("testfile.txt") as f:
     file = f.read()
     file_list = file.split('\n')
 # Overwrite the line at index 12 with <root>, append the closing tag, and
 # keep only the lines from <root> onwards.
 file_list[12] = '<root>'
 file_list.append('</root>')
 start = file_list.index('<root>')
 new_list = file_list[start:]

 #Converting to String
 str1 = ''.join(new_list)
 # NOTE: this handle is not closed before ET.parse() reads the file below.
 f = open("tocng.xml","w")
 f.write(str1)

 #parsing xml
 import xml.etree.ElementTree as ET
 tree = ET.parse('tocng.xml')
 ### Gives an error:XML or text declaration not at start of entity: line 1, column 6

我不明白这里有什么问题。

初始版本

import pandas as pd
import re

# Load the pipe-separated CSV and keep only the columns of interest.
# The path must be a raw string: in a regular Python 3 literal "\U" starts
# a unicode escape, so 'C:\Users...' is a SyntaxError.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first row's response to a scratch file.
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

#Adding Root Element
with open("testfile.txt") as f:
    file_list = f.read().split('\n')
file_list[13] = '<root>' ### 12 to 13 to overwrite deformed <creditBureau ...>
file_list.append('</root>')
# Keep only the lines from <root> onwards (drops the headers before it).
start = file_list.index('<root>')
new_list = file_list[start:]

#Converting to String
str1 = ''.join(new_list)
### the with-block closes the file handle so ET can read it
with open("tocng.xml", "w") as f:
    f.write(str1)

#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

几个问题:

  1. 您在 <root> 标记之后留下了一个声明元素。请删除该声明。
  2. 声明中含有 "",导致其格式错误。去掉声明后即可正常工作。
  3. 写入 tocng.xml 后没有关闭文件句柄,导致 ET 读取失败。

如果你想使用我在上一篇帖子(previous post)中提供的正则表达式,可以试试下面的代码:它不依赖任何行号就能删除 header,此时 <root> 固定写入索引 1 的位置。

import pandas as pd
import re

# Load the pipe-separated CSV and keep only the columns of interest.
# The path must be a raw string: in a regular Python 3 literal "\U" starts
# a unicode escape, so 'C:\Users...' is a SyntaxError.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first row's response to a scratch file.
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

#Adding Root Element
with open("testfile.txt") as f:
    file = f.read()
    # Keep only the text from the <?xml declaration onwards. The replacement
    # must be the captured group (\1); an empty replacement would erase the
    # whole document.
    file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
    file_list = file.split('\n')
# Index 0 is now the <?xml ...?> line, so <root> always goes at index 1.
file_list[1] = '<root>'
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]

#Converting to String
str1 = ''.join(new_list)
### the with-block closes the file handle so ET can read it
with open("tocng.xml", "w") as f:
    f.write(str1)

#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

基于新的单行xml

xml 文件中的更多变化可能需要对代码进行调整。

import pandas as pd
import re

# Load the pipe-separated CSV and keep only the columns of interest.
# The path must be a raw string: in a regular Python 3 literal "\U" starts
# a unicode escape, so 'C:\Users...' is a SyntaxError.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first row's response to a scratch file.
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

#Adding Root Element
with open("testfile.txt") as f:
    file = f.read()
    # Replace up to <?xml tag. The replacement must be the captured group
    # (\1); an empty replacement would erase the whole document.
    file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
    # Strip file and add \n at each instance of >.
    file = file.strip()
    file = file.replace('>', '>\n')
    # Split file and make a list with no empty items.
    file_list = file.split('\n')
    file_list = [item for item in file_list if item != '']
    # Remove known xml declarations.
    if file_list[0][:5] == '<?xml':
        del file_list[0]
    if file_list[0][:13] == '<creditBureau':
        del file_list[0]
    # Add root tags.
    file_list.insert(0, '<root>')
    file_list.append('</root>')

#Converting to String
str1 = ''.join(file_list)
print(str1) ## See output in my answer
with open("tocng.xml","w") as f:
    f.write(str1)

#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

输出:

<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>

编辑:如果需要保留 <creditBureau...>,请删除或注释掉这些行:

    if file_list[0][:13] == '<creditBureau':
        del file_list[0]

最新发布的 xml 中,起始标签看起来没有格式错误,因此没有做任何修改来处理它。


xml 字符串的正则表达式自定义

使用正则表达式而不是列表来处理 xml 字符串,因为 xml 既可能是多行的,也可能是单行的。

在 customize_xml 函数中,正则表达式旁的注释标出了分组编号,pattern 旁的注释标出了模式(mode)编号;您可以把模式编号作为参数传递给 customize_xml 函数。有效的 mode 参数是 None、-1、0、1、2、3、4 之一。

import pandas as pd
import re


def customize_xml(content, mode=0):
    """Rewrite the tags that wrap a creditBureau xml document.

    The regex splits ``content`` into four groups: the <?xml ...?>
    declaration, the opening <creditBureau ...> tag, the inner body and
    the closing </creditBureau> tag. ``mode`` selects which groups are
    kept and whether a <root> element is wrapped around them.

    Args:
        content: xml text in which the declaration is immediately followed
            by the <creditBureau ...> element (no whitespace in between).
        mode: None or -1 returns ``content`` untouched; 0 to 4 index the
            replacement templates listed in ``pattern`` below.

    Returns:
        The rewritten string, or ``content`` unchanged when the regex does
        not match.
    """

    # No modification.
    if mode in (-1, None):
        return content

    # Select a pattern (mode) that modifies the whole xml.
    # The backreferences are what carry the document through; without them
    # every mode would collapse the xml to '' or '<root></root>'.
    pattern = (r'\2\3\4',                # 0. <cB>...</cB>
               r'<root>\2\3\4</root>',   # 1. <root><cB>...</cB></root>
               r'\1<root>\2\3\4</root>', # 2. <?xml?><root><cB>...</cB></root>
               r'<root>\3</root>',       # 3. <root>...</root>
               r'\1<root>\3</root>',     # 4. <?xml?><root>...</root>
               )

    # Groups are marked as \1 \2 \3 \4 to use for pattern above.
    content = re.sub(r'(<\?xml.+?\?>)'       # \1
                     r'(<creditBureau.*?>)'  # \2
                     r'(.+?)'                # \3
                     r'(</creditBureau>)'    # \4
                     , pattern[mode], content, flags=re.S)

    return content


# Load the pipe-separated CSV and keep only the columns of interest.
# The path must be a raw string: in a regular Python 3 literal "\U" starts
# a unicode escape, so 'C:\Users...' is a SyntaxError.
raw_data = pd.read_csv(r'C:\Users\Desktop\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the first row's response to a scratch file.
with open("testfile.txt", "w") as f:
    f.write(df.loc[0, 'response'])

with open("testfile.txt") as f:
    file = f.read()

# Remove characters up to <?xml tag. The replacement must be the captured
# group (\1); an empty replacement would erase the whole document.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Make xml one single line if not already.
file = file.replace('\n', '')

# Mode 3: keep only the creditBureau body and wrap it in <root>.
file = customize_xml(file, 3)

# Write customized xml.
with open("tocng.xml", "w") as f:
    f.write(file)

# Parsing xml.
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

# Print pretty xml from xml string.
from xml.dom import minidom
pretty_xml = minidom.parseString(file).toprettyxml(indent="    ")
print(pretty_xml)

在最后添加了美化打印(pretty print)。这一步是可选的,仅用于查看结果。