如何根据其他解析条件解析XML?
How to parse XML depending on other parsing condition?
我想从 XML 中提取数据,前提是在 XML 中填写了特定字段。
我要提取的元素是:
- 城市:标签 110 代码 c(例如柏林)
- 图书馆代码:标签 110 代码 g(例如 D-Bbbf)
- 国家代码:标签 043 代码 c(例如 XA-DE)
这是 XML 的一部分:
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record><marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
仅当图书馆代码字段(标签 110 代码 g)已填写时,我才想同时提取城市和国家代码。
这是我目前写的代码:
# Asker's original attempt, reproduced as posted.
# NOTE(review): indentation was lost in this copy, so the intended nesting of
# the loops below cannot be determined from here.
data = []
# Read the XML file
with open('oefen.xml', 'r', encoding="utf8") as f_in:
# NOTE(review): the document is parsed with 'html.parser' here.
soup = BeautifulSoup(f_in.read(), 'html.parser')
# Searches the whole document for elements whose tag attribute is "110";
# it does not iterate record by record.
for record in soup.find_all(tag="110"):
data.append({
'City' : e.get_text(strip=True) if (e := record.select_one('[code="c"]')) else None, # select city
'Code' : e.get_text(strip=True) if (e := record.select_one('[code="g"]')) else None # select code
})
# NOTE(review): this iterates over the characters of the string literal 'Code'
# ('C', 'o', 'd', 'e'), not over the value extracted above.
for part in 'Code':
# Each character is a str, so this comparison is always true.
if part != None:
# Searches the whole document again (soup), not the current record.
for record in soup.find_all(tag="043"):
data.append({
'City Code' : e.get_text(strip=True) if (e := record.select_one('[code="c"]')) else None, # select city code
})
pd.DataFrame(data)
先遍历各条记录,然后在每条记录中查找 110 和 043 元素。
可以尝试如下写法:
from bs4 import BeautifulSoup
import pandas as pd
xml = '''<marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim">
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record>
<marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
</marc:collection>
'''
# Collect one dict per record; a key is only present when its subfield exists.
data = []

# Read XML from string
soup = BeautifulSoup(xml, 'xml')

# uncomment below to read from XML file
# with open('oefen.xml', 'r', encoding="utf8") as f_in:
#     soup = BeautifulSoup(f_in.read(), 'xml')

for record in soup.find_all('marc:record'):
    rec = {}
    # datafield 110: subfield c -> 'City', subfield g -> 'Code'
    if code := record.find(tag="110"):
        if e := code.select_one('[code="c"]'):
            rec['City'] = e.get_text(strip=True)
        if e := code.select_one('[code="g"]'):
            rec['Code'] = e.get_text(strip=True)
    # datafield 043: subfield c -> 'City Code'
    if code := record.find(tag="043"):
        if e := code.select_one('[code="c"]'):
            rec['City Code'] = e.get_text(strip=True)
    if rec:
        # only add if have at least one field
        data.append(rec)

# Keys missing from a record show up as NaN in the resulting frame.
df = pd.DataFrame(data)
print(df)
输出:
     City City Code    Code
0  Berlin     XA-DE     NaN
1  London     XA-GB  GB-Lna
如果不想让缺失值显示为 NaN,可以添加一行 `df = df.fillna('')`。
这与您上一个问题几乎相同——只需遍历各条记录,并把新字段加入处理流程即可。使用 `xml` 解析器时,您不必关心命名空间(namespaces):
# Build one row per record; a field whose subfield is absent is stored as None.
for record in soup.select('record'):
    data.append({
        'City': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="c"]')) else None,
        'Library Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="g"]')) else None,
        'Country Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="043"] [code="c"]')) else None,
    })
例子
xml='''
<root>
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record>
<marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
</root>
'''
from bs4 import BeautifulSoup
import pandas as pd
data = []
soup = BeautifulSoup(xml, 'xml')

# Build one row per record; a field whose subfield is absent is stored as None.
for record in soup.select('record'):
    data.append({
        'City': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="c"]')) else None,
        'Library Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="g"]')) else None,
        'Country Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="043"] [code="c"]')) else None,
    })

# Bare expression: the frame is only displayed in a REPL/notebook;
# wrap it in print(...) when running as a script.
pd.DataFrame(data)
输出
| City | Library Code | Country Code |
|---|---|---|
| Berlin | None | XA-DE |
| London | GB-Lna | XA-GB |
我想从 XML 中提取数据,前提是在 XML 中填写了特定字段。
我要提取的元素是:
- 城市:标签 110 代码 c(例如柏林)
- 图书馆代码:标签 110 代码 g(例如 D-Bbbf)
- 国家代码:标签 043 代码 c(例如 XA-DE)
这是 XML 的一部分:
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record><marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
仅当图书馆代码字段(标签 110 代码 g)已填写时,我才想同时提取城市和国家代码。
这是我目前写的代码:
# Asker's original attempt, reproduced as posted.
# NOTE(review): indentation was lost in this copy, so the intended nesting of
# the loops below cannot be determined from here.
data = []
# Read the XML file
with open('oefen.xml', 'r', encoding="utf8") as f_in:
# NOTE(review): the document is parsed with 'html.parser' here.
soup = BeautifulSoup(f_in.read(), 'html.parser')
# Searches the whole document for elements whose tag attribute is "110";
# it does not iterate record by record.
for record in soup.find_all(tag="110"):
data.append({
'City' : e.get_text(strip=True) if (e := record.select_one('[code="c"]')) else None, # select city
'Code' : e.get_text(strip=True) if (e := record.select_one('[code="g"]')) else None # select code
})
# NOTE(review): this iterates over the characters of the string literal 'Code'
# ('C', 'o', 'd', 'e'), not over the value extracted above.
for part in 'Code':
# Each character is a str, so this comparison is always true.
if part != None:
# Searches the whole document again (soup), not the current record.
for record in soup.find_all(tag="043"):
data.append({
'City Code' : e.get_text(strip=True) if (e := record.select_one('[code="c"]')) else None, # select city code
})
pd.DataFrame(data)
先遍历各条记录,然后在每条记录中查找 110 和 043 元素。
可以尝试如下写法:
from bs4 import BeautifulSoup
import pandas as pd
xml = '''<marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim">
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record>
<marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
</marc:collection>
'''
# Collect one dict per record; a key is only present when its subfield exists.
data = []

# Read XML from string
soup = BeautifulSoup(xml, 'xml')

# uncomment below to read from XML file
# with open('oefen.xml', 'r', encoding="utf8") as f_in:
#     soup = BeautifulSoup(f_in.read(), 'xml')

for record in soup.find_all('marc:record'):
    rec = {}
    # datafield 110: subfield c -> 'City', subfield g -> 'Code'
    if code := record.find(tag="110"):
        if e := code.select_one('[code="c"]'):
            rec['City'] = e.get_text(strip=True)
        if e := code.select_one('[code="g"]'):
            rec['Code'] = e.get_text(strip=True)
    # datafield 043: subfield c -> 'City Code'
    if code := record.find(tag="043"):
        if e := code.select_one('[code="c"]'):
            rec['City Code'] = e.get_text(strip=True)
    if rec:
        # only add if have at least one field
        data.append(rec)

# Keys missing from a record show up as NaN in the resulting frame.
df = pd.DataFrame(data)
print(df)
输出:
     City City Code    Code
0  Berlin     XA-DE     NaN
1  London     XA-GB  GB-Lna
如果不想让缺失值显示为 NaN,可以添加一行 `df = df.fillna('')`。
这与您上一个问题几乎相同——只需遍历各条记录,并把新字段加入处理流程即可。使用 `xml` 解析器时,您不必关心命名空间(namespaces):
# Build one row per record; a field whose subfield is absent is stored as None.
for record in soup.select('record'):
    data.append({
        'City': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="c"]')) else None,
        'Library Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="g"]')) else None,
        'Country Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="043"] [code="c"]')) else None,
    })
例子
xml='''
<root>
<marc:record>
<marc:controlfield tag="001">39612</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-DE</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">Bibliothek für Bildungsgeschichtliche Forschung</marc:subfield>
<marc:subfield code="c">Berlin</marc:subfield>
</marc:datafield>
</marc:record>
<marc:record>
<marc:controlfield tag="001">30006648</marc:controlfield>
<marc:controlfield tag="003">DE-633</marc:controlfield>
<marc:controlfield tag="005">20161109000000.0</marc:controlfield>
<marc:controlfield tag="008">161109n|||||||a||| a</marc:controlfield>
<marc:datafield tag="043" ind1=" " ind2=" ">
<marc:subfield code="c">XA-GB</marc:subfield>
</marc:datafield>
<marc:datafield tag="110" ind1="2" ind2=" ">
<marc:subfield code="a">The National Archives</marc:subfield>
<marc:subfield code="c">London</marc:subfield>
<marc:subfield code="g">GB-Lna</marc:subfield>
</marc:datafield>
</marc:record>
</root>
'''
from bs4 import BeautifulSoup
import pandas as pd
data = []
soup = BeautifulSoup(xml, 'xml')

# Build one row per record; a field whose subfield is absent is stored as None.
for record in soup.select('record'):
    data.append({
        'City': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="c"]')) else None,
        'Library Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="110"] [code="g"]')) else None,
        'Country Code': e.get_text(strip=True) if (e := record.select_one('datafield[tag="043"] [code="c"]')) else None,
    })

# Bare expression: the frame is only displayed in a REPL/notebook;
# wrap it in print(...) when running as a script.
pd.DataFrame(data)
输出
City | Library Code | Country Code |
---|---|---|
Berlin | None | XA-DE |
London | GB-Lna | XA-GB |