如果结果包含 none 或多个结果,如何将 None 附加到列表
How to append None to list if result contains none or multiple results
我有一个包含 URLs 的 csv,其中包含我需要提取的数据。
有时,URL 包含零个或多个结果,如果是这种情况,我想在列表中附加一个 None。
这是代码:
import os
import glob
import time
from urllib.request import urlopen
from numpy import full
import pandas as pd
import xml.etree.ElementTree as ET

# Walk the extract folder and pull the author (MARC datafield 100$a) out of
# each record URL listed in the first matching CSV.
count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []
        title = []
        isbn_clean = []
        for row in csvfile['URL']:
            # print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",   # author
                        "ind1": "1",
                        "ind2": " ",
                    }
                ]
                # datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        # Keep the node only if it matches at least one filter dict completely.
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                # for/else: no attribute mismatched, so this filter matched.
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        clean_aut.append(subfield_node.text)  # this gets the author name and title
        origdata = pd.DataFrame({'Author': clean_aut})
        print(clean_aut)
        print(origdata)
    count += 1
这是包含 URL 的列表文件:Pastebin
我该怎么做?
您应该在发布之前清理您的代码。最好坚持 minimal reproducible example。在您的情况下,您可以删除外部循环,以便我们可以专注于导致问题的部分。
关于您的代码:您应该在循环之前设置循环期间未修改的变量。此外,可以使用 any 检查属性,这使您的代码更轻便。为了在找不到作者时将 None 添加到您的列表中,您可以使用一个布尔标志:
# Loop-invariant configuration hoisted out of the URL loop.
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}
datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",   # author
        "ind1": "1",
        "ind2": " ",
    }
]

clean_aut = []
for row in csvfile['URL']:
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()
        # Flag stays True until at least one author subfield is appended,
        # so URLs with no matching datafield contribute a single None.
        no_aut = True
        for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
            if any(datafield_node.get(k) != v
                   for attr_dict in datafield_attribute_filters
                   for k, v in attr_dict.items()):
                continue
            for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                clean_aut.append(subfield_node.text)  # this gets the author name
                no_aut = False
        if no_aut:
            clean_aut.append(None)

origdata = pd.DataFrame({'Author': clean_aut})
print(clean_aut)
print(origdata)
输出:
[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']
Author
0 None
1 Bergren, Lisa Tawn
2 Rahlwes, Ann-Kathrin
3 Ortner, Helmut
4 Ladwig-Winters, Simone
5 Huonker, Thomas
6 Ritter, Karl-Markus
7 Kerkeling, Hape
8 Rohls, Jan
9 Rohls, Jan
10 Rohls, Jan
11 James, Bethan
12 None
13 Schmidt, Horst
我有一个包含 URLs 的 csv,其中包含我需要提取的数据。
有时,URL 包含零个或多个结果,如果是这种情况,我想在列表中附加一个 None。
这是代码:
import os
import glob
import time
from urllib.request import urlopen
from numpy import full
import pandas as pd
import xml.etree.ElementTree as ET

# Walk the extract folder and pull the author (MARC datafield 100$a) out of
# each record URL listed in the first matching CSV.
count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []
        title = []
        isbn_clean = []
        for row in csvfile['URL']:
            # print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",   # author
                        "ind1": "1",
                        "ind2": " ",
                    }
                ]
                # datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        # Keep the node only if it matches at least one filter dict completely.
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                # for/else: no attribute mismatched, so this filter matched.
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        clean_aut.append(subfield_node.text)  # this gets the author name and title
        origdata = pd.DataFrame({'Author': clean_aut})
        print(clean_aut)
        print(origdata)
    count += 1
这是包含 URL 的列表文件:Pastebin
我该怎么做?
您应该在发布之前清理您的代码。最好坚持 minimal reproducible example。在您的情况下,您可以删除外部循环,以便我们可以专注于导致问题的部分。
关于您的代码:您应该在循环之前设置循环期间未修改的变量。此外,可以使用 any 检查属性,这使您的代码更轻便。为了在找不到作者时将 None 添加到您的列表中,您可以使用一个布尔标志:
# Loop-invariant configuration hoisted out of the URL loop.
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}
datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",   # author
        "ind1": "1",
        "ind2": " ",
    }
]

clean_aut = []
for row in csvfile['URL']:
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()
        # Flag stays True until at least one author subfield is appended,
        # so URLs with no matching datafield contribute a single None.
        no_aut = True
        for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
            if any(datafield_node.get(k) != v
                   for attr_dict in datafield_attribute_filters
                   for k, v in attr_dict.items()):
                continue
            for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                clean_aut.append(subfield_node.text)  # this gets the author name
                no_aut = False
        if no_aut:
            clean_aut.append(None)

origdata = pd.DataFrame({'Author': clean_aut})
print(clean_aut)
print(origdata)
输出:
[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']
Author
0 None
1 Bergren, Lisa Tawn
2 Rahlwes, Ann-Kathrin
3 Ortner, Helmut
4 Ladwig-Winters, Simone
5 Huonker, Thomas
6 Ritter, Karl-Markus
7 Kerkeling, Hape
8 Rohls, Jan
9 Rohls, Jan
10 Rohls, Jan
11 James, Bethan
12 None
13 Schmidt, Horst