如果结果包含 none 或多个结果,如何将 None 附加到列表
How to append None to list if result contains none or multiple results
我有一个包含 URLs 的 csv,其中包含我需要提取的数据。
有时,URL 包含零个或多个结果,如果是这种情况,我想在列表中附加一个 None。
这是代码:
import os
import glob
import time
from urllib.request import urlopen
from numpy import full
import pandas as pd
import xml.etree.ElementTree as ET

# Walk the extract folder and pull the author (MARC datafield 100$a) out of
# each record URL listed in the first matching CSV.
count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []
        title = []
        isbn_clean = []
        for row in csvfile['URL']:
            # print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",   # author
                        "ind1": "1",
                        "ind2": " ",
                    }
                ]
                # datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        # Keep the node only if it matches at least one filter dict completely.
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                # for/else: no attribute mismatched, so this filter matched.
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        clean_aut.append(subfield_node.text)  # this gets the author name and title
        origdata = pd.DataFrame({'Author': clean_aut})
        print(clean_aut)
        print(origdata)
    count += 1
这是包含 URL 的列表文件:Pastebin
我该怎么做?
您应该在发布之前清理您的代码。最好坚持 minimal reproducible example。在您的情况下,您可以删除外部循环,以便我们可以专注于导致问题的部分。
关于您的代码:您应该在循环之前设置循环期间未修改的变量。此外,可以使用 any 检查属性,这使您的代码更轻便。为了在找不到作者时将 None 添加到您的列表中,您可以使用一个布尔标志:
# Loop-invariant configuration hoisted out of the URL loop.
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}
datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",   # author
        "ind1": "1",
        "ind2": " ",
    }
]

clean_aut = []
for row in csvfile['URL']:
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()
        # Flag stays True until at least one author subfield is appended,
        # so URLs with no matching datafield contribute a single None.
        no_aut = True
        for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
            if any(datafield_node.get(k) != v
                   for attr_dict in datafield_attribute_filters
                   for k, v in attr_dict.items()):
                continue
            for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                clean_aut.append(subfield_node.text)  # this gets the author name
                no_aut = False
        if no_aut:
            clean_aut.append(None)

origdata = pd.DataFrame({'Author': clean_aut})
print(clean_aut)
print(origdata)
输出:
[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']
Author
0 None
1 Bergren, Lisa Tawn
2 Rahlwes, Ann-Kathrin
3 Ortner, Helmut
4 Ladwig-Winters, Simone
5 Huonker, Thomas
6 Ritter, Karl-Markus
7 Kerkeling, Hape
8 Rohls, Jan
9 Rohls, Jan
10 Rohls, Jan
11 James, Bethan
12 None
13 Schmidt, Horst
我有一个包含 URLs 的 csv,其中包含我需要提取的数据。
有时,URL 包含零个或多个结果,如果是这种情况,我想在列表中附加一个 None。
这是代码:
import os
import glob
import time
from urllib.request import urlopen
from numpy import full
import pandas as pd
import xml.etree.ElementTree as ET

# Walk the extract folder and pull the author (MARC datafield 100$a) out of
# each record URL listed in the first matching CSV.
count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []
        title = []
        isbn_clean = []
        for row in csvfile['URL']:
            # print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",   # author
                        "ind1": "1",
                        "ind2": " ",
                    }
                ]
                # datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        # Keep the node only if it matches at least one filter dict completely.
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                # for/else: no attribute mismatched, so this filter matched.
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        clean_aut.append(subfield_node.text)  # this gets the author name and title
        origdata = pd.DataFrame({'Author': clean_aut})
        print(clean_aut)
        print(origdata)
    count += 1
这是包含 URL 的列表文件:Pastebin
我该怎么做?
您应该在发布之前清理您的代码。最好坚持 minimal reproducible example。在您的情况下,您可以删除外部循环,以便我们可以专注于导致问题的部分。
关于您的代码:您应该在循环之前设置循环期间未修改的变量。此外,可以使用 any 检查属性,这使您的代码更轻便。为了在找不到作者时将 None 添加到您的列表中,您可以使用一个布尔标志:
# Loop-invariant configuration hoisted out of the URL loop.
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}
datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",   # author
        "ind1": "1",
        "ind2": " ",
    }
]

clean_aut = []
for row in csvfile['URL']:
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()
        # Flag stays True until at least one author subfield is appended,
        # so URLs with no matching datafield contribute a single None.
        no_aut = True
        for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
            if any(datafield_node.get(k) != v
                   for attr_dict in datafield_attribute_filters
                   for k, v in attr_dict.items()):
                continue
            for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                clean_aut.append(subfield_node.text)  # this gets the author name
                no_aut = False
        if no_aut:
            clean_aut.append(None)

origdata = pd.DataFrame({'Author': clean_aut})
print(clean_aut)
print(origdata)
输出:
[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']
Author
0 None
1 Bergren, Lisa Tawn
2 Rahlwes, Ann-Kathrin
3 Ortner, Helmut
4 Ladwig-Winters, Simone
5 Huonker, Thomas
6 Ritter, Karl-Markus
7 Kerkeling, Hape
8 Rohls, Jan
9 Rohls, Jan
10 Rohls, Jan
11 James, Bethan
12 None
13 Schmidt, Horst