如果结果为空或包含多个结果，如何向列表中添加 None

How to append None to list if result contains none or multiple results

我有一个包含若干 URL 的 CSV 文件，这些 URL 指向我需要提取的数据。有时某个 URL 不返回任何结果，或者返回多个结果；遇到这种情况时，我想向列表中添加一个 None。

这是代码:

import os
import glob
import time
from urllib.request import urlopen
from numpy import full
import pandas as pd
import xml.etree.ElementTree as ET
# Counter used as a guard: it is incremented at the end of the first loop
# iteration so that only the first matching file is processed.
count=0
files=glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv',recursive=True) #collects the CSV files matching the pattern
print(files)

for file in files:
    if count==0: #to only go through the first file, instead of all files in the folder
        # The file is read as tab-separated text; a 'URL' column is required below.
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []  # collected subfield texts (author names with the current filter)
        title = []  # not used further in this snippet
        isbn_clean = []  # not used further in this snippet
        for row in csvfile['URL']:
            #print('row: ' + row)
            
            # Fetch the URL and parse the response body as an XML tree.
            with urlopen(str(row)) as response:
                doc = ET.parse(response)  
                root = doc.getroot()
                # Prefix -> namespace URI mapping used by the XPath lookups below.
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
            "zs": "http://www.loc.gov/zing/srw/",
            "": "http://www.loc.gov/MARC21/slim",
                }
            datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
            # Each dict is one filter: a datafield is kept when all of the
            # attributes of at least one dict match.
            datafield_attribute_filters = [ #which fields to extract
            {
            "tag": "100", #author
            "ind1": "1",
            "ind2": " ",
            }]
            #datafield_attribute_filters = []  # Uncomment this line to clear filters (and process each datafield node)
            
            for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                if datafield_attribute_filters:
                    # Assume the node is skipped until one filter matches completely.
                    skip_node = True
                    for attr_dict in datafield_attribute_filters:
                        for k, v in attr_dict.items():
                            if datafield_node.get(k) != v:
                                # First mismatching attribute: this filter fails.
                                break
                        else:
                            # Inner loop finished without break: every attribute matched.
                            skip_node = False
                            break
                    if skip_node:
                        continue
                for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                    # Appends the text of every code 'a' subfield of a matching datafield.
                    # NOTE: nothing is appended when no datafield matches, so the list
                    # can end up with fewer entries than there are URLs.
                    clean_aut.append(subfield_node.text) #this gets the author name (only tag 100 is filtered)
        origdata=pd.DataFrame({'Author':clean_aut})                      
                    
        print(clean_aut)
        print(origdata)
        count+=1

这是包含 URL 的列表文件:Pastebin

我该怎么做?

您应该在发布之前先整理一下代码，最好只提供一个最小可复现示例（minimal reproducible example）。就您的情况而言，可以去掉外层循环，这样我们就能专注于导致问题的那部分代码。

关于您的代码：在循环过程中不会改变的变量，应该放到循环之前定义。此外，可以使用 any 来检查属性，这会让代码更简洁。为了在找不到作者时向列表中添加 None，可以使用一个布尔标志变量：

# Prefix -> namespace URI mapping used by the XPath lookups below.
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}

datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
# Each dict is one filter: a datafield is kept when all of the attributes of
# at least one dict match. An empty list keeps every datafield.
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",  # author
        "ind1": "1",
        "ind2": " ",
    },
]

clean_aut = []
for row in csvfile['URL']:
    # Only downloading/parsing needs the open response; the parsed tree is
    # used after the connection has been closed.
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()

    no_aut = True  # stays True until an author subfield is found for this URL
    for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
        # Skip the node unless it satisfies every attribute of at least one
        # filter. Flattening all filters into a single any(... != ...) would
        # require a node to match *all* filters at once, which breaks as soon
        # as a second filter (e.g. another tag) is added.
        if datafield_attribute_filters and not any(
            all(datafield_node.get(k) == v for k, v in attr_dict.items())
            for attr_dict in datafield_attribute_filters
        ):
            continue
        for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
            clean_aut.append(subfield_node.text)  # this gets the author name
            no_aut = False
    if no_aut:
        # Placeholder so that a URL without any author still yields an entry.
        clean_aut.append(None)
    # NOTE: a URL with several matching subfields contributes one entry per
    # subfield, so the list can be longer than the number of URLs.

origdata = pd.DataFrame({'Author': clean_aut})

print(clean_aut)
print(origdata)

输出:

[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']

                    Author
0                     None
1       Bergren, Lisa Tawn
2     Rahlwes, Ann-Kathrin
3           Ortner, Helmut
4   Ladwig-Winters, Simone
5          Huonker, Thomas
6      Ritter, Karl-Markus
7          Kerkeling, Hape
8               Rohls, Jan
9               Rohls, Jan
10              Rohls, Jan
11           James, Bethan
12                    None
13          Schmidt, Horst