bs4 非常奇怪的行为

Very strange behavior of bs4

我正在尝试用音频元数据解析我的 html-doc,我已经编写了函数:

from bs4 import *
import warnings
import re
import sys

warnings.filterwarnings('ignore')

def pairing(soup,
            author_class: tuple,
            track_class: tuple,
            subclass: tuple) -> dict:
    
    use_dict = {}

    for pair in zip(
            soup.find_all(author_class[0], {"class": author_class[1]}),
            soup.find_all(track_class[0], {"class": track_class[1]}),
            soup.find_all(subclass[0], {"class": subclass[1]})
    ):
        """
        pair[0] - musician(-s), 
        pair[1] - track_name, 
        pair[2] - subtitle for track(if any)
        """
        print(pair[0],"\n", pair[1], '\n', pair[2])

        track_author = pair[0].find('a').text
        # Clear row
        track_author = re.sub('[!@#$_]', '', track_author)

        try:
            add_meta = pair[2].find('span').text
        except AttributeError:
            print(sys.exc_info())
            add_meta = ""

        track_name = pair[1].text + add_meta
        # Clear
        track_name = re.sub('[!@#$_]', '', track_name)

        use_dict.update({track_author: track_name})
    return use_dict

问题出在附近的 try-except contrusion:当我要求 track_author - 没问题,一切正常。请注意,并非所有曲目都有字幕,因此有时此标签 (span) 为空 (None)。但是我的代码认为它在任何迭代中都是空的。这很奇怪和令人困惑,因为上面带有 track_author 的代码看起来一样并且有效。我需要 return 子类中的文本(如果有),否则 return 空行。
终端输出:

<div class="audio_row__performers"><a>John Paesano</a></div> 
 <span class="audio_row__title_inner _audio_row__title_inner">The Final Lesson</span> 
 <span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle"></span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d283f80>)

<div class="audio_row__performers"><a>The Blue Notes</a></div> 
 <span class="audio_row__title_inner _audio_row__title_inner">Halo Theme</span> 
 <span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle">Piano Rendition</span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d267ec0>)

谁能给我解释一下?

所以,我决定采用另一种方式 - 正则表达式。

def pairing(soup,
            author_class: tuple,
            track_class: tuple,
            subclass: tuple) -> dict:

    use_dict = {}

    for pair in zip(
            soup.find_all(author_class[0], {"class": author_class[1]}),
            soup.find_all(track_class[0], {"class": track_class[1]}),
            soup.find_all(subclass[0], {"class": subclass[1]})
    ):
        """
        pair[0] - musician(-s), 
        pair[1] - track_name, 
        pair[2] - subtitle for track(if any)
        """

        track_author = pair[0].find('a').text
        # Clear row
        track_author = sub('[!@#$_]', '', track_author)

        pair_2_str = str(pair[2])

        regex = "(?<=>).*?(?=<)"
        add_meta = findall(regex, pair_2_str)[0]

        track_name = pair[1].text + f" {add_meta}"
        # Clear
        track_name = sub('[!@#$_]', '', track_name)

        use_dict.update({track_author: track_name})
    return use_dict