bs4 非常奇怪的行为
Very strange behavior of bs4
我正在尝试用音频元数据解析我的 html-doc,我已经编写了函数:
from bs4 import *
import warnings
import re
import sys
warnings.filterwarnings('ignore')
def pairing(soup,
author_class: tuple,
track_class: tuple,
subclass: tuple) -> dict:
use_dict = {}
for pair in zip(
soup.find_all(author_class[0], {"class": author_class[1]}),
soup.find_all(track_class[0], {"class": track_class[1]}),
soup.find_all(subclass[0], {"class": subclass[1]})
):
"""
pair[0] - musician(-s),
pair[1] - track_name,
pair[2] - subtitle for track(if any)
"""
print(pair[0],"\n", pair[1], '\n', pair[2])
track_author = pair[0].find('a').text
# Clear row
track_author = re.sub('[!@#$_]', '', track_author)
try:
add_meta = pair[2].find('span').text
except AttributeError:
print(sys.exc_info())
add_meta = ""
track_name = pair[1].text + add_meta
# Clear
track_name = re.sub('[!@#$_]', '', track_name)
use_dict.update({track_author: track_name})
return use_dict
问题出在附近的 try-except contrusion:当我要求 track_author - 没问题,一切正常。请注意,并非所有曲目都有字幕,因此有时此标签 (span) 为空 (None)。但是我的代码认为它在任何迭代中都是空的。这很奇怪和令人困惑,因为上面带有 track_author 的代码看起来一样并且有效。我需要 return 子类中的文本(如果有),否则 return 空行。
终端输出:
<div class="audio_row__performers"><a>John Paesano</a></div>
<span class="audio_row__title_inner _audio_row__title_inner">The Final Lesson</span>
<span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle"></span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d283f80>)
<div class="audio_row__performers"><a>The Blue Notes</a></div>
<span class="audio_row__title_inner _audio_row__title_inner">Halo Theme</span>
<span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle">Piano Rendition</span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d267ec0>)
谁能给我解释一下?
所以,我决定采用另一种方式 - 正则表达式。
def pairing(soup,
author_class: tuple,
track_class: tuple,
subclass: tuple) -> dict:
use_dict = {}
for pair in zip(
soup.find_all(author_class[0], {"class": author_class[1]}),
soup.find_all(track_class[0], {"class": track_class[1]}),
soup.find_all(subclass[0], {"class": subclass[1]})
):
"""
pair[0] - musician(-s),
pair[1] - track_name,
pair[2] - subtitle for track(if any)
"""
track_author = pair[0].find('a').text
# Clear row
track_author = sub('[!@#$_]', '', track_author)
pair_2_str = str(pair[2])
regex = "(?<=>).*?(?=<)"
add_meta = findall(regex, pair_2_str)[0]
track_name = pair[1].text + f" {add_meta}"
# Clear
track_name = sub('[!@#$_]', '', track_name)
use_dict.update({track_author: track_name})
return use_dict
我正在尝试用音频元数据解析我的 html-doc,我已经编写了函数:
from bs4 import *
import warnings
import re
import sys
warnings.filterwarnings('ignore')
def pairing(soup,
author_class: tuple,
track_class: tuple,
subclass: tuple) -> dict:
use_dict = {}
for pair in zip(
soup.find_all(author_class[0], {"class": author_class[1]}),
soup.find_all(track_class[0], {"class": track_class[1]}),
soup.find_all(subclass[0], {"class": subclass[1]})
):
"""
pair[0] - musician(-s),
pair[1] - track_name,
pair[2] - subtitle for track(if any)
"""
print(pair[0],"\n", pair[1], '\n', pair[2])
track_author = pair[0].find('a').text
# Clear row
track_author = re.sub('[!@#$_]', '', track_author)
try:
add_meta = pair[2].find('span').text
except AttributeError:
print(sys.exc_info())
add_meta = ""
track_name = pair[1].text + add_meta
# Clear
track_name = re.sub('[!@#$_]', '', track_name)
use_dict.update({track_author: track_name})
return use_dict
问题出在附近的 try-except contrusion:当我要求 track_author - 没问题,一切正常。请注意,并非所有曲目都有字幕,因此有时此标签 (span) 为空 (None)。但是我的代码认为它在任何迭代中都是空的。这很奇怪和令人困惑,因为上面带有 track_author 的代码看起来一样并且有效。我需要 return 子类中的文本(如果有),否则 return 空行。
终端输出:
<div class="audio_row__performers"><a>John Paesano</a></div>
<span class="audio_row__title_inner _audio_row__title_inner">The Final Lesson</span>
<span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle"></span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d283f80>)
<div class="audio_row__performers"><a>The Blue Notes</a></div>
<span class="audio_row__title_inner _audio_row__title_inner">Halo Theme</span>
<span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle">Piano Rendition</span>
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'text'"), <traceback object at 0x7fb32d267ec0>)
谁能给我解释一下?
所以,我决定采用另一种方式 - 正则表达式。
def pairing(soup,
author_class: tuple,
track_class: tuple,
subclass: tuple) -> dict:
use_dict = {}
for pair in zip(
soup.find_all(author_class[0], {"class": author_class[1]}),
soup.find_all(track_class[0], {"class": track_class[1]}),
soup.find_all(subclass[0], {"class": subclass[1]})
):
"""
pair[0] - musician(-s),
pair[1] - track_name,
pair[2] - subtitle for track(if any)
"""
track_author = pair[0].find('a').text
# Clear row
track_author = sub('[!@#$_]', '', track_author)
pair_2_str = str(pair[2])
regex = "(?<=>).*?(?=<)"
add_meta = findall(regex, pair_2_str)[0]
track_name = pair[1].text + f" {add_meta}"
# Clear
track_name = sub('[!@#$_]', '', track_name)
use_dict.update({track_author: track_name})
return use_dict