datetime 在两个看似完全随机的日期上解析失败

Datetime glitching out on two completely arbitrary dates

我正在编写一段代码,直接从维基百科上抓取 NBA 球员的年龄数据。几天前基本写完了,今天回来检查它是否还能正常工作。整个代码如下(我已经在 yyyy_mm_dd_to_age 函数中添加了 try/except 语句,并把其余能正常工作的代码注释掉了):

import datetime as dt
import io
import statistics
from typing import Dict, Generator, List, T, Tuple

import bs4
import numpy as np
import pandas as pd
import requests
def tables_by_class(soup: bs4.BeautifulSoup,
                    html_class: str) -> Generator[pd.DataFrame, None, None]:
    """Yield a DataFrame for each ``<table>`` in ``soup`` with the given class.

    Args:
        soup: Parsed document to search.
        html_class: Value of the ``class`` attribute to match on ``<table>`` tags.

    Yields:
        One ``pandas.DataFrame`` per table that ``pandas.read_html`` parses
        out of the matched markup.
    """
    tables = soup.find_all("table", {"class": html_class})
    # Passing a bare string of literal HTML to read_html is deprecated in
    # recent pandas; a file-like wrapper is accepted by old and new versions.
    yield from pd.read_html(io.StringIO(str(tables)))
def divs_by_id(soup: bs4.BeautifulSoup,
               html_id: str) -> bs4.element.ResultSet:
    """Return every ``<div>`` in ``soup`` whose ``id`` attribute is ``html_id``."""
    return soup.find_all("div", attrs={"id": html_id})
def yyyy_mm_dd_to_age(b_date: str) -> float:
    """Convert a ``YYYY-MM-DD`` birth date into an age in years.

    The age is the number of whole days between the birth date and today's
    date, divided by 365.25 and rounded to two decimals.

    Args:
        b_date: Birth date text in ``%Y-%m-%d`` form (ASCII hyphen separators).

    Returns:
        The age as a float. If ``strptime`` rejects the text with
        ``ValueError`` (for example when the separators are en dashes rather
        than hyphens), the input string is returned unchanged instead.
    """
    try:
        born = dt.datetime.strptime(b_date, "%Y-%m-%d")
    except ValueError:
        # Debugging aid: surface the offending text rather than failing.
        return b_date
    # Midnight of the current day, so the difference is a whole number of days.
    midnight_today = dt.datetime.combine(dt.date.today(), dt.time())
    elapsed = midnight_today - born
    return round(elapsed.days / 365.25, 2)
def l_flatten(l: List[T]) -> List[T]:
    """Flatten one level of nesting by concatenating the items of each element of ``l``."""
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat
def l_to_bins(l: List[float],
              bins: List[float]) -> Dict[Tuple[float, float], int]:
    """Count how many values of ``l`` fall between each pair of adjacent bin edges.

    Args:
        l: Values to distribute.
        bins: Bin edges; each adjacent pair ``(bins[k], bins[k + 1])`` is one bin.

    Returns:
        A dict keyed by ``(lower, upper)`` edge pairs whose value is the number
        of items ``x`` in ``l`` with ``lower < x <= upper``. Fewer than two
        edges produce an empty dict.
    """
    # The lower edge is exclusive and the upper edge inclusive, so a value
    # equal to bins[0] is not counted in any bin.
    return {
        (lower, upper): sum(1 for value in l if lower < value <= upper)
        for lower, upper in zip(bins, bins[1:])
    }
def team_data():
    """Download the Wikipedia NBA roster page and convert each roster's birth dates.

    Returns:
        A list with one inner list per "sortable" table on the page; each
        entry is the result of ``yyyy_mm_dd_to_age`` for one value of that
        table's "DOB (YYYY-MM-DD)" column.
    """
    url = "https://en.wikipedia.org/wiki/List_of_current_NBA_team_rosters"
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "lxml")

    # Collect link targets from the first div with id "toc", skipping any
    # whose href contains one of the excluded words. The names are only
    # consumed by the disabled code after the return below.
    toc = divs_by_id(soup, "toc")[0]
    words_out = ["Conference", "Division", "See", "References", "External"]
    team_names = []
    for anchor in toc.find_all("a", href = True):
        href = anchor["href"]
        if any(word in href for word in words_out):
            continue
        # Drop the first character of the href (the leading "#" of an in-page link).
        team_names.append(href[1:])

    team_tables = []
    for table in tables_by_class(soup, "sortable"):
        birth_dates = table["DOB (YYYY-MM-DD)"].tolist()
        team_tables.append([yyyy_mm_dd_to_age(b_date) for b_date in birth_dates])
    return team_tables
    # The steps below are needed for the final result, but they are disabled
    # while tracking down the bug.
    #team_tables = [sorted(table) for table in team_tables]
    #age_dict = {name: table for name, table in zip(team_names, team_tables)}
    #age_dict = dict(sorted(age_dict.items()))
    #age_dataframe = pd.DataFrame(list(age_dict.values()),
    #                             index = list(age_dict.keys()))
    #age_dist = l_flatten(age_dataframe.values.tolist())
    #age_dist = l_to_bins(age_dist, [i for i in np.linspace(18, 42, 9)])
    #return age_dataframe, age_dist
# Run the scraper when the script is executed and print the nested list
# returned by team_data().
data = team_data()
print(data)

输出是这样的:

[[27.56, 24.97, 23.17, 23.85, 26.05, 35.37, 29.4, 21.97, 24.97, 24.5, 22.0, 26.42, 25.94, 23.71, 28.08, 28.08, 27.61, 23.62, 22.87, 23.99], [36.24, 27.28, '1997–01–20', 25.17, 25.02, 26.09, 22.5, 22.01, 33.04, 21.18, 22.28, 32.58, 32.14, 30.11, 29.56, 34.65, 33.18, 36.68, '1999–09–30', 19.94, 20.01, 23.2], [21.34, 30.24, 28.96, 36.31, 26.04, 21.44, 22.18, 21.1, 27.52, 22.33, 26.88, 23.54, 33.03, 27.04, 22.66, 22.99, 23.62, 31.44, 23.55], [20.96, 31.15, 28.18, 27.58, 34.32, 29.25, 28.02, 22.13, 22.29, 24.23, 21.03, 25.05, 28.33, 22.31, 24.68, 25.24, 19.06, 24.62], [22.07, 24.25, 21.94, 20.21, 29.05, 21.94, 28.76, 20.3, 27.44, 35.44, 23.43, 20.63, 24.35, 27.54, 22.74, 27.64, 27.09, 27.01], [23.97, 23.21, 22.22, 27.63, 24.06, 32.19, 21.74, 22.2, 28.23, 25.48, 25.38, 24.66, 26.6, 22.0, 27.2, 30.98, 21.66, 20.14], [23.49, 23.29, 25.85, 21.72, 24.18, 33.1, 24.4, 20.33, 22.57, 20.72, 26.52, 28.72, 30.98, 22.78, 24.27, 27.91, 24.9, 25.06], [22.52, 20.05, 30.4, 23.21, 22.8, 27.59, 20.22, 23.45, 24.68, 30.15, 22.31, 23.22, 25.94, 30.21, 30.49, 23.81, 21.81, 22.16, 20.4, 26.54], [22.24, 23.32, 28.84, 29.56, 24.34, 22.35, 32.53, 19.76, 23.73, 29.38, 27.14, 26.2, 29.56, 25.45, 27.79, 22.06, 25.56, 32.22, 28.11, 21.56], [26.02, 26.86, 29.24, 26.49, 28.77, 23.52, 24.7, 35.45, 31.34, 28.99, 22.78, 33.54, 22.4, 30.18, 23.1, 28.37, 26.86, 26.68, 24.01, 23.76], [29.16, 27.41, 24.06, 20.34, 31.74, 33.19, 27.7, 30.58, 23.13, 23.87, 22.13, 19.82, 26.44, 24.11, 25.83, 20.84, 22.12, 34.97, 29.47, 23.07], [20.15, 21.07, 23.57, 20.64, 22.25, 31.56, 26.82, 20.74, 23.78, 21.59, 26.05, 23.7, 24.04, 25.85, 31.61, 23.88, 27.58, 33.28, 19.14, 23.15], [24.24, 32.08, 32.18, 22.93, 41.35, 21.74, 35.56, 26.05, 32.12, 22.47, 27.45, 23.53, 27.48, 22.37, 23.79, 22.22, 25.55, 36.45, 25.34, 23.32], [21.42, 23.43, 22.77, 22.5, 30.02, 24.43, 23.38, 25.25, 20.68, 27.09, 24.03, 33.54, 32.64, 23.16, 30.69, 24.54, 20.37, 22.89, 20.13, 24.47], [20.78, 28.3, 28.92, 24.21, 28.65, 28.53, 24.47, 23.04, 28.99, 22.98, 
23.68, 27.72, 25.04, 23.14, 22.62, 26.23, 29.41, 23.34, 19.99, 23.63], [30.77, 21.9, 21.91, 30.57, 24.51, 26.22, 24.97, 26.08, 31.32, 35.13, 22.62, 21.08, 26.65, 26.3, 24.64, 20.76, 23.3, 26.35, 29.2, 24.5], [24.88, 33.26, 21.09, 23.04, 20.19, 24.07, 27.61, 22.82, 21.04, 25.52, 22.93, 22.27, 23.12, 27.57, 22.14, 25.64, 25.07, 25.92, 22.54, 22.97], [21.34, 26.68, 24.73, 22.49, 30.25, 19.01, 23.26, 24.27, 21.32, 20.34, 20.7, 30.29, 19.8, 20.95, 23.7, 26.79, 22.78, 26.87], [25.89, 20.12, 24.29, 28.56, 30.84, 21.33, 31.25, 21.68, 30.07, 28.67, 28.79, 27.15, 32.59, 28.39, 22.35, 23.89, 29.93, 20.93, 29.03], [24.08, 22.08, 32.49, 23.5, 21.14, 29.36, 34.01, 24.28, 23.34, 35.16, 29.3, 23.6, 34.04, 25.1, 28.36, 24.2, 24.94, 24.34, 32.34], [26.77, 33.43, 30.89, 25.89, 33.59, 31.62, 37.71, 19.03, 28.98, 25.69, 19.38, 27.34, 28.87, 22.32, 28.37, 31.68, 28.51, 26.64, 20.54], [32.84, 31.85, 19.88, 24.33, 31.46, 23.48, 23.45, 32.07, 31.5, 19.6, 25.31, 27.75, 30.3, 24.99, 32.12, 22.18, 21.12, 25.56, 22.81, 24.58], [37.38, 36.29, 21.61, 32.29, 22.86, 28.6, 20.81, 33.88, 20.89, 35.85, 36.79, 33.23, 22.77, 23.69, 26.2, 25.26, 24.64, 23.38, 35.64, 32.92], [23.23, 24.96, 25.13, 31.28, 25.47, 25.62, 28.53, 33.74, 28.05, 36.44, 27.19, 27.64, 28.69, 27.52, 24.59, 21.58], [22.59, 29.38, 23.73, 24.42, 23.82, 21.63, 28.43, 28.83, 28.0, 26.29, 22.53, 28.33, 24.57, 23.11, 22.26, 20.35, 20.04, 25.15, 30.59, 22.09], [22.01, 26.68, 25.12, 30.58, 28.92, 28.16, 22.63, 28.45, 20.91, 29.58, 22.28, 23.81, 29.71, 33.17, 23.71, 23.22, 24.67, 26.2, 30.24, 21.05], [33.93, 23.68, 23.36, 19.85, 26.26, 19.6, 32.8, 19.68, 28.36, 23.73, 20.77, 19.67, 28.75, 21.45, 19.23, 25.97, 29.53, 31.11, 26.05], [28.24, 20.76, 28.07, 23.31, 25.73, 25.07, 22.65, 27.58, 22.08, 25.43, 25.57, 24.95, 23.38, 25.42, 22.18, 22.39, 23.61, 22.76, 20.35, 20.09], [23.12, 23.51, 23.75, 24.56, 26.64, 26.61, 21.4, 27.39, 25.97, 25.28, 24.12, 23.03, 20.53, 22.29, 23.72, 21.33, 29.96, 35.44, 29.44, 21.28], [31.07, 25.73, 23.9, 24.7, 28.23, 
22.01, 21.77, 25.97, 29.78, 25.07, 26.0, 18.81, 21.14, 22.84, 27.29, 22.15, 33.32]]

如您所见,除了两个日期之外,所有日期都已成功转换为年龄。我一直在绞尽脑汁想弄明白原因。有人可以帮我吗?

有些日期字符串并没有使用连字符 "-" 作为分隔符

它们使用的是短破折号 "–"(en dash,U+2013)

尝试将您的 dt.datetime.strptime(b_date, "%Y-%m-%d") 更改为 try_datetime(b_date)

它可能会起作用

def try_datetime(b_date: str) -> dt.datetime:
    """Parse a year-month-day date whose separators are hyphens or en dashes.

    Args:
        b_date: Date text such as ``"1997-01-20"`` or ``"1997–01–20"``.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If ``b_date`` matches neither separator style.
    """
    try:
        return dt.datetime.strptime(b_date, "%Y-%m-%d")
    except ValueError:
        # Only a format mismatch should trigger the en-dash fallback; a bare
        # ``except`` would also intercept KeyboardInterrupt and unrelated errors.
        return dt.datetime.strptime(b_date, "%Y–%m–%d")