为什么我得到一个空数据框?

Why am I getting an empty dataframe?

这是我的初始数据框:

    df.head() 
    
       Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1  Unnamed: 0.1.1.1  Unnamed: 0.1.1.1.1         date   time                       game score home_odds draw_odds away_odds  country                 league
    0           0             0             0.0               0.0                 0.0          NaN  22:00            Bahia - Vitoria   0:2      1.82      3.36      4.13   Brazil  Copa do Nordeste 2020
    1           1             1             1.0               1.0                 1.0          NaN  20:00                ABC - Ceara   0:0      3.15      3.09      2.15   Brazil  Copa do Nordeste 2020
    2           2             2             2.0               2.0                 2.0          NaN  20:00      Botafogo PB - Nautico   2:1      2.45      3.07      2.81   Brazil  Copa do Nordeste 2020
    3           3             3             3.0               3.0                 3.0          NaN  20:00     Fortaleza - Santa Cruz   3:0      1.43      4.16      6.56   Brazil  Copa do Nordeste 2020
    4           4             4             4.0               4.0                 4.0  07 Feb 2020  00:00  Sport Recife - Imperatriz   2:2      1.36      4.31      7.66   Brazil  Copa do Nordeste 2020

当我 运行 这段代码时,我得到一个空数据框:

import pandas as pd


def harmonize_game(df: pd.DataFrame) -> pd.DataFrame:
    df["game"] = df["game"].astype(str).str.replace(r"(\(\w+\))", "", regex=True)
    df["game"] = df["game"].astype(str).str.replace(r"(\s\d+\S\d+)$", "", regex=True)
    df["league"] = (
        df["league"].astype(str).str.replace(r"(\s\d+\S\d+)$", "", regex=True)
    )
    df[["home_team", "away_team"]] = df["game"].str.split(" - ", expand=True, n=1)
    df[["home_score", "away_score"]] = df["score"].str.split(":", expand=True)
    print("Data Harmonised")
    return df


def numerical_scores(df: pd.DataFrame) -> pd.DataFrame:
    df["away_score"] = (
        df["away_score"].astype(str).str.replace(r"[a-zA-Z\s\D]", "", regex=True)
    )
    df["home_score"] = (
        df["home_score"].astype(str).str.replace(r"[a-zA-Z\s\D]", "", regex=True)
    )
    df = df[df.home_score != "."]
    df = df[df.home_score != ".."]
    df = df[df.home_score != "."]
    df = df[df.home_odds != "-"]
    df = df[df.draw_odds != "-"]
    df = df[df.away_odds != "-"]
    m = (
        df[["home_odds", "draw_odds", "away_odds"]]
        .astype(str)
        .agg(lambda x: x.str.count("/"), 1)
        .ne(0)
        .all(1)
    )
    n = df[["home_score"]].agg(lambda x: x.str.count("-"), 1).ne(0).all(1)
    o = df[["away_score"]].agg(lambda x: x.str.count("-"), 1).ne(0).all(1)
    df = df[~m]
    df = df[~n]
    df = df[~o]
    df = df[df.home_score != ""]
    df = df[df.away_score != ""]
    df = df.dropna()
    print("Numerical data harmonised and cleaned")
    return df


def coerce_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.loc[
        :,
        df.columns.intersection(
            [
                "datetime",
                "country",
                "league",
                "home_team",
                "away_team",
                "home_odds",
                "draw_odds",
                "away_odds",
                "home_score",
                "away_score",
            ]
        ),
    ]

    colt = {
        "country": str,
        "league": str,
        "home_team": str,
        "away_team": str,
        "home_odds": float,
        "draw_odds": float,
        "away_odds": float,
        "home_score": int,
        "away_score": int,
    }
    df = df.astype(colt)
    print("Data types recognized")
    return df


def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


def clean_odds(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["home_odds"] <= 100]
    df = df[df["draw_odds"] <= 100]
    df = df[df["away_odds"] <= 100]
    df = df.drop_duplicates(
        [
            "datetime",
            "home_score",
            "away_score",
            "country",
            "league",
            "home_team",
            "away_team",
        ],
        keep="last",
    )
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    print("Dataframe Cleaned")
    return df


def clean(df: pd.DataFrame) -> pd.DataFrame:
    df = harmonize_game(df)
    df = numerical_scores(df)
    df = coerce_columns(df)
    df = strip_strings(df)
    df = clean_odds(df)
    print("All steps applied")
    return df


def test() -> None:
    df = pd.read_csv()
    clean(df)


if __name__ == "__main__":
    test()

cleaned = pd.DataFrame(test())
cleaned.to_csv()

如何保存

的输出
if __name__ == '__main__':
        test()

到 csv?

根据评论中的建议,您可以这样尝试:

import pandas as pd


def clean(df: pd.DataFrame) -> pd.DataFrame:  # fix type hint
    # df = harmonize_game(df)
    # df = numerical_scores(df)
    # df = coerce_columns(df)
    # df = strip_strings(df)
    # df = clean_odds(df)
    print("All steps applied")
    return df  # add a return statement


def test() -> pd.DataFrame:
    df = pd.DataFrame(
        {
            "Unnamed:": {0: 0, 1: 1},
            "0": {0: 0, 1: 1},
            "Unnamed:.1": {0: 0.0, 1: 1.0},
            "0.1": {0: 0.0, 1: 1.0},
            "Unnamed:.2": {0: 0.0, 1: 1.0},
            "0.1.1": {0: "nan", 1: "nan"},
            "Unnamed:.3": {0: "22:00", 1: "20:00"},
            "0.1.1.1": {0: "Bahia", 1: "ABC"},
            "Unnamed:.4": {0: "-", 1: "-"},
            "0.1.1.1.1": {0: "Vitoria", 1: "Ceara"},
            "date": {0: "0:2", 1: "0:0"},
            "time": {0: 1.82, 1: 3.15},
            "game": {0: 3.36, 1: 3.09},
            "score": {0: 4.13, 1: 2.15},
            "home_odds": {0: "Brazil", 1: "Brazil"},
            "draw_odds": {0: "Copa", 1: "Copa"},
            "away_odds": {0: "do", 1: "do"},
            "country": {0: "Nordeste", 1: "Nordeste"},
            "league": {0: 2020, 1: 2020},
        }
    )  # "pd.read_csv("file.csv")" replaced here just to test that everything works
    return clean(df)


if __name__ == "__main__":
    # fix the indentation and remove unnecessary statement
    cleaned = test()
    cleaned.to_csv("new_file.csv")