为什么我得到一个空数据框?
Why am I getting an empty dataframe?
这是我的初始数据框:
df.head()
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 Unnamed: 0.1.1.1.1 date time game score home_odds draw_odds away_odds country league
0 0 0 0.0 0.0 0.0 NaN 22:00 Bahia - Vitoria 0:2 1.82 3.36 4.13 Brazil Copa do Nordeste 2020
1 1 1 1.0 1.0 1.0 NaN 20:00 ABC - Ceara 0:0 3.15 3.09 2.15 Brazil Copa do Nordeste 2020
2 2 2 2.0 2.0 2.0 NaN 20:00 Botafogo PB - Nautico 2:1 2.45 3.07 2.81 Brazil Copa do Nordeste 2020
3 3 3 3.0 3.0 3.0 NaN 20:00 Fortaleza - Santa Cruz 3:0 1.43 4.16 6.56 Brazil Copa do Nordeste 2020
4 4 4 4.0 4.0 4.0 07 Feb 2020 00:00 Sport Recife - Imperatriz 2:2 1.36 4.31 7.66 Brazil Copa do Nordeste 2020
当我运行这段代码时,我得到一个空数据框:
import pandas as pd
def harmonize_game(df: pd.DataFrame) -> pd.DataFrame:
    """Tidy the ``game``/``league`` text and split out teams and scores.

    Parenthesised tags such as ``(Bra)`` and a trailing token of the form
    "whitespace, digits, one non-space character, digits" are removed from
    ``game``; the same trailing token is removed from ``league``. ``game``
    is then split on " - " into ``home_team``/``away_team`` and ``score``
    is split on ":" into ``home_score``/``away_score``.

    Args:
        df: Frame with ``game``, ``score`` and ``league`` columns. It is
            modified in place.

    Returns:
        The same frame with the four derived columns added.
    """
    trailing_token = r"(\s\d+\S\d+)$"

    def _pad_to_pair(parts: pd.DataFrame) -> pd.DataFrame:
        # When no row contains the separator the split yields fewer than two
        # columns and the two-column assignment below would raise. Only pad:
        # a wider result still fails loudly on assignment, as before.
        if parts.shape[1] >= 2:
            return parts
        return parts.reindex(columns=[0, 1])

    game = df["game"].astype(str).str.replace(r"(\(\w+\))", "", regex=True)
    df["game"] = game.str.replace(trailing_token, "", regex=True)
    df["league"] = (
        df["league"].astype(str).str.replace(trailing_token, "", regex=True)
    )

    teams = df["game"].str.split(" - ", expand=True, n=1)
    df[["home_team", "away_team"]] = _pad_to_pair(teams)
    scores = df["score"].str.split(":", expand=True)
    df[["home_score", "away_score"]] = _pad_to_pair(scores)
    print("Data Harmonised")
    return df
def numerical_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the score columns to digits and drop rows that cannot be used.

    Every non-digit character is removed from ``home_score`` and
    ``away_score`` (the input frame is modified in place for these two
    columns). Rows are then dropped when any odds column equals "-", when
    all three odds columns contain "/", when either score is empty after
    the digit-only step, or when the row contains a missing value.

    Args:
        df: Frame with ``home_score``, ``away_score``, ``home_odds``,
            ``draw_odds`` and ``away_odds`` columns.

    Returns:
        The filtered frame.
    """
    odds_cols = ["home_odds", "draw_odds", "away_odds"]

    # \D alone covers the letters and whitespace the previous character class
    # also listed. After this step a score holds digits only, so the earlier
    # checks for ".", ".." and "-" in the scores could never match and were
    # removed.
    for col in ("away_score", "home_score"):
        df[col] = df[col].astype(str).str.replace(r"\D", "", regex=True)

    for col in odds_cols:
        df = df[df[col] != "-"]

    # A row is dropped only when every odds column contains "/".
    fractional = pd.Series(True, index=df.index)
    for col in odds_cols:
        fractional = fractional & df[col].astype(str).str.contains("/", regex=False)
    df = df[~fractional]

    df = df[(df["home_score"] != "") & (df["away_score"] != "")]
    # NOTE(review): dropna() removes a row if ANY column is missing, including
    # columns unrelated to scores or odds (e.g. a NaN ``date``) - confirm this
    # is intended before relying on the row count.
    df = df.dropna()
    print("Numerical data harmonised and cleaned")
    return df
def coerce_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep the known match columns and cast them to their target dtypes.

    Columns outside the known list are discarded. Known columns that are
    missing from ``df`` are skipped, both when selecting and when casting;
    previously the cast raised ``KeyError`` for any missing column even
    though the selection step tolerated it.

    Args:
        df: Frame holding some or all of the known columns.

    Returns:
        A new frame restricted to the known columns, with text columns as
        ``str``, odds as ``float`` and scores as ``int``. ``datetime`` is
        kept when present but not cast.
    """
    wanted = [
        "datetime",
        "country",
        "league",
        "home_team",
        "away_team",
        "home_odds",
        "draw_odds",
        "away_odds",
        "home_score",
        "away_score",
    ]
    colt = {
        "country": str,
        "league": str,
        "home_team": str,
        "away_team": str,
        "home_odds": float,
        "draw_odds": float,
        "away_odds": float,
        "home_score": int,
        "away_score": int,
    }
    df = df.loc[:, df.columns.intersection(wanted)]
    # astype() rejects dtype keys that are not columns, so cast only what exists.
    present = {col: dtype for col, dtype in colt.items() if col in df.columns}
    df = df.astype(present)
    print("Data types recognized")
    return df
def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with surrounding whitespace removed from strings.

    Non-string cells are passed through unchanged.
    """

    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map. Check the
    # class, not the instance, so a column named "map" cannot shadow the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip)
    return df.applymap(_strip)
def clean_odds(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with odds above 100, de-duplicate matches, strip strings.

    Rows are kept only when each of ``home_odds``, ``draw_odds`` and
    ``away_odds`` is <= 100. Duplicates are then removed, keeping the last
    occurrence, using whichever of the identifying columns exist in ``df``;
    previously a missing identifying column (for example ``datetime``)
    made ``drop_duplicates`` raise ``KeyError``.

    Args:
        df: Frame with numeric ``home_odds``, ``draw_odds``, ``away_odds``.

    Returns:
        The filtered, de-duplicated frame with string cells stripped.
    """
    for col in ("home_odds", "draw_odds", "away_odds"):
        df = df[df[col] <= 100]

    identifying = [
        "datetime",
        "home_score",
        "away_score",
        "country",
        "league",
        "home_team",
        "away_team",
    ]
    present = [col for col in identifying if col in df.columns]
    if present:
        df = df.drop_duplicates(present, keep="last")

    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map.
    df = df.map(_strip) if hasattr(pd.DataFrame, "map") else df.applymap(_strip)
    print("Dataframe Cleaned")
    return df
def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Run every cleaning step on ``df`` in order and return the result.

    Each step receives the frame returned by the previous one.
    """
    pipeline = (
        harmonize_game,
        numerical_scores,
        coerce_columns,
        strip_strings,
        clean_odds,
    )
    for step in pipeline:
        df = step(df)
    print("All steps applied")
    return df
# NOTE(review): code kept exactly as posted; it is the example the question
# is about. The function runs clean() but has no return statement, so callers
# receive None (consistent with the ``-> None`` annotation).
def test() -> None:
# pd.read_csv() is called here without a file path argument.
df = pd.read_csv()
# The frame returned by clean() is discarded.
clean(df)
# NOTE(review): script entry point kept exactly as posted in the question.
if __name__ == "__main__":
# First call to test(); its result is not used.
test()
# test() has no return statement, so None is passed to pd.DataFrame here;
# pandas builds an empty frame from None, which is the "empty dataframe"
# the question asks about.
cleaned = pd.DataFrame(test())
# to_csv() is called without an output path.
cleaned.to_csv()
如何将下面这段代码的输出保存到 CSV 文件?
if __name__ == '__main__':
    test()
根据评论中的建议,您可以这样尝试:
import pandas as pd
def clean(df: pd.DataFrame) -> pd.DataFrame:  # fix type hint
    """Stub of the cleaning pipeline: report completion and return ``df`` as is.

    The individual cleaning steps are left disabled below.
    """
    # df = harmonize_game(df)
    # df = numerical_scores(df)
    # df = coerce_columns(df)
    # df = strip_strings(df)
    # df = clean_odds(df)
    print("All steps applied")
    return df  # add a return statement
def test() -> pd.DataFrame:
    """Build a two-row sample frame and return the result of ``clean`` on it.

    The literal frame stands in for ``pd.read_csv("file.csv")`` so the flow
    can be tried without the real file. The previous literal had its values
    shifted under the wrong headers (for example ``"date"`` held the score
    and ``"home_odds"`` held the country); the columns below follow the
    header of the frame shown in the question.
    """
    nan = float("nan")
    df = pd.DataFrame(
        {
            "Unnamed: 0": [0, 1],
            "Unnamed: 0.1": [0, 1],
            "Unnamed: 0.1.1": [0.0, 1.0],
            "Unnamed: 0.1.1.1": [0.0, 1.0],
            "Unnamed: 0.1.1.1.1": [0.0, 1.0],
            "date": [nan, nan],
            "time": ["22:00", "20:00"],
            "game": ["Bahia - Vitoria", "ABC - Ceara"],
            "score": ["0:2", "0:0"],
            "home_odds": [1.82, 3.15],
            "draw_odds": [3.36, 3.09],
            "away_odds": [4.13, 2.15],
            "country": ["Brazil", "Brazil"],
            "league": ["Copa do Nordeste 2020", "Copa do Nordeste 2020"],
        }
    )  # "pd.read_csv("file.csv")" replaced here just to test that everything works
    return clean(df)
if __name__ == "__main__":
    # fix the indentation and remove unnecessary statement
    cleaned = test()
    # index=False: writing the row index adds an unnamed first column to the
    # file, which read_csv later loads as "Unnamed: 0"; repeated round trips
    # stack such columns up.
    cleaned.to_csv("new_file.csv", index=False)
这是我的初始数据框:
df.head()
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 Unnamed: 0.1.1.1.1 date time game score home_odds draw_odds away_odds country league
0 0 0 0.0 0.0 0.0 NaN 22:00 Bahia - Vitoria 0:2 1.82 3.36 4.13 Brazil Copa do Nordeste 2020
1 1 1 1.0 1.0 1.0 NaN 20:00 ABC - Ceara 0:0 3.15 3.09 2.15 Brazil Copa do Nordeste 2020
2 2 2 2.0 2.0 2.0 NaN 20:00 Botafogo PB - Nautico 2:1 2.45 3.07 2.81 Brazil Copa do Nordeste 2020
3 3 3 3.0 3.0 3.0 NaN 20:00 Fortaleza - Santa Cruz 3:0 1.43 4.16 6.56 Brazil Copa do Nordeste 2020
4 4 4 4.0 4.0 4.0 07 Feb 2020 00:00 Sport Recife - Imperatriz 2:2 1.36 4.31 7.66 Brazil Copa do Nordeste 2020
当我运行这段代码时,我得到一个空数据框:
import pandas as pd
def harmonize_game(df: pd.DataFrame) -> pd.DataFrame:
    """Tidy the ``game``/``league`` text and split out teams and scores.

    Parenthesised tags such as ``(Bra)`` and a trailing token of the form
    "whitespace, digits, one non-space character, digits" are removed from
    ``game``; the same trailing token is removed from ``league``. ``game``
    is then split on " - " into ``home_team``/``away_team`` and ``score``
    is split on ":" into ``home_score``/``away_score``.

    Args:
        df: Frame with ``game``, ``score`` and ``league`` columns. It is
            modified in place.

    Returns:
        The same frame with the four derived columns added.
    """
    trailing_token = r"(\s\d+\S\d+)$"

    def _pad_to_pair(parts: pd.DataFrame) -> pd.DataFrame:
        # When no row contains the separator the split yields fewer than two
        # columns and the two-column assignment below would raise. Only pad:
        # a wider result still fails loudly on assignment, as before.
        if parts.shape[1] >= 2:
            return parts
        return parts.reindex(columns=[0, 1])

    game = df["game"].astype(str).str.replace(r"(\(\w+\))", "", regex=True)
    df["game"] = game.str.replace(trailing_token, "", regex=True)
    df["league"] = (
        df["league"].astype(str).str.replace(trailing_token, "", regex=True)
    )

    teams = df["game"].str.split(" - ", expand=True, n=1)
    df[["home_team", "away_team"]] = _pad_to_pair(teams)
    scores = df["score"].str.split(":", expand=True)
    df[["home_score", "away_score"]] = _pad_to_pair(scores)
    print("Data Harmonised")
    return df
def numerical_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the score columns to digits and drop rows that cannot be used.

    Every non-digit character is removed from ``home_score`` and
    ``away_score`` (the input frame is modified in place for these two
    columns). Rows are then dropped when any odds column equals "-", when
    all three odds columns contain "/", when either score is empty after
    the digit-only step, or when the row contains a missing value.

    Args:
        df: Frame with ``home_score``, ``away_score``, ``home_odds``,
            ``draw_odds`` and ``away_odds`` columns.

    Returns:
        The filtered frame.
    """
    odds_cols = ["home_odds", "draw_odds", "away_odds"]

    # \D alone covers the letters and whitespace the previous character class
    # also listed. After this step a score holds digits only, so the earlier
    # checks for ".", ".." and "-" in the scores could never match and were
    # removed.
    for col in ("away_score", "home_score"):
        df[col] = df[col].astype(str).str.replace(r"\D", "", regex=True)

    for col in odds_cols:
        df = df[df[col] != "-"]

    # A row is dropped only when every odds column contains "/".
    fractional = pd.Series(True, index=df.index)
    for col in odds_cols:
        fractional = fractional & df[col].astype(str).str.contains("/", regex=False)
    df = df[~fractional]

    df = df[(df["home_score"] != "") & (df["away_score"] != "")]
    # NOTE(review): dropna() removes a row if ANY column is missing, including
    # columns unrelated to scores or odds (e.g. a NaN ``date``) - confirm this
    # is intended before relying on the row count.
    df = df.dropna()
    print("Numerical data harmonised and cleaned")
    return df
def coerce_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep the known match columns and cast them to their target dtypes.

    Columns outside the known list are discarded. Known columns that are
    missing from ``df`` are skipped, both when selecting and when casting;
    previously the cast raised ``KeyError`` for any missing column even
    though the selection step tolerated it.

    Args:
        df: Frame holding some or all of the known columns.

    Returns:
        A new frame restricted to the known columns, with text columns as
        ``str``, odds as ``float`` and scores as ``int``. ``datetime`` is
        kept when present but not cast.
    """
    wanted = [
        "datetime",
        "country",
        "league",
        "home_team",
        "away_team",
        "home_odds",
        "draw_odds",
        "away_odds",
        "home_score",
        "away_score",
    ]
    colt = {
        "country": str,
        "league": str,
        "home_team": str,
        "away_team": str,
        "home_odds": float,
        "draw_odds": float,
        "away_odds": float,
        "home_score": int,
        "away_score": int,
    }
    df = df.loc[:, df.columns.intersection(wanted)]
    # astype() rejects dtype keys that are not columns, so cast only what exists.
    present = {col: dtype for col, dtype in colt.items() if col in df.columns}
    df = df.astype(present)
    print("Data types recognized")
    return df
def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with surrounding whitespace removed from strings.

    Non-string cells are passed through unchanged.
    """

    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map. Check the
    # class, not the instance, so a column named "map" cannot shadow the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip)
    return df.applymap(_strip)
def clean_odds(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with odds above 100, de-duplicate matches, strip strings.

    Rows are kept only when each of ``home_odds``, ``draw_odds`` and
    ``away_odds`` is <= 100. Duplicates are then removed, keeping the last
    occurrence, using whichever of the identifying columns exist in ``df``;
    previously a missing identifying column (for example ``datetime``)
    made ``drop_duplicates`` raise ``KeyError``.

    Args:
        df: Frame with numeric ``home_odds``, ``draw_odds``, ``away_odds``.

    Returns:
        The filtered, de-duplicated frame with string cells stripped.
    """
    for col in ("home_odds", "draw_odds", "away_odds"):
        df = df[df[col] <= 100]

    identifying = [
        "datetime",
        "home_score",
        "away_score",
        "country",
        "league",
        "home_team",
        "away_team",
    ]
    present = [col for col in identifying if col in df.columns]
    if present:
        df = df.drop_duplicates(present, keep="last")

    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map.
    df = df.map(_strip) if hasattr(pd.DataFrame, "map") else df.applymap(_strip)
    print("Dataframe Cleaned")
    return df
def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Run every cleaning step on ``df`` in order and return the result.

    Each step receives the frame returned by the previous one.
    """
    pipeline = (
        harmonize_game,
        numerical_scores,
        coerce_columns,
        strip_strings,
        clean_odds,
    )
    for step in pipeline:
        df = step(df)
    print("All steps applied")
    return df
# NOTE(review): code kept exactly as posted; it is the example the question
# is about. The function runs clean() but has no return statement, so callers
# receive None (consistent with the ``-> None`` annotation).
def test() -> None:
# pd.read_csv() is called here without a file path argument.
df = pd.read_csv()
# The frame returned by clean() is discarded.
clean(df)
# NOTE(review): script entry point kept exactly as posted in the question.
if __name__ == "__main__":
# First call to test(); its result is not used.
test()
# test() has no return statement, so None is passed to pd.DataFrame here;
# pandas builds an empty frame from None, which is the "empty dataframe"
# the question asks about.
cleaned = pd.DataFrame(test())
# to_csv() is called without an output path.
cleaned.to_csv()
如何将下面这段代码的输出保存到 CSV 文件?
if __name__ == '__main__':
    test()
根据评论中的建议,您可以这样尝试:
import pandas as pd
def clean(df: pd.DataFrame) -> pd.DataFrame:  # fix type hint
    """Stub of the cleaning pipeline: report completion and return ``df`` as is.

    The individual cleaning steps are left disabled below.
    """
    # df = harmonize_game(df)
    # df = numerical_scores(df)
    # df = coerce_columns(df)
    # df = strip_strings(df)
    # df = clean_odds(df)
    print("All steps applied")
    return df  # add a return statement
def test() -> pd.DataFrame:
    """Build a two-row sample frame and return the result of ``clean`` on it.

    The literal frame stands in for ``pd.read_csv("file.csv")`` so the flow
    can be tried without the real file. The previous literal had its values
    shifted under the wrong headers (for example ``"date"`` held the score
    and ``"home_odds"`` held the country); the columns below follow the
    header of the frame shown in the question.
    """
    nan = float("nan")
    df = pd.DataFrame(
        {
            "Unnamed: 0": [0, 1],
            "Unnamed: 0.1": [0, 1],
            "Unnamed: 0.1.1": [0.0, 1.0],
            "Unnamed: 0.1.1.1": [0.0, 1.0],
            "Unnamed: 0.1.1.1.1": [0.0, 1.0],
            "date": [nan, nan],
            "time": ["22:00", "20:00"],
            "game": ["Bahia - Vitoria", "ABC - Ceara"],
            "score": ["0:2", "0:0"],
            "home_odds": [1.82, 3.15],
            "draw_odds": [3.36, 3.09],
            "away_odds": [4.13, 2.15],
            "country": ["Brazil", "Brazil"],
            "league": ["Copa do Nordeste 2020", "Copa do Nordeste 2020"],
        }
    )  # "pd.read_csv("file.csv")" replaced here just to test that everything works
    return clean(df)
if __name__ == "__main__":
    # fix the indentation and remove unnecessary statement
    cleaned = test()
    # index=False: writing the row index adds an unnamed first column to the
    # file, which read_csv later loads as "Unnamed: 0"; repeated round trips
    # stack such columns up.
    cleaned.to_csv("new_file.csv", index=False)