pandas stack columns from column name, forecasting 任务
pandas stack columns from column name, forecasting task
我正在处理预测数据。
我想根据当前列名创建一个新列 (horizon
),并将两个变量 (["y", "yhat"]
) 的值叠加起来
import pandas as pd
import numpy as np
import itertools
rng = np.random.default_rng(2021)
index = list(itertools.product(
["pixel_1", "pixel_2"],
pd.date_range("2001-01-01", "2002-01-01", freq="D")
))
dat = pd.DataFrame({
"initialisation_time": np.array(index)[:, 1],
"pixel": np.array(index)[:, 0],
"y_0": rng.random(len(index)),
"y_1": rng.random(len(index)),
"yhat_0": rng.random(len(index)),
"yhat_1": rng.random(len(index)),
})
给我一个数据框如下:
time pixel y_0 y_1 yhat_0 yhat_1
0 2001-01-01 pixel_1 0.257135 0.609062 0.989102 0.297949
1 2001-01-02 pixel_1 0.898808 0.509193 0.966388 0.182610
2 2001-01-03 pixel_1 0.724221 0.537482 0.455078 0.141172
3 2001-01-04 pixel_1 0.503014 0.391576 0.382041 0.652105
4 2001-01-05 pixel_1 0.688625 0.865679 0.828888 0.856478
.. ... ... ... ... ... ...
727 2001-12-28 pixel_2 0.697661 0.725550 0.926735 0.527801
728 2001-12-29 pixel_2 0.052295 0.632843 0.536919 0.817767
729 2001-12-30 pixel_2 0.306129 0.426934 0.638589 0.697375
730 2001-12-31 pixel_2 0.633789 0.982248 0.255824 0.982358
731 2002-01-01 pixel_2 0.922154 0.088203 0.887233 0.700154
[732 rows x 6 columns]
## 我想要以下形式的数据框:
将四列 (["y_0", "y_1", "yhat_0", "yhat_1"]
) 转换为三列 (["horizon", "y", "yhat"]
).
target_lookalike = pd.DataFrame({
"initialisation_time": np.tile(np.array(index)[:, 1], 2),
"pixel": np.tile(np.array(index)[:, 0], 2),
"horizon": np.tile([0, 1], len(index)),
"y": rng.random(len(index) * 2),
"y_hat": rng.random(len(index) * 2),
})
initialisation_time pixel horizon y y_hat
0 2001-01-01 pixel_1 0 0.833400 0.457637
1 2001-01-02 pixel_1 1 0.607682 0.302057
2 2001-01-03 pixel_1 0 0.474058 0.045079
3 2001-01-04 pixel_1 1 0.198236 0.586153
4 2001-01-05 pixel_1 0 0.047407 0.179364
... ... ... ... ... ...
1459 2001-12-28 pixel_2 1 0.157809 0.997542
1460 2001-12-29 pixel_2 0 0.659860 0.830327
1461 2001-12-30 pixel_2 1 0.047798 0.541301
1462 2001-12-31 pixel_2 0 0.002146 0.945190
1463 2002-01-01 pixel_2 1 0.636621 0.735699
[1464 rows x 5 columns]
水平线取自列字符串("y_0"
将是 horizon == 0
),并将值分配给 y
或 y_hat
。
也许有人可以找到更简洁的方法来实现同样的事情,但这应该可行:
# df0 is original data
df1 = df0[['time','pixel','y_0','yhat_0']].rename(columns={'y_0':'y','yhat_0':'yhat'})
df1['Horizon'] = 0
df2 = df0[['time','pixel','y_1','yhat_1']].rename(columns={'y_1':'y','yhat_1':'yhat'})
df2['Horizon'] = 1
df3 = df1.append(df2).sort_values('time')
我从 df3
得到以下输出:
time pixel y yhat Horizon
0 2001-01-01 pixel_1 0.257135 0.989102 0
0 2001-01-01 pixel_1 0.609062 0.297949 1
1 2001-01-02 pixel_1 0.898808 0.966388 0
1 2001-01-02 pixel_1 0.509193 0.182610 1
2 2001-01-03 pixel_1 0.724221 0.455078 0
2 2001-01-03 pixel_1 0.537482 0.141172 1
3 2001-01-04 pixel_1 0.503014 0.382041 0
3 2001-01-04 pixel_1 0.391576 0.652105 1
4 2001-01-05 pixel_1 0.688625 0.828888 0
4 2001-01-05 pixel_1 0.865679 0.856478 1
5 2001-12-28 pixel_2 0.697661 0.926735 0
5 2001-12-28 pixel_2 0.725550 0.527801 1
6 2001-12-29 pixel_2 0.052295 0.536919 0
6 2001-12-29 pixel_2 0.632843 0.817767 1
7 2001-12-30 pixel_2 0.306129 0.638589 0
7 2001-12-30 pixel_2 0.426934 0.697375 1
8 2001-12-31 pixel_2 0.633789 0.255824 0
8 2001-12-31 pixel_2 0.982248 0.982358 1
它与您的输出略有不同,每个日期显示两次,但我认为这可能是所需的输出?
以下函数有效:
def unstack_preds_wide_to_long(
preds: pd.DataFrame
) -> pd.DataFrame:
# stack into LONG form
df = preds.set_index(["pixel", "initialisation_time"]).stack().reset_index().rename({0: "value"}, axis=1)
# explode "level_2" column into multiple columns (split on "_")
df = df.join(df["level_2"].str.split("_", 1, expand=True).rename({0: "variable", 1: "horizon"}, axis=1)).drop(columns=["level_2"])
# change column with variables to columns
df = df.set_index(["pixel", "initialisation_time", "horizon"]).pivot(columns="variable")
# drop the extra multi-index level in the columns
df.columns = df.columns.droplevel()
return df.reset_index()
并这样称呼它:
df = unstack_preds_wide_to_long(dat)
df
variable pixel initialisation_time horizon y yhat
0 pixel_1 2001-01-01 0 0.756948 0.141300
1 pixel_1 2001-01-01 1 0.006226 0.654879
2 pixel_1 2001-01-02 0 0.941382 0.461212
3 pixel_1 2001-01-02 1 0.330925 0.278326
4 pixel_1 2001-01-03 0 0.592463 0.498596
... ... ... ... ... ...
1459 pixel_2 2001-12-30 1 0.092664 0.320106
1460 pixel_2 2001-12-31 0 0.076966 0.118248
1461 pixel_2 2001-12-31 1 0.733759 0.116228
1462 pixel_2 2002-01-01 0 0.287662 0.546546
1463 pixel_2 2002-01-01 1 0.635012 0.697661
[1464 rows x 5 columns]
我正在处理预测数据。
我想根据当前列名创建一个新列 (horizon
),并将两个变量 (["y", "yhat"]
) 的值叠加起来
import pandas as pd
import numpy as np
import itertools
rng = np.random.default_rng(2021)
index = list(itertools.product(
["pixel_1", "pixel_2"],
pd.date_range("2001-01-01", "2002-01-01", freq="D")
))
dat = pd.DataFrame({
"initialisation_time": np.array(index)[:, 1],
"pixel": np.array(index)[:, 0],
"y_0": rng.random(len(index)),
"y_1": rng.random(len(index)),
"yhat_0": rng.random(len(index)),
"yhat_1": rng.random(len(index)),
})
给我一个数据框如下:
time pixel y_0 y_1 yhat_0 yhat_1
0 2001-01-01 pixel_1 0.257135 0.609062 0.989102 0.297949
1 2001-01-02 pixel_1 0.898808 0.509193 0.966388 0.182610
2 2001-01-03 pixel_1 0.724221 0.537482 0.455078 0.141172
3 2001-01-04 pixel_1 0.503014 0.391576 0.382041 0.652105
4 2001-01-05 pixel_1 0.688625 0.865679 0.828888 0.856478
.. ... ... ... ... ... ...
727 2001-12-28 pixel_2 0.697661 0.725550 0.926735 0.527801
728 2001-12-29 pixel_2 0.052295 0.632843 0.536919 0.817767
729 2001-12-30 pixel_2 0.306129 0.426934 0.638589 0.697375
730 2001-12-31 pixel_2 0.633789 0.982248 0.255824 0.982358
731 2002-01-01 pixel_2 0.922154 0.088203 0.887233 0.700154
[732 rows x 6 columns]
## 我想要以下形式的数据框:
将四列 (["y_0", "y_1", "yhat_0", "yhat_1"]
) 转换为三列 (["horizon", "y", "yhat"]
).
target_lookalike = pd.DataFrame({
"initialisation_time": np.tile(np.array(index)[:, 1], 2),
"pixel": np.tile(np.array(index)[:, 0], 2),
"horizon": np.tile([0, 1], len(index)),
"y": rng.random(len(index) * 2),
"y_hat": rng.random(len(index) * 2),
})
initialisation_time pixel horizon y y_hat
0 2001-01-01 pixel_1 0 0.833400 0.457637
1 2001-01-02 pixel_1 1 0.607682 0.302057
2 2001-01-03 pixel_1 0 0.474058 0.045079
3 2001-01-04 pixel_1 1 0.198236 0.586153
4 2001-01-05 pixel_1 0 0.047407 0.179364
... ... ... ... ... ...
1459 2001-12-28 pixel_2 1 0.157809 0.997542
1460 2001-12-29 pixel_2 0 0.659860 0.830327
1461 2001-12-30 pixel_2 1 0.047798 0.541301
1462 2001-12-31 pixel_2 0 0.002146 0.945190
1463 2002-01-01 pixel_2 1 0.636621 0.735699
[1464 rows x 5 columns]
水平线取自列字符串("y_0"
将是 horizon == 0
),并将值分配给 y
或 y_hat
。
也许有人可以找到更简洁的方法来实现同样的事情,但这应该可行:
# df0 is original data
df1 = df0[['time','pixel','y_0','yhat_0']].rename(columns={'y_0':'y','yhat_0':'yhat'})
df1['Horizon'] = 0
df2 = df0[['time','pixel','y_1','yhat_1']].rename(columns={'y_1':'y','yhat_1':'yhat'})
df2['Horizon'] = 1
df3 = df1.append(df2).sort_values('time')
我从 df3
得到以下输出:
time pixel y yhat Horizon
0 2001-01-01 pixel_1 0.257135 0.989102 0
0 2001-01-01 pixel_1 0.609062 0.297949 1
1 2001-01-02 pixel_1 0.898808 0.966388 0
1 2001-01-02 pixel_1 0.509193 0.182610 1
2 2001-01-03 pixel_1 0.724221 0.455078 0
2 2001-01-03 pixel_1 0.537482 0.141172 1
3 2001-01-04 pixel_1 0.503014 0.382041 0
3 2001-01-04 pixel_1 0.391576 0.652105 1
4 2001-01-05 pixel_1 0.688625 0.828888 0
4 2001-01-05 pixel_1 0.865679 0.856478 1
5 2001-12-28 pixel_2 0.697661 0.926735 0
5 2001-12-28 pixel_2 0.725550 0.527801 1
6 2001-12-29 pixel_2 0.052295 0.536919 0
6 2001-12-29 pixel_2 0.632843 0.817767 1
7 2001-12-30 pixel_2 0.306129 0.638589 0
7 2001-12-30 pixel_2 0.426934 0.697375 1
8 2001-12-31 pixel_2 0.633789 0.255824 0
8 2001-12-31 pixel_2 0.982248 0.982358 1
它与您的输出略有不同,每个日期显示两次,但我认为这可能是所需的输出?
以下函数有效:
def unstack_preds_wide_to_long(
preds: pd.DataFrame
) -> pd.DataFrame:
# stack into LONG form
df = preds.set_index(["pixel", "initialisation_time"]).stack().reset_index().rename({0: "value"}, axis=1)
# explode "level_2" column into multiple columns (split on "_")
df = df.join(df["level_2"].str.split("_", 1, expand=True).rename({0: "variable", 1: "horizon"}, axis=1)).drop(columns=["level_2"])
# change column with variables to columns
df = df.set_index(["pixel", "initialisation_time", "horizon"]).pivot(columns="variable")
# drop the extra multi-index level in the columns
df.columns = df.columns.droplevel()
return df.reset_index()
并这样称呼它:
df = unstack_preds_wide_to_long(dat)
df
variable pixel initialisation_time horizon y yhat
0 pixel_1 2001-01-01 0 0.756948 0.141300
1 pixel_1 2001-01-01 1 0.006226 0.654879
2 pixel_1 2001-01-02 0 0.941382 0.461212
3 pixel_1 2001-01-02 1 0.330925 0.278326
4 pixel_1 2001-01-03 0 0.592463 0.498596
... ... ... ... ... ...
1459 pixel_2 2001-12-30 1 0.092664 0.320106
1460 pixel_2 2001-12-31 0 0.076966 0.118248
1461 pixel_2 2001-12-31 1 0.733759 0.116228
1462 pixel_2 2002-01-01 0 0.287662 0.546546
1463 pixel_2 2002-01-01 1 0.635012 0.697661
[1464 rows x 5 columns]