为什么 Pandas 不能使用掩码(mask)或 replace 将 NaN 替换为全 0 的数组?
Why can't Pandas replace NaN with an array of 0s using masks/replace?
我有这样一个系列
s = pd.Series([[1,2,3],[1,2,3],np.nan,[1,2,3],[1,2,3],np.nan])
我只想将 NaN
替换为 [0,0,0]
。
我试过了
s.fillna([0,0,0]) # TypeError: "value" parameter must be a scalar or dict, but you passed a "list"
s[s.isna()] = [[0,0,0],[0,0,0]] # just replaces the NaN with a single "0". WHY?!
s.fillna("NAN").replace({"NAN":[0,0,0]}) # ValueError: NumPy boolean array indexing assignment cannot
#assign 3 input values to the 2 output values where the mask is true
s.fillna("NAN").replace({"NAN":[[0,0,0],[0,0,0]]}) # TypeError: NumPy boolean array indexing assignment
# requires a 0 or 1-dimensional input, input has 2 dimensions
我真的不明白为什么这些方法行不通(第一种报错我也许还能理解,但第二种我完全无法理解)。
借助 SO 上的这个问答(question and answer),我们可以通过以下方式实现:
is_na = s.isna()
s.loc[is_na] = s.loc[is_na].apply(lambda x: [0,0,0])
但由于 apply 通常很慢,我不明白为什么我们不能使用 replace 或上面的切片赋值。
Pandas 处理以列表为元素的数据时很别扭,这里有一个取巧(hacky)的解决方案:
s = s.fillna(pd.Series([[0,0,0]] * len(s), index=s.index))
print (s)
0 [1, 2, 3]
1 [1, 2, 3]
2 [0, 0, 0]
3 [1, 2, 3]
4 [1, 2, 3]
5 [0, 0, 0]
dtype: object
Series.reindex
s.dropna().reindex(s.index, fill_value=[0, 0, 0])
0 [1, 2, 3]
1 [1, 2, 3]
2 [0, 0, 0]
3 [1, 2, 3]
4 [1, 2, 3]
5 [0, 0, 0]
dtype: object
文档(documentation)指出这个值不能是 list。
Value to use to fill holes (e.g. 0), alternately a
dict/Series/DataFrame of values specifying which value to use for each
index (for a Series) or column (for a DataFrame). Values not in the
dict/Series/DataFrame will not be filled. This value cannot be a list.
这可能是当前实现的局限性;如果不修改源代码,您就只能借助变通方法(如下所述)。
但是,如果您不打算使用锯齿状数组,您真正想要做的可能是将 pd.Series()
替换为 pd.DataFrame()
,例如:
import numpy as np
import pandas as pd
s = pd.DataFrame(
[[1, 2, 3],
[1, 2, 3],
[np.nan],
[1, 2, 3],
[1, 2, 3],
[np.nan]],
dtype=pd.Int64Dtype()) # to mix integers with NaNs
s.fillna(0)
# 0 1 2
# 0 1 2 3
# 1 1 2 3
# 2 0 0 0
# 3 1 2 3
# 4 1 2 3
# 5 0 0 0
如果您确实需要使用交错(锯齿状)数组,您可以使用其他答案中建议的任何变通方法,或者尝试下面这种做法,例如:
ii = s.isna()
nn = ii.sum()
s[ii] = pd.Series([[0, 0, 0]] * nn).to_numpy()
# 0 [1, 2, 3]
# 1 [1, 2, 3]
# 2 [0, 0, 0]
# 3 [1, 2, 3]
# 4 [1, 2, 3]
# 5 [0, 0, 0]
# dtype: object
其基本思路是使用 NumPy 掩码来填充系列。诀窍在于生成一个在 NumPy 层面可以直接用于赋值的兼容对象。
如果输入中 NaN 很多,采用类似的做法但改用 s.notna() 可能更高效/更快,例如:
import pandas as pd
result = pd.Series([[0, 0, 0]] * len(s))
result[s.notna()] = s[s.notna()]
让我们尝试做一些基准测试,其中:
replace_nan_isna()
来自上面
import pandas as pd
def replace_nan_isna(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` using a boolean mask.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object written into every missing slot (e.g. a tuple or list).
        inplace: When true, ``s`` itself is modified; otherwise a copy is
            modified and ``s`` is left untouched.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    target = s if inplace else s.copy()
    missing = target.isna()
    # One filler entry per missing slot, built via pd.Series and handed over
    # as a NumPy array so the masked assignment receives one object per slot.
    filler = pd.Series([value] * missing.sum()).to_numpy()
    target[missing] = filler
    return target
replace_nan_notna()
也是上面的
import pandas as pd
def replace_nan_notna(s, value, inplace=False):
    """Return a new Series equal to ``s`` with missing entries set to ``value``.

    Starts from a Series filled entirely with ``value`` and copies the
    non-missing entries of ``s`` over it.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: Must be false; this strategy always builds a new Series.

    Returns:
        A new Series carrying the same index as ``s``.

    Raises:
        ValueError: If ``inplace`` is true.
    """
    if inplace:
        raise ValueError("In-place not supported!")
    present = s.notna()
    # Reuse s.index so the mask and the copied values align with the result
    # even when s does not carry a default 0..n-1 index.
    result = pd.Series([value] * len(s), index=s.index)
    result[present] = s[present]
    return result
replace_nan_reindex()
来自上面使用 Series.reindex 的解决方案
def replace_nan_reindex(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` via ``dropna`` + ``reindex``.

    The missing rows are dropped and then re-created by reindexing against
    the original index, with ``value`` as the fill value.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: When true, the filled values are written back into ``s``;
            otherwise ``s`` is left untouched.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    # reindex() returns a new Series; the result must be kept, otherwise the
    # caller gets the unfilled input back.
    filled = s.dropna().reindex(s.index, fill_value=value)
    if not inplace:
        return filled
    # Positional write-back of the whole column as an object array.
    s.iloc[:] = filled.to_numpy()
    return s
replace_nan_fillna()
来自上面使用 fillna 并传入一个 Series 的解决方案
import pandas as pd
def replace_nan_fillna(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` via ``Series.fillna``.

    ``fillna`` rejects a list as the fill value, so a Series holding
    ``value`` at every index label of ``s`` is passed instead.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: When true, ``s`` itself is filled; otherwise ``s`` is left
            untouched and a new Series is returned.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    filler = pd.Series([value] * len(s), index=s.index)
    if inplace:
        s.fillna(filler, inplace=True)
        return s
    # fillna() returns a new Series; the result must be kept, otherwise the
    # caller gets an unfilled copy back.
    return s.fillna(filler)
使用以下代码:
import numpy as np
import pandas as pd
def gen_data(n=5, k=2, p=0.7, obj=(1, 2, 3)):
    """Build a test Series mixing ``obj`` entries with NaN.

    A pattern of length ``n`` is made of ``int(p * n)`` copies of ``obj``
    followed by NaN for the remaining slots; the pattern is repeated ``k``
    times.

    Args:
        n: Length of one pattern.
        k: Number of times the pattern is repeated.
        p: Fraction of each pattern filled with ``obj``.
        obj: Object used for the non-missing entries.

    Returns:
        A Series of length ``n * k``.
    """
    filled = int(p * n)
    pattern = [obj] * filled + [np.nan] * (n - filled)
    return pd.Series(pattern * k)
funcs = replace_nan_isna, replace_nan_notna, replace_nan_reindex, replace_nan_fillna
# : inspect results
s = gen_data(5, 1)
for func in funcs:
print(f'{func.__name__:>20s} {func(s, value)}')
print()
# : generate benchmarks
s = gen_data(100, 1000)
value = (0, 0, 0)
base = funcs[0](s, value)
for func in funcs:
print(f'{func.__name__:>20s} {(func(s, value) == base).all()!s:>5}', end=' ')
%timeit func(s, value)
# replace_nan_isna True 100 loops, best of 5: 16.5 ms per loop
# replace_nan_notna True 10 loops, best of 5: 46.5 ms per loop
# replace_nan_reindex True 100 loops, best of 5: 9.74 ms per loop
# replace_nan_fillna True 10 loops, best of 5: 36.4 ms per loop
表明 reindex()
可能是最快的方法。
我有这样一个系列
s = pd.Series([[1,2,3],[1,2,3],np.nan,[1,2,3],[1,2,3],np.nan])
我只想将 NaN
替换为 [0,0,0]
。
我试过了
s.fillna([0,0,0]) # TypeError: "value" parameter must be a scalar or dict, but you passed a "list"
s[s.isna()] = [[0,0,0],[0,0,0]] # just replaces the NaN with a single "0". WHY?!
s.fillna("NAN").replace({"NAN":[0,0,0]}) # ValueError: NumPy boolean array indexing assignment cannot
#assign 3 input values to the 2 output values where the mask is true
s.fillna("NAN").replace({"NAN":[[0,0,0],[0,0,0]]}) # TypeError: NumPy boolean array indexing assignment
# requires a 0 or 1-dimensional input, input has 2 dimensions
我真的不明白为什么这些方法行不通(第一种报错我也许还能理解,但第二种我完全无法理解)。
借助 SO 上的一个问答,我们可以通过以下方式实现:
is_na = s.isna()
s.loc[is_na] = s.loc[is_na].apply(lambda x: [0,0,0])
但由于 apply 通常很慢,我不明白为什么我们不能使用 replace 或上面的切片赋值。
Pandas 处理以列表为元素的数据时很别扭,这里有一个取巧(hacky)的解决方案:
s = s.fillna(pd.Series([[0,0,0]] * len(s), index=s.index))
print (s)
0 [1, 2, 3]
1 [1, 2, 3]
2 [0, 0, 0]
3 [1, 2, 3]
4 [1, 2, 3]
5 [0, 0, 0]
dtype: object
Series.reindex
s.dropna().reindex(s.index, fill_value=[0, 0, 0])
0 [1, 2, 3]
1 [1, 2, 3]
2 [0, 0, 0]
3 [1, 2, 3]
4 [1, 2, 3]
5 [0, 0, 0]
dtype: object
文档(documentation)指出这个值不能是 list。
Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list.
这可能是当前实现的局限性;如果不修改源代码,您就只能借助变通方法(如下所述)。
但是,如果您不打算使用锯齿状数组,您真正想要做的可能是将 pd.Series()
替换为 pd.DataFrame()
,例如:
import numpy as np
import pandas as pd
s = pd.DataFrame(
[[1, 2, 3],
[1, 2, 3],
[np.nan],
[1, 2, 3],
[1, 2, 3],
[np.nan]],
dtype=pd.Int64Dtype()) # to mix integers with NaNs
s.fillna(0)
# 0 1 2
# 0 1 2 3
# 1 1 2 3
# 2 0 0 0
# 3 1 2 3
# 4 1 2 3
# 5 0 0 0
如果您确实需要使用交错(锯齿状)数组,您可以使用其他答案中建议的任何变通方法,或者尝试下面这种做法,例如:
ii = s.isna()
nn = ii.sum()
s[ii] = pd.Series([[0, 0, 0]] * nn).to_numpy()
# 0 [1, 2, 3]
# 1 [1, 2, 3]
# 2 [0, 0, 0]
# 3 [1, 2, 3]
# 4 [1, 2, 3]
# 5 [0, 0, 0]
# dtype: object
其基本思路是使用 NumPy 掩码来填充系列。诀窍在于生成一个在 NumPy 层面可以直接用于赋值的兼容对象。
如果输入中 NaN 很多,采用类似的做法但改用 s.notna() 可能更高效/更快,例如:
import pandas as pd
result = pd.Series([[0, 0, 0]] * len(s))
result[s.notna()] = s[s.notna()]
让我们尝试做一些基准测试,其中:
replace_nan_isna()
来自上面
import pandas as pd
def replace_nan_isna(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` using a boolean mask.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object written into every missing slot (e.g. a tuple or list).
        inplace: When true, ``s`` itself is modified; otherwise a copy is
            modified and ``s`` is left untouched.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    target = s if inplace else s.copy()
    missing = target.isna()
    # One filler entry per missing slot, built via pd.Series and handed over
    # as a NumPy array so the masked assignment receives one object per slot.
    filler = pd.Series([value] * missing.sum()).to_numpy()
    target[missing] = filler
    return target
replace_nan_notna()
也是上面的
import pandas as pd
def replace_nan_notna(s, value, inplace=False):
    """Return a new Series equal to ``s`` with missing entries set to ``value``.

    Starts from a Series filled entirely with ``value`` and copies the
    non-missing entries of ``s`` over it.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: Must be false; this strategy always builds a new Series.

    Returns:
        A new Series carrying the same index as ``s``.

    Raises:
        ValueError: If ``inplace`` is true.
    """
    if inplace:
        raise ValueError("In-place not supported!")
    present = s.notna()
    # Reuse s.index so the mask and the copied values align with the result
    # even when s does not carry a default 0..n-1 index.
    result = pd.Series([value] * len(s), index=s.index)
    result[present] = s[present]
    return result
replace_nan_reindex()
来自上面使用 Series.reindex 的解决方案
def replace_nan_reindex(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` via ``dropna`` + ``reindex``.

    The missing rows are dropped and then re-created by reindexing against
    the original index, with ``value`` as the fill value.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: When true, the filled values are written back into ``s``;
            otherwise ``s`` is left untouched.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    # reindex() returns a new Series; the result must be kept, otherwise the
    # caller gets the unfilled input back.
    filled = s.dropna().reindex(s.index, fill_value=value)
    if not inplace:
        return filled
    # Positional write-back of the whole column as an object array.
    s.iloc[:] = filled.to_numpy()
    return s
replace_nan_fillna()
来自上面使用 fillna 并传入一个 Series 的解决方案
import pandas as pd
def replace_nan_fillna(s, value, inplace=False):
    """Fill the missing entries of ``s`` with ``value`` via ``Series.fillna``.

    ``fillna`` rejects a list as the fill value, so a Series holding
    ``value`` at every index label of ``s`` is passed instead.

    Args:
        s: Series whose missing entries are to be filled.
        value: Object used for every missing slot (e.g. a tuple or list).
        inplace: When true, ``s`` itself is filled; otherwise ``s`` is left
            untouched and a new Series is returned.

    Returns:
        The filled Series (``s`` itself when ``inplace`` is true).
    """
    filler = pd.Series([value] * len(s), index=s.index)
    if inplace:
        s.fillna(filler, inplace=True)
        return s
    # fillna() returns a new Series; the result must be kept, otherwise the
    # caller gets an unfilled copy back.
    return s.fillna(filler)
使用以下代码:
import numpy as np
import pandas as pd
def gen_data(n=5, k=2, p=0.7, obj=(1, 2, 3)):
    """Build a test Series mixing ``obj`` entries with NaN.

    A pattern of length ``n`` is made of ``int(p * n)`` copies of ``obj``
    followed by NaN for the remaining slots; the pattern is repeated ``k``
    times.

    Args:
        n: Length of one pattern.
        k: Number of times the pattern is repeated.
        p: Fraction of each pattern filled with ``obj``.
        obj: Object used for the non-missing entries.

    Returns:
        A Series of length ``n * k``.
    """
    filled = int(p * n)
    pattern = [obj] * filled + [np.nan] * (n - filled)
    return pd.Series(pattern * k)
funcs = replace_nan_isna, replace_nan_notna, replace_nan_reindex, replace_nan_fillna
# : inspect results
s = gen_data(5, 1)
for func in funcs:
print(f'{func.__name__:>20s} {func(s, value)}')
print()
# : generate benchmarks
s = gen_data(100, 1000)
value = (0, 0, 0)
base = funcs[0](s, value)
for func in funcs:
print(f'{func.__name__:>20s} {(func(s, value) == base).all()!s:>5}', end=' ')
%timeit func(s, value)
# replace_nan_isna True 100 loops, best of 5: 16.5 ms per loop
# replace_nan_notna True 10 loops, best of 5: 46.5 ms per loop
# replace_nan_reindex True 100 loops, best of 5: 9.74 ms per loop
# replace_nan_fillna True 10 loops, best of 5: 36.4 ms per loop
表明 reindex()
可能是最快的方法。