对 pandas cut 生成的 bin 进行排序
Sort bins from pandas cut
使用 pandas cut,我可以通过提供边界来定义 bins,pandas 会创建形如 (a, b] 的区间。
我的问题是:如何对这些 bins 进行排序(从最低到最高)?
import numpy as np
import pandas as pd
y = pd.Series(np.random.randn(100))
x1 = pd.Series(np.sign(np.random.randn(100)))
x2 = pd.cut(pd.Series(np.random.randn(100)), bins = [-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis = 1, keys = ['Y', 'X1', 'X2'])
我有一个中间结果,其中保留了 bin 的顺序
int_output = model.groupby(['X1', 'X2']).mean().unstack()
int_output.columns = int_output.columns.get_level_values(1)
X2 (-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.101475 -0.344419 -0.482992 -0.015179
1.0 0.249961 0.484757 -0.066383 -0.249414
但后来我执行了另一些操作,它们任意地改变了 bins 的顺序:
output = pd.concat(int_output.to_dict('series'), axis = 1)
(-0.5, 0] (-3, -0.5] (0, 0.5] (0.5, 3]
X1
-1.0 -0.344419 0.101475 -0.482992 -0.015179
1.0 0.484757 0.249961 -0.066383 -0.249414
现在我想用条形图绘制数据,但我希望 bins 按从最低 (-3, -0.5] 到最高 (0.5, 3] 的顺序排列。
我想我可以通过字符串操作来实现:按“,”拆分,然后去掉括号。但我想知道是否有更好的方法。
主要问题在于丢失了有序(ordered)的 CategoricalIndex。
# Fix the NumPy random seed so the random draws below are repeatable.
np.random.seed(12456)
# Y: 100 standard-normal samples.
y = pd.Series(np.random.randn(100))
# X1: the sign of 100 standard-normal samples.
x1 = pd.Series(np.sign(np.random.randn(100)))
# X2: 100 standard-normal samples binned with pd.cut on the given edges.
x2 = pd.cut(pd.Series(np.random.randn(100)), bins = [-3, -0.5, 0, 0.5, 3])
# Combine the three Series column-wise under the names Y, X1 and X2.
model = pd.concat([y, x1, x2], axis = 1, keys = ['Y', 'X1', 'X2'])
# Mean of Y per (X1, X2) group, with X2 moved from the row index to the columns.
int_output = model.groupby(['X1', 'X2']).mean().unstack()
# Keep only level 1 of the column MultiIndex (the X2 bins).
int_output.columns = int_output.columns.get_level_values(1)
print (int_output)
X2 (-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.230060 -0.079266 -0.079834 -0.064455
1.0 -0.451351 0.268688 0.020091 -0.280218
print (int_output.columns)
CategoricalIndex(['(-3, -0.5]', '(-0.5, 0]', '(0, 0.5]', '(0.5, 3]'],
categories=['(-3, -0.5]', '(-0.5, 0]', '(0, 0.5]', '(0.5, 3]'],
ordered=True, name='X2', dtype='category')
output = pd.concat(int_output.to_dict('series'), axis = 1)
print (output)
(-0.5, 0] (-3, -0.5] (0, 0.5] (0.5, 3]
X1
-1.0 -0.079266 0.230060 -0.079834 -0.064455
1.0 0.268688 -0.451351 0.020091 -0.280218
print (output.columns)
Index(['(-0.5, 0]', '(-3, -0.5]', '(0, 0.5]', '(0.5, 3]'], dtype='object')
一种可能的解决方案是:用 extract 从 output.columns 中提取每个区间的第一个数字,创建辅助 Series 并对其排序,最后用排序后的索引对原始列执行 reindex:
# Capture the text between "(" and the "," of each column label, i.e. the
# lower edge of the bin, and convert it to float.
# The pattern is a raw string so that "\(" is not read as an (invalid)
# string escape sequence; the regex itself is unchanged.
cat = output.columns.str.extract(r'\((.*),', expand=False).astype(float)
# Helper Series: index = column labels, values = lower edges, sorted ascending.
a = pd.Series(cat, index=output.columns).sort_values()
print (a)
(-3, -0.5] -3.0
(-0.5, 0] -0.5
(0, 0.5] 0.0
(0.5, 3] 0.5
dtype: float64
output = output.reindex(columns=a.index)
print (output)
(-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.230060 -0.079266 -0.079834 -0.064455
1.0 -0.451351 0.268688 0.020091 -0.280218
解决您在上面突出显示的问题的一个简单方法是简单地重新排序列:
# NOTE(review): sorted() compares the column labels themselves. When the labels
# are plain strings (as in the Index(..., dtype='object') printed earlier),
# the comparison is lexicographic, so '(-0.5, 0]' still sorts before
# '(-3, -0.5]' -- confirm the labels are orderable objects before relying on this.
output[sorted(output.columns)]
我做了一个函数来这样做。
def dfsortbybins(df, col):
    """
    Return the rows of ``df`` ordered by the lower edge of the bins in ``col``.

    param df: pandas dataframe
    param col: name of column containing bins, either ``pd.Interval`` objects
        (as produced by ``pd.cut``) or string labels such as "(0, 0.5]"
    return: new dataframe sorted from the lowest to the highest bin; rows whose
        bin is missing are placed last. ``df`` itself is not modified.
    """
    def lower_edge(label):
        # Missing bins map to NaN so that the sort pushes them to the end.
        if pd.isnull(label):
            return float('nan')
        if isinstance(label, pd.Interval):
            return float(label.left)
        # String label: the lower edge is the text between the opening
        # bracket and the first comma, e.g. "(-0.5, 0]" -> -0.5.
        return float(str(label).split(',')[0].lstrip('(['))

    # Derive the sort key from the column itself rather than from an external
    # ``bins`` variable, and keep it outside ``df`` so the caller's frame does
    # not gain a helper column. ``astype(object)`` makes categorical columns
    # map element by element.
    edges = df[col].astype(object).map(lower_edge).astype(float)
    # Stable positional sort: ties keep their original order, duplicate index
    # labels are safe, and NaN keys end up last.
    order = edges.to_numpy().argsort(kind='stable')
    return df.iloc[order]
这是另一个函数。与其他解决方案不同,这在多种情况下对我有用。我想我会把它留在这里,希望它能对将来遇到同样问题的一些人派上用场。
def sort_bins(bin_col):
    """
    Sorts bins after using pd.cut. Increasing order. Puts the missing ("NaN")
    bin at the beginning.

    Input:
        bin_col: pd.Series containing the bins to be sorted, either as
            pd.Interval objects or as string labels such as "(0, 0.5]"
            (with missing values given as NaN/None or the string "nan")
    Output:
        the unique bins of bin_col, ordered by their lower edge
    """
    # Compute the unique bins once; they are reused for the key and the result.
    uniques = bin_col.unique()

    def lower_edge(item):
        if isinstance(item, pd.Interval):
            return float(item.left)
        # Real missing values and the "nan" label of a stringified column
        # sort before every real bin.
        if pd.isnull(item) or item == "nan":
            return float("-inf")
        # String label: drop the opening bracket and keep the text before
        # the first comma, e.g. "(-0.5, 0]" -> -0.5.
        return float(str(item).split(",")[0][1:])

    edges = [lower_edge(item) for item in uniques]
    # Positions of the unique bins ordered by lower edge (sorted() is stable).
    order = sorted(range(len(edges)), key=edges.__getitem__)
    return uniques[order]
# Example, assuming "age_bin" column has the bins:
# NOTE(review): `df` is not defined in this snippet -- presumably an existing
# DataFrame supplied by the reader.
sorted_bins = sort_bins(df["age_bin"])
使用 pandas cut,我可以通过提供边界来定义 bins,pandas 会创建形如 (a, b] 的区间。
我的问题是:如何对这些 bins 进行排序(从最低到最高)?
import numpy as np
import pandas as pd
y = pd.Series(np.random.randn(100))
x1 = pd.Series(np.sign(np.random.randn(100)))
x2 = pd.cut(pd.Series(np.random.randn(100)), bins = [-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis = 1, keys = ['Y', 'X1', 'X2'])
我有一个中间结果,其中保留了 bin 的顺序
int_output = model.groupby(['X1', 'X2']).mean().unstack()
int_output.columns = int_output.columns.get_level_values(1)
X2 (-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.101475 -0.344419 -0.482992 -0.015179
1.0 0.249961 0.484757 -0.066383 -0.249414
但后来我执行了另一些操作,它们任意地改变了 bins 的顺序:
output = pd.concat(int_output.to_dict('series'), axis = 1)
(-0.5, 0] (-3, -0.5] (0, 0.5] (0.5, 3]
X1
-1.0 -0.344419 0.101475 -0.482992 -0.015179
1.0 0.484757 0.249961 -0.066383 -0.249414
现在我想用条形图绘制数据,但我希望 bins 按从最低 (-3, -0.5] 到最高 (0.5, 3] 的顺序排列。
我想我可以通过字符串操作来实现:按“,”拆分,然后去掉括号。但我想知道是否有更好的方法。
丢失了有序(ordered)的 CategoricalIndex。
# Fix the NumPy random seed so the random draws below are repeatable.
np.random.seed(12456)
# Y: 100 standard-normal samples.
y = pd.Series(np.random.randn(100))
# X1: the sign of 100 standard-normal samples.
x1 = pd.Series(np.sign(np.random.randn(100)))
# X2: 100 standard-normal samples binned with pd.cut on the given edges.
x2 = pd.cut(pd.Series(np.random.randn(100)), bins = [-3, -0.5, 0, 0.5, 3])
# Combine the three Series column-wise under the names Y, X1 and X2.
model = pd.concat([y, x1, x2], axis = 1, keys = ['Y', 'X1', 'X2'])
# Mean of Y per (X1, X2) group, with X2 moved from the row index to the columns.
int_output = model.groupby(['X1', 'X2']).mean().unstack()
# Keep only level 1 of the column MultiIndex (the X2 bins).
int_output.columns = int_output.columns.get_level_values(1)
print (int_output)
X2 (-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.230060 -0.079266 -0.079834 -0.064455
1.0 -0.451351 0.268688 0.020091 -0.280218
print (int_output.columns)
CategoricalIndex(['(-3, -0.5]', '(-0.5, 0]', '(0, 0.5]', '(0.5, 3]'],
categories=['(-3, -0.5]', '(-0.5, 0]', '(0, 0.5]', '(0.5, 3]'],
ordered=True, name='X2', dtype='category')
output = pd.concat(int_output.to_dict('series'), axis = 1)
print (output)
(-0.5, 0] (-3, -0.5] (0, 0.5] (0.5, 3]
X1
-1.0 -0.079266 0.230060 -0.079834 -0.064455
1.0 0.268688 -0.451351 0.020091 -0.280218
print (output.columns)
Index(['(-0.5, 0]', '(-3, -0.5]', '(0, 0.5]', '(0.5, 3]'], dtype='object')
一种可能的解决方案是:用 extract 从 output.columns 中提取每个区间的第一个数字,创建辅助 Series 并对其排序,最后用排序后的索引对原始列执行 reindex:
# Capture the text between "(" and the "," of each column label, i.e. the
# lower edge of the bin, and convert it to float.
# The pattern is a raw string so that "\(" is not read as an (invalid)
# string escape sequence; the regex itself is unchanged.
cat = output.columns.str.extract(r'\((.*),', expand=False).astype(float)
# Helper Series: index = column labels, values = lower edges, sorted ascending.
a = pd.Series(cat, index=output.columns).sort_values()
print (a)
(-3, -0.5] -3.0
(-0.5, 0] -0.5
(0, 0.5] 0.0
(0.5, 3] 0.5
dtype: float64
output = output.reindex(columns=a.index)
print (output)
(-3, -0.5] (-0.5, 0] (0, 0.5] (0.5, 3]
X1
-1.0 0.230060 -0.079266 -0.079834 -0.064455
1.0 -0.451351 0.268688 0.020091 -0.280218
解决您在上面突出显示的问题的一个简单方法是简单地重新排序列:
# NOTE(review): sorted() compares the column labels themselves. When the labels
# are plain strings (as in the Index(..., dtype='object') printed earlier),
# the comparison is lexicographic, so '(-0.5, 0]' still sorts before
# '(-3, -0.5]' -- confirm the labels are orderable objects before relying on this.
output[sorted(output.columns)]
我做了一个函数来这样做。
def dfsortbybins(df, col):
    """
    Return the rows of ``df`` ordered by the lower edge of the bins in ``col``.

    param df: pandas dataframe
    param col: name of column containing bins, either ``pd.Interval`` objects
        (as produced by ``pd.cut``) or string labels such as "(0, 0.5]"
    return: new dataframe sorted from the lowest to the highest bin; rows whose
        bin is missing are placed last. ``df`` itself is not modified.
    """
    def lower_edge(label):
        # Missing bins map to NaN so that the sort pushes them to the end.
        if pd.isnull(label):
            return float('nan')
        if isinstance(label, pd.Interval):
            return float(label.left)
        # String label: the lower edge is the text between the opening
        # bracket and the first comma, e.g. "(-0.5, 0]" -> -0.5.
        return float(str(label).split(',')[0].lstrip('(['))

    # Derive the sort key from the column itself rather than from an external
    # ``bins`` variable, and keep it outside ``df`` so the caller's frame does
    # not gain a helper column. ``astype(object)`` makes categorical columns
    # map element by element.
    edges = df[col].astype(object).map(lower_edge).astype(float)
    # Stable positional sort: ties keep their original order, duplicate index
    # labels are safe, and NaN keys end up last.
    order = edges.to_numpy().argsort(kind='stable')
    return df.iloc[order]
这是另一个函数。与其他解决方案不同,这在多种情况下对我有用。我想我会把它留在这里,希望它能对将来遇到同样问题的一些人派上用场。
def sort_bins(bin_col):
    """
    Sorts bins after using pd.cut. Increasing order. Puts the missing ("NaN")
    bin at the beginning.

    Input:
        bin_col: pd.Series containing the bins to be sorted, either as
            pd.Interval objects or as string labels such as "(0, 0.5]"
            (with missing values given as NaN/None or the string "nan")
    Output:
        the unique bins of bin_col, ordered by their lower edge
    """
    # Compute the unique bins once; they are reused for the key and the result.
    uniques = bin_col.unique()

    def lower_edge(item):
        if isinstance(item, pd.Interval):
            return float(item.left)
        # Real missing values and the "nan" label of a stringified column
        # sort before every real bin.
        if pd.isnull(item) or item == "nan":
            return float("-inf")
        # String label: drop the opening bracket and keep the text before
        # the first comma, e.g. "(-0.5, 0]" -> -0.5.
        return float(str(item).split(",")[0][1:])

    edges = [lower_edge(item) for item in uniques]
    # Positions of the unique bins ordered by lower edge (sorted() is stable).
    order = sorted(range(len(edges)), key=edges.__getitem__)
    return uniques[order]
# Example, assuming "age_bin" column has the bins:
# NOTE(review): `df` is not defined in this snippet -- presumably an existing
# DataFrame supplied by the reader.
sorted_bins = sort_bins(df["age_bin"])