如何解码 pandas DataFrame 中的 LabelEncoder 编码列?
How to decode LabelEncoder encoded column in pandas DataFrame?
我正在处理一个数据集。我通过将分类对象转换为数字来练习特征工程,代码如下:
import pandas as pd
import numpy as np
from sklearn import preprocessing
# Load the training data, using the "Id" column as the row index.
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
df.head()

# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill gaps: numeric columns get the floored column mean, the remaining
# columns get their most frequent value (first row of mode()).
df[colsNum] = df[colsNum].fillna(df[colsNum].mean() // 1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

# NOTE: a single LabelEncoder instance is shared by every column. Each
# fit_transform() call refits it, so once this loop finishes it only holds
# the classes of the last column in colsObj.
label_encoder = preprocessing.LabelEncoder()
for col in colsObj:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

# Because the encoder above only remembers the last fitted column, decoding
# the other columns with it cannot restore their original values.
for col in colsObj:
    df[col] = label_encoder.inverse_transform(df[col])
df.head()
但是这里 inverse_transform() 没有返回原始数据集。请帮助我!
为了正确工作,需要为每一列单独创建一个 LabelEncoder,并把它们保存在字典中(问题中的代码对所有列复用同一个编码器,而它只会记住最后一次拟合的那一列):
from sklearn import preprocessing
# Read the training data with "Id" as the row index, then show its
# dimensions followed by the first rows.
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave NaN Reg
2 20 RL 80.0 9600 Pave NaN Reg
3 60 RL 68.0 11250 Pave NaN IR1
4 70 RL 60.0 9550 Pave NaN IR1
5 60 RL 84.0 14260 Pave NaN IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 NaN NaN NaN
2 Lvl AllPub FR2 ... 0 NaN NaN NaN
3 Lvl AllPub Inside ... 0 NaN NaN NaN
4 Lvl AllPub Corner ... 0 NaN NaN NaN
5 Lvl AllPub FR2 ... 0 NaN NaN NaN
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000
[5 rows x 80 columns]
# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill gaps: numeric columns get the floored column mean, the remaining
# columns get their most frequent value (first row of mode()).
df[colsNum] = df[colsNum].fillna(df[colsNum].mean() // 1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

from collections import defaultdict

# One LabelEncoder per column: looking up di[col] creates a fresh encoder the
# first time a column name is seen and returns that same encoder afterwards,
# so the fitted classes of every column stay available for decoding.
di = defaultdict(preprocessing.LabelEncoder)
for col in colsObj:
    df[col] = di[col].fit_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 3 65.0 8450 1 0 3
2 20 3 80.0 9600 1 0 3
3 60 3 68.0 11250 1 0 0
4 70 3 60.0 9550 1 0 0
5 60 3 84.0 14260 1 0 0
LandContour Utilities LotConfig ... PoolArea PoolQC Fence \
Id ...
1 3 0 4 ... 0 2 2
2 3 0 2 ... 0 2 2
3 3 0 4 ... 0 2 2
4 3 0 0 ... 0 2 2
5 3 0 2 ... 0 2 2
MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 2 0 2 2008 8 4 208500
2 2 0 5 2007 8 4 181500
3 2 0 9 2008 8 4 223500
4 2 0 2 2006 8 0 140000
5 2 0 12 2008 8 4 250000
[5 rows x 80 columns]
# Show the mapping of column name -> fitted LabelEncoder.
print(di)
defaultdict(<class 'sklearn.preprocessing._label.LabelEncoder'>, {'Alley': LabelEncoder(), 'BldgType': LabelEncoder(), 'BsmtCond': LabelEncoder(), 'BsmtExposure': LabelEncoder(), 'BsmtFinType1': LabelEncoder(), 'BsmtFinType2': LabelEncoder(), 'BsmtQual': LabelEncoder(), 'CentralAir': LabelEncoder(), 'Condition1': LabelEncoder(), 'Condition2': LabelEncoder(), 'Electrical': LabelEncoder(), 'ExterCond': LabelEncoder(), 'ExterQual': LabelEncoder(), 'Exterior1st': LabelEncoder(), 'Exterior2nd': LabelEncoder(), 'Fence': LabelEncoder(), 'FireplaceQu': LabelEncoder(), 'Foundation': LabelEncoder(), 'Functional': LabelEncoder(), 'GarageCond': LabelEncoder(), 'GarageFinish': LabelEncoder(), 'GarageQual': LabelEncoder(), 'GarageType': LabelEncoder(), 'Heating': LabelEncoder(), 'HeatingQC': LabelEncoder(), 'HouseStyle': LabelEncoder(), 'KitchenQual': LabelEncoder(), 'LandContour': LabelEncoder(), 'LandSlope': LabelEncoder(), 'LotConfig': LabelEncoder(), 'LotShape': LabelEncoder(), 'MSZoning': LabelEncoder(), 'MasVnrType': LabelEncoder(), 'MiscFeature': LabelEncoder(), 'Neighborhood': LabelEncoder(), 'PavedDrive': LabelEncoder(), 'PoolQC': LabelEncoder(), 'RoofMatl': LabelEncoder(), 'RoofStyle': LabelEncoder(), 'SaleCondition': LabelEncoder(), 'SaleType': LabelEncoder(), 'Street': LabelEncoder(), 'Utilities': LabelEncoder()})
# Decode each column with the encoder that was fitted on that same column.
for col in colsObj:
    df[col] = di[col].inverse_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave Grvl Reg
2 20 RL 80.0 9600 Pave Grvl Reg
3 60 RL 68.0 11250 Pave Grvl IR1
4 70 RL 60.0 9550 Pave Grvl IR1
5 60 RL 84.0 14260 Pave Grvl IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 Gd MnPrv Shed
2 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
3 Lvl AllPub Inside ... 0 Gd MnPrv Shed
4 Lvl AllPub Corner ... 0 Gd MnPrv Shed
5 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000
我正在处理一个数据集。我通过将分类对象转换为数字来练习特征工程,代码如下:
import pandas as pd
import numpy as np
from sklearn import preprocessing
# Load the training data, using the "Id" column as the row index.
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
df.head()

# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill gaps: numeric columns get the floored column mean, the remaining
# columns get their most frequent value (first row of mode()).
df[colsNum] = df[colsNum].fillna(df[colsNum].mean() // 1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

# NOTE: a single LabelEncoder instance is shared by every column. Each
# fit_transform() call refits it, so once this loop finishes it only holds
# the classes of the last column in colsObj.
label_encoder = preprocessing.LabelEncoder()
for col in colsObj:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

# Because the encoder above only remembers the last fitted column, decoding
# the other columns with it cannot restore their original values.
for col in colsObj:
    df[col] = label_encoder.inverse_transform(df[col])
df.head()
但是这里 inverse_transform() 没有返回原始数据集。请帮助我!
为了正确工作,需要为每一列单独创建一个 LabelEncoder,并把它们保存在字典中(问题中的代码对所有列复用同一个编码器,而它只会记住最后一次拟合的那一列):
from sklearn import preprocessing
# Read the training data with "Id" as the row index, then show its
# dimensions followed by the first rows.
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave NaN Reg
2 20 RL 80.0 9600 Pave NaN Reg
3 60 RL 68.0 11250 Pave NaN IR1
4 70 RL 60.0 9550 Pave NaN IR1
5 60 RL 84.0 14260 Pave NaN IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 NaN NaN NaN
2 Lvl AllPub FR2 ... 0 NaN NaN NaN
3 Lvl AllPub Inside ... 0 NaN NaN NaN
4 Lvl AllPub Corner ... 0 NaN NaN NaN
5 Lvl AllPub FR2 ... 0 NaN NaN NaN
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000
[5 rows x 80 columns]
# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill gaps: numeric columns get the floored column mean, the remaining
# columns get their most frequent value (first row of mode()).
df[colsNum] = df[colsNum].fillna(df[colsNum].mean() // 1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

from collections import defaultdict

# One LabelEncoder per column: looking up di[col] creates a fresh encoder the
# first time a column name is seen and returns that same encoder afterwards,
# so the fitted classes of every column stay available for decoding.
di = defaultdict(preprocessing.LabelEncoder)
for col in colsObj:
    df[col] = di[col].fit_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 3 65.0 8450 1 0 3
2 20 3 80.0 9600 1 0 3
3 60 3 68.0 11250 1 0 0
4 70 3 60.0 9550 1 0 0
5 60 3 84.0 14260 1 0 0
LandContour Utilities LotConfig ... PoolArea PoolQC Fence \
Id ...
1 3 0 4 ... 0 2 2
2 3 0 2 ... 0 2 2
3 3 0 4 ... 0 2 2
4 3 0 0 ... 0 2 2
5 3 0 2 ... 0 2 2
MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 2 0 2 2008 8 4 208500
2 2 0 5 2007 8 4 181500
3 2 0 9 2008 8 4 223500
4 2 0 2 2006 8 0 140000
5 2 0 12 2008 8 4 250000
[5 rows x 80 columns]
# Show the mapping of column name -> fitted LabelEncoder.
print(di)
defaultdict(<class 'sklearn.preprocessing._label.LabelEncoder'>, {'Alley': LabelEncoder(), 'BldgType': LabelEncoder(), 'BsmtCond': LabelEncoder(), 'BsmtExposure': LabelEncoder(), 'BsmtFinType1': LabelEncoder(), 'BsmtFinType2': LabelEncoder(), 'BsmtQual': LabelEncoder(), 'CentralAir': LabelEncoder(), 'Condition1': LabelEncoder(), 'Condition2': LabelEncoder(), 'Electrical': LabelEncoder(), 'ExterCond': LabelEncoder(), 'ExterQual': LabelEncoder(), 'Exterior1st': LabelEncoder(), 'Exterior2nd': LabelEncoder(), 'Fence': LabelEncoder(), 'FireplaceQu': LabelEncoder(), 'Foundation': LabelEncoder(), 'Functional': LabelEncoder(), 'GarageCond': LabelEncoder(), 'GarageFinish': LabelEncoder(), 'GarageQual': LabelEncoder(), 'GarageType': LabelEncoder(), 'Heating': LabelEncoder(), 'HeatingQC': LabelEncoder(), 'HouseStyle': LabelEncoder(), 'KitchenQual': LabelEncoder(), 'LandContour': LabelEncoder(), 'LandSlope': LabelEncoder(), 'LotConfig': LabelEncoder(), 'LotShape': LabelEncoder(), 'MSZoning': LabelEncoder(), 'MasVnrType': LabelEncoder(), 'MiscFeature': LabelEncoder(), 'Neighborhood': LabelEncoder(), 'PavedDrive': LabelEncoder(), 'PoolQC': LabelEncoder(), 'RoofMatl': LabelEncoder(), 'RoofStyle': LabelEncoder(), 'SaleCondition': LabelEncoder(), 'SaleType': LabelEncoder(), 'Street': LabelEncoder(), 'Utilities': LabelEncoder()})
# Decode each column with the encoder that was fitted on that same column.
for col in colsObj:
    df[col] = di[col].inverse_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave Grvl Reg
2 20 RL 80.0 9600 Pave Grvl Reg
3 60 RL 68.0 11250 Pave Grvl IR1
4 70 RL 60.0 9550 Pave Grvl IR1
5 60 RL 84.0 14260 Pave Grvl IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 Gd MnPrv Shed
2 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
3 Lvl AllPub Inside ... 0 Gd MnPrv Shed
4 Lvl AllPub Corner ... 0 Gd MnPrv Shed
5 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000