使用 get_dummies 将分类值转换为数字?
Use get_dummies to turn categorical values to numeric?
我正在制作一个简单的可重现示例来了解训练和测试的工作原理:
例子
我想根据 Location 预测 Age:
import pandas as pd

# Toy dataset of people: one entry per person in each of the three lists.
data = {
    "Name": ["John", "Anna", "Peter", "Linda", "John", "John", "John"],
    "Location": ["Paris", "Paris", "Paris", "Paris", "New York", "Berlin", "London"],
    "Age": [24, 23, 21, 24, 36, 34, 36],
}

# Each dictionary key becomes a column of the DataFrame.
df = pd.DataFrame(data)
在下面这一部分,城市名称(字符串)会带来问题,因此我决定使用虚拟变量,但带有 get_dummies 的那一行并不正确。我认为需要把 Name 和 Location 这两列字符串都转换为虚拟变量。以下是我的尝试,正确的做法是什么?
# Asker's original attempt, kept as posted: it shows the problem being asked about.
from sklearn.model_selection import train_test_split
# Features: every column except the target 'Age' (the remaining columns hold raw strings).
X = df.drop('Age', axis=1)
# Target: the 'Age' column.
y = df['Age']
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
# NOTE(review): df.columns holds the column labels, so this call encodes the labels
# themselves rather than the row values, and it overwrites the X_train produced by
# the split above.
X_train = pd.get_dummies(df.columns) #<---- here is the issue probably
svclassifier.fit(X_train, y_train)
# X_test still holds the unencoded string columns taken from X at this point.
y_pred = svclassifier.predict(X_test)
svclassifier.fit(X_train, y_train) 需要一个数组作为输入,但你传入的是一个 pandas DataFrame。可以尝试用 sklearn.preprocessing.LabelEncoder 代替 pd.get_dummies。
编辑:LabelEncoder 和 OneHotEncoder 的示例:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# Toy dataset of people: one entry per person in each of the three lists.
data = {
    'Name': ["John", "Anna", "Peter", "Linda", "John", "John", "John"],
    'Location': ["Paris", "Paris", "Paris", "Paris", "New York", "Berlin", "London"],
    'Age': [24, 23, 21, 24, 36, 34, 36],
}

# Single feature (the city) and the target (the age), both as plain lists.
X = data['Location']
y = data['Age']

# --- Variant 1: LabelEncoder -------------------------------------------
print("Label Encoded")
label_encoder = LabelEncoder()
X_enc = label_encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.20, random_state=42
)
svclassifier = SVC(kernel='linear')
# The encoded feature is 1-D; reshape it into a single-column 2-D array
# before handing it to the classifier.
svclassifier.fit(X_train.reshape(-1, 1), y_train)
y_pred = svclassifier.predict(X_test.reshape(-1, 1))
print(f"y_pred: {y_pred}, y_test: {y_test}")

# --- Variant 2: OneHotEncoder ------------------------------------------
print("OneHot Encoded")
# The encoder is fitted on a single column of city names.
location_column = np.asarray(X).reshape(-1, 1)
onehot_encoder = OneHotEncoder()
X_oh = onehot_encoder.fit_transform(location_column)
X_train, X_test, y_train, y_test = train_test_split(
    X_oh, y, test_size=0.20, random_state=42
)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(f"y_pred: {y_pred}, y_test: {y_test}")
给出:
Label Encoded
y_pred: [24 24], y_test: [24, 23]
OneHot Encoded
y_pred: [24 24], y_test: [24, 23]
还不错。
您没有定义特征 (X) 和目标 (y)。X 是模型用来学习如何预测目标 y 的数据。由于您的特征 Name 和 Location 是类别型(categorical)的,因此需要使用 get_dummies 这样的自动编码工具。
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Features: dummy columns built from the two string columns.
categorical_columns = ['Name', 'Location']
X = pd.get_dummies(df[categorical_columns])

# Target: the column the model should learn to predict.
y = df['Age']

# Keep 20% of the rows aside for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Train a linear-kernel SVM on the training rows, then predict the held-out rows.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
我正在制作一个简单的可重现示例来了解训练和测试的工作原理:
例子
我想根据 Location 预测 Age:
import pandas as pd

# Toy dataset of people: one entry per person in each of the three lists.
data = {
    "Name": ["John", "Anna", "Peter", "Linda", "John", "John", "John"],
    "Location": ["Paris", "Paris", "Paris", "Paris", "New York", "Berlin", "London"],
    "Age": [24, 23, 21, 24, 36, 34, 36],
}

# Each dictionary key becomes a column of the DataFrame.
df = pd.DataFrame(data)
在下面这一部分,城市名称(字符串)会带来问题,因此我决定使用虚拟变量,但带有 get_dummies 的那一行并不正确。我认为需要把 Name 和 Location 这两列字符串都转换为虚拟变量。以下是我的尝试,正确的做法是什么?
# Asker's original attempt, kept as posted: it shows the problem being asked about.
from sklearn.model_selection import train_test_split
# Features: every column except the target 'Age' (the remaining columns hold raw strings).
X = df.drop('Age', axis=1)
# Target: the 'Age' column.
y = df['Age']
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
# NOTE(review): df.columns holds the column labels, so this call encodes the labels
# themselves rather than the row values, and it overwrites the X_train produced by
# the split above.
X_train = pd.get_dummies(df.columns) #<---- here is the issue probably
svclassifier.fit(X_train, y_train)
# X_test still holds the unencoded string columns taken from X at this point.
y_pred = svclassifier.predict(X_test)
svclassifier.fit(X_train, y_train) 需要一个数组作为输入,但你传入的是一个 pandas DataFrame。可以尝试用 sklearn.preprocessing.LabelEncoder 代替 pd.get_dummies。
编辑:LabelEncoder 和 OneHotEncoder 的示例:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# Toy dataset of people: one entry per person in each of the three lists.
data = {
    'Name': ["John", "Anna", "Peter", "Linda", "John", "John", "John"],
    'Location': ["Paris", "Paris", "Paris", "Paris", "New York", "Berlin", "London"],
    'Age': [24, 23, 21, 24, 36, 34, 36],
}

# Single feature (the city) and the target (the age), both as plain lists.
X = data['Location']
y = data['Age']

# --- Variant 1: LabelEncoder -------------------------------------------
print("Label Encoded")
label_encoder = LabelEncoder()
X_enc = label_encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.20, random_state=42
)
svclassifier = SVC(kernel='linear')
# The encoded feature is 1-D; reshape it into a single-column 2-D array
# before handing it to the classifier.
svclassifier.fit(X_train.reshape(-1, 1), y_train)
y_pred = svclassifier.predict(X_test.reshape(-1, 1))
print(f"y_pred: {y_pred}, y_test: {y_test}")

# --- Variant 2: OneHotEncoder ------------------------------------------
print("OneHot Encoded")
# The encoder is fitted on a single column of city names.
location_column = np.asarray(X).reshape(-1, 1)
onehot_encoder = OneHotEncoder()
X_oh = onehot_encoder.fit_transform(location_column)
X_train, X_test, y_train, y_test = train_test_split(
    X_oh, y, test_size=0.20, random_state=42
)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(f"y_pred: {y_pred}, y_test: {y_test}")
给出:
Label Encoded
y_pred: [24 24], y_test: [24, 23]
OneHot Encoded
y_pred: [24 24], y_test: [24, 23]
还不错。
您没有定义特征 (X) 和目标 (y)。X 是模型用来学习如何预测目标 y 的数据。由于您的特征 Name 和 Location 是类别型(categorical)的,因此需要使用 get_dummies 这样的自动编码工具。
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Features: dummy columns built from the two string columns.
categorical_columns = ['Name', 'Location']
X = pd.get_dummies(df[categorical_columns])

# Target: the column the model should learn to predict.
y = df['Age']

# Keep 20% of the rows aside for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Train a linear-kernel SVM on the training rows, then predict the held-out rows.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)