onehotencoder的使用
Usage of onehotencoder
我是 Python 新手,之前只写过 VBA 代码。最近开始用 Python 做数据挖掘,但遇到了一个问题:
我无法用 OneHotEncoder 正确转换我的分类特征。这是我的代码:
from __future__ import print_function
import os import subprocess from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing import csv
import pandas as pd import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Load every row of raw2.csv into a list of lists, then convert it to a
# NumPy array. csv.reader yields each field as a string.
# NOTE(review): opening in 'rb' is the Python 2 convention for the csv
# module; on Python 3 csv.reader expects a text-mode file (newline='').
# NOTE(review): the indentation of the with/for bodies below appears to
# have been lost when this snippet was extracted.
datapoint = []
with open('raw2.csv', 'rb') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader: # Reading each row
data_point = []
for column in row: # Reading each column of the row
data_point.append((column))
datapoint.append(data_point)
datapoint = np.array(datapoint)
print(datapoint)
# Fit a one-hot encoder on the raw array.
# NOTE(review): this fit() call is the line the reported ValueError points
# at -- the array holds strings such as 'Tobermory', not integers.
enc = preprocessing.OneHotEncoder()
enc.fit(datapoint)
# NOTE(review): the transformed array is not assigned to anything, so the
# encoded features are discarded.
enc.transform(datapoint).toarray()
# NOTE(review): `df` is not defined anywhere in the visible code --
# presumably a pandas DataFrame of the same data; confirm.
features = list(df.columns[1:8])
print("* features:", features, sep="\n")
"#fit the decision tree"
# NOTE(review): `df[,0]` is not valid Python syntax; presumably the first
# column was intended as the target (e.g. df.iloc[:, 0]) -- confirm.
y = df[,0]
X = df[features]
# Train the decision tree on the selected feature columns.
dt = DecisionTreeClassifier(min_samples_split=5, random_state=51)
dt.fit(X, y)
""produce graphic visualization""
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree in Graphviz format to ``dt.dot`` in the current
    directory, then runs the ``dot`` command to render it to ``dt.png``.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    SystemExit -- if the ``dot`` command cannot be started or returns a
                  non-zero exit status.
    """
    import sys  # local import: only needed for the error exit below

    # The file is closed (and flushed) before dot is asked to read it.
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError):
        # OSError: the dot executable is missing or cannot be run.
        # CalledProcessError: dot ran but exited with a non-zero status.
        # A bare ``except:`` would also swallow KeyboardInterrupt.
        sys.exit("Could not run dot, ie graphviz, to "
                 "produce visualization")
visualize_tree(dt, features)
这是我的第一个数据集的样本
['Tobermory' 'Car' '2-3hr' 'Fall' '<0' '3 days' 'Male' '18 - 23']
这是我运行时遇到的错误:
ValueError Traceback (most recent call
last) <ipython-input-13-0bb2597d0276> in <module>()
25 enc = preprocessing.OneHotEncoder()
---> 26 enc.fit(datapoint)
27 enc.transform(datapoint).toarray()
ValueError: invalid literal for int() with base 10: 'Tobermory'
我认为您需要的是 sklearn.preprocessing.LabelBinarizer。
OneHotEncoder 接受整数输入,并据此创建虚拟变量。
http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
我是 Python 新手,之前只写过 VBA 代码。最近开始用 Python 做数据挖掘,但遇到了一个问题:
我无法用 OneHotEncoder 正确转换我的分类特征。这是我的代码:
from __future__ import print_function
import os import subprocess from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing import csv
import pandas as pd import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Load every row of raw2.csv into a list of lists, then convert it to a
# NumPy array. csv.reader yields each field as a string.
# NOTE(review): opening in 'rb' is the Python 2 convention for the csv
# module; on Python 3 csv.reader expects a text-mode file (newline='').
# NOTE(review): the indentation of the with/for bodies below appears to
# have been lost when this snippet was extracted.
datapoint = []
with open('raw2.csv', 'rb') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader: # Reading each row
data_point = []
for column in row: # Reading each column of the row
data_point.append((column))
datapoint.append(data_point)
datapoint = np.array(datapoint)
print(datapoint)
# Fit a one-hot encoder on the raw array.
# NOTE(review): this fit() call is the line the reported ValueError points
# at -- the array holds strings such as 'Tobermory', not integers.
enc = preprocessing.OneHotEncoder()
enc.fit(datapoint)
# NOTE(review): the transformed array is not assigned to anything, so the
# encoded features are discarded.
enc.transform(datapoint).toarray()
# NOTE(review): `df` is not defined anywhere in the visible code --
# presumably a pandas DataFrame of the same data; confirm.
features = list(df.columns[1:8])
print("* features:", features, sep="\n")
"#fit the decision tree"
# NOTE(review): `df[,0]` is not valid Python syntax; presumably the first
# column was intended as the target (e.g. df.iloc[:, 0]) -- confirm.
y = df[,0]
X = df[features]
# Train the decision tree on the selected feature columns.
dt = DecisionTreeClassifier(min_samples_split=5, random_state=51)
dt.fit(X, y)
""produce graphic visualization""
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree in Graphviz format to ``dt.dot`` in the current
    directory, then runs the ``dot`` command to render it to ``dt.png``.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    SystemExit -- if the ``dot`` command cannot be started or returns a
                  non-zero exit status.
    """
    import sys  # local import: only needed for the error exit below

    # The file is closed (and flushed) before dot is asked to read it.
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError):
        # OSError: the dot executable is missing or cannot be run.
        # CalledProcessError: dot ran but exited with a non-zero status.
        # A bare ``except:`` would also swallow KeyboardInterrupt.
        sys.exit("Could not run dot, ie graphviz, to "
                 "produce visualization")
visualize_tree(dt, features)
这是我的第一个数据集的样本
['Tobermory' 'Car' '2-3hr' 'Fall' '<0' '3 days' 'Male' '18 - 23']
这是我运行时遇到的错误:
ValueError Traceback (most recent call
last) <ipython-input-13-0bb2597d0276> in <module>()
25 enc = preprocessing.OneHotEncoder()
---> 26 enc.fit(datapoint)
27 enc.transform(datapoint).toarray()
ValueError: invalid literal for int() with base 10: 'Tobermory'
我认为您需要的是 sklearn.preprocessing.LabelBinarizer。
OneHotEncoder 接受整数输入,并据此创建虚拟变量。
http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html