SVM 在我的数据中提供了一个糟糕的结果。怎么修?
SVM provided a bad result in my data. How to fix?
我有一个数据集,其中包含 510 个训练样本和 127 个测试样本,每个样本有 7680 个特征。我想设计一个模型来预测高度(厘米),也就是训练数据中的标签。目前我使用的是 SVM,但它给出的结果非常糟糕。你能看看我的代码并给我一些建议吗?您可以使用数据集(dataset)和可运行代码,在您的机器上尝试。
import numpy as np
from sklearn.svm import SVR
# Training data (shapes as noted by the author).
train_X = np.loadtxt('trainX.txt')  # 510 x 7680
train_Y = np.loadtxt('trainY.txt')  # 510 x 1

# Held-out data used only for the final comparison.
test_X = np.loadtxt('testX.txt')  # 127 x 7680
test_Y = np.loadtxt('testY.txt')  # 127 x 1

# Fit the regressor directly on the raw features; no kernel argument is
# given, so the library default is used.
my_svr = SVR(C=1000, epsilon=0.2)
my_svr.fit(train_X, train_Y)

# Print the predictions followed by the true labels for a visual check.
p_regression = my_svr.predict(test_X)
print(p_regression)
print(test_Y)
一些结果:
p_regression
[15.67367165 16.35094166 13.10510262 14.03943211 12.7116549 11.45071423
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821 12.42556347 14.54712287 12.25965911 9.04101931 14.03604126
12.41237627 13.51951317 10.36302674 9.86389635 11.41448842 15.67146184
14.74764672 11.22794536 12.04429175 12.48199183 14.29790809 16.21724184
10.94478135 9.68210872 14.8663311 8.62974573 15.17281425 12.97230127
9.46515876 16.24388177 10.35742683 15.65336366 11.04652502 16.35094166
14.03943211 10.29066405 13.27225207 9.44959181 10.45775627 13.23953143
14.95568324 11.35994414 10.69531821 12.42556347 14.54712287 12.25965911
9.04101931 14.03604126 12.41237627 13.51951317 10.36302674 9.86389635
11.41448842 15.67146184 14.74764672 11.22794536 12.04429175 12.48199183
14.29790809 16.21724184 10.94478135 9.68210872 14.8663311 8.62974573
15.17281425 12.97230127 9.46515876 16.24388177 10.35742683 15.65336366
11.04652502 16.35094166 14.03943211 10.29066405 13.27225207 9.44959181
10.45775627 13.23953143 14.95568324 11.35994414 10.69531821 12.42556347
14.54712287 12.25965911 9.04101931 14.03604126 12.41237627 13.51951317
10.36302674 9.86389635 11.41448842 15.67146184 14.74764672 11.22794536
12.04429175 12.48199183 14.29790809 16.21724184 10.94478135 9.68210872
14.8663311 8.62974573 15.17281425 12.97230127 9.46515876 16.24388177
10.35742683 15.65336366 11.04652502 16.35094166 14.03943211 10.29066405
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821]
test_Y
[13. 14. 13. 15. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6. 6. 8.
9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12. 15. 15.
16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6.
6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12.
15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3.
4. 6. 6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14.
13. 12. 15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6.
4.]
根据你的数据集,特征的维度(7680 个)似乎太高了。在使用 SVM 处理之前,最好先用特征分组之类的算法降低特征维度。
我同意——测试集确实"有些不对劲"("there is something "wrong" with the test set")。我得到的 MSE 结果也类似,大约为 21。
我还尝试将训练和测试数据集放在一起并将其提供给 GridSearchCV。
以下是这些尝试的结果:
In [33]: print_grid_results(grid)
----------------------------- [SVR_rbf] ------------------------------
Score: 48.98%
Parameters: {'SVR_rbf__C': 5, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 64.07%
Parameters: {'SVR_linear__C': 0.1, 'SVR_linear__max_iter': 500}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 63.98%
Parameters: {'Ridge__alpha': 100, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 60.36%
Parameters: {'Lasso__alpha': 0.001, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 44.01%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 100}
**********************************************************************
另外,不同的交叉验证划分(split)给出的测试分数差别很大:
In [43]: clf = grid['SVR_linear']
In [44]: {k:v for k,v in clf.cv_results_.items() if k.endswith('_test_score')}
Out[44]:
{'mean_test_score': array([0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 ]),
'rank_test_score': array([1, 4, 7, 1, 4, 7, 1, 4, 7]),
'split0_test_score': array([0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802]),
'split1_test_score': array([0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475]),
'split2_test_score': array([0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214]),
'std_test_score': array([0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954])}
完整代码如下:
import os
#import contextlib
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
def get_data_split(path='.'):
    """Load the pre-split train and test files found under *path*.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the two targets are
        column 0 of their respective files.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return X_train, y_train[0], X_test, y_test[0]
def get_data(path='.'):
    """Load the train and test files under *path* and stack them together.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table. Train rows come first,
    followed by test rows, with a fresh 0..n-1 index.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X, y)``: the concatenated feature table and column 0 of
        the concatenated target table.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return (pd.concat([X_train, X_test], ignore_index=True),
            pd.concat([y_train, y_test], ignore_index=True)[0])
def fit_all_classifiers_grid(X, y, classifiers, **common_grid_kwargs):
    """Grid-search every estimator described in *classifiers*.

    Each entry is a dict with the keys ``'name'``, ``'clf'`` and
    ``'param_grid'``. The estimator is placed behind a ``StandardScaler``
    in a pipeline whose final step is named after the entry, and the
    fitted search is pickled to ``./<name>.pkl``.

    Args:
        X: Feature table passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        classifiers: Iterable of estimator descriptions (see above).
        **common_grid_kwargs: Extra keyword arguments forwarded to every
            ``GridSearchCV``.

    Returns:
        Dict mapping each entry name to its fitted ``GridSearchCV``.
    """
    fitted = {}
    for spec in classifiers:
        label = spec['name']
        print('{:-^70}'.format(' [' + label + '] '))
        steps = [("scale", StandardScaler()), (label, spec['clf'])]
        search = GridSearchCV(Pipeline(steps),
                              param_grid=spec['param_grid'],
                              **common_grid_kwargs)
        fitted[label] = search.fit(X, y)
        # Persist each fitted search separately.
        joblib.dump(fitted[label], './{}.pkl'.format(label))
    return fitted
def test_dataset(grid, X_test, y_test):
    """Score every fitted model in *grid* on the given data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values to compare against.

    Returns:
        Dict mapping each name to ``{'MSE': ..., 'R2': ...}``.
    """
    def score(estimator):
        # Predict once and reuse the result for both metrics.
        predicted = estimator.predict(X_test)
        return {'MSE': mean_squared_error(y_test, predicted),
                'R2': r2_score(y_test, predicted)}

    return {label: score(estimator) for label, estimator in grid.items()}
def print_grid_results(grids):
    """Print the best score and best parameters of every search in *grids*.

    Args:
        grids: Dict mapping a name to an object exposing ``best_score_``
            and ``best_params_``.
    """
    separator = '*' * 70
    for label in grids:
        search = grids[label]
        banner = ' [' + label + '] '
        print('{:-^70}'.format(banner))
        # The score is rendered as a percentage with two decimals.
        print('Score:\t\t{:.2%}'.format(search.best_score_))
        print('Parameters:\t{}'.format(search.best_params_))
        print(separator)
# Estimators to grid-search. 'name' doubles as the pipeline step name, so
# every 'param_grid' key is prefixed with '<name>__'.
classifiers = [
    {
        'name': 'SVR_rbf',
        'clf': SVR(),
        'title': "SVR_rbf",
        'param_grid': {
            'SVR_rbf__C': [0.1, 1, 5],
            'SVR_rbf__max_iter': [500, 1000, 5000],
        },
    },
    {
        'name': 'SVR_linear',
        'clf': SVR(kernel='linear'),
        # Was "SVR_rbf": copy-paste slip from the entry above.
        'title': "SVR_linear",
        'param_grid': {
            'SVR_linear__C': [0.1, 1, 5],
            'SVR_linear__max_iter': [500, 1000, 5000],
        },
    },
    {
        'name': 'Ridge',
        'clf': Ridge(),
        'title': "Ridge",
        'param_grid': {
            'Ridge__alpha': [0.1, 1, 5, 10, 50, 100],
            'Ridge__max_iter': [200, 500],
        },
    },
    {
        'name': 'Lasso',
        'clf': Lasso(),
        'title': "Lasso",
        'param_grid': {
            'Lasso__alpha': [0.001, 0.01, 0.1, 1, 5, 10],
            'Lasso__max_iter': [1000, 5000],
        },
    },
    {
        'name': 'RandomForest',
        'clf': RandomForestRegressor(),
        'title': "RandomForest",
        'param_grid': {
            'RandomForest__n_estimators': [10, 100],
            'RandomForest__max_depth': [3, 5],
        },
    },
]
def main(path):
    """Grid-search all models on the combined train+test data under *path*.

    Changes the working directory to *path*, loads the concatenated data
    set, runs a 3-fold grid search for every entry in ``classifiers`` and
    prints the best score and parameters of each search.

    Args:
        path: Directory containing the data files.
    """
    os.chdir(path)
    features, target = get_data(path)
    search_options = dict(cv=3, verbose=2, n_jobs=-1)
    results = fit_all_classifiers_grid(features, target, classifiers,
                                       **search_options)
    print_grid_results(results)
    # Alternative kept for reference: tune on the original train split
    # only and score on the original test split.
    #X_train, y_train, X_test, y_test = get_data_split(path)
    #grid = fit_all_classifiers_grid(X_train, y_train, classifiers, cv=2, verbose=2, n_jobs=-1)
    #res = test_dataset(grid, X_test, y_test)
    #print(res)
PS:抱歉使用了 classifier 而不是 regressor 这个名称——我只是重用了以前搜索最佳 classifier 时写的旧代码。
这是一个类似的方法。我们把数据集划分为 train 和 test 两部分。train 数据集用于调整超参数并拟合不同的模型;然后我们根据 MSE 选出最佳模型,并用它预测 test 数据集中的值。
所有经过训练(拟合)的模型都会保存为 Pickle 文件,因此之后可以使用 joblib.load() 方法加载它们。
输出:
----------------------------- [SVR_rbf] ------------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
---------------------------- [SVR_linear] ----------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
------------------------------ [Ridge] -------------------------------
Fitting 3 folds for each of 7 candidates, totalling 21 fits
------------------------------ [Lasso] -------------------------------
Fitting 3 folds for each of 6 candidates, totalling 18 fits
--------------------------- [RandomForest] ---------------------------
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------------------------- [SVR_rbf] ------------------------------
Score: 44.88%
Parameters: {'SVR_rbf__C': 10, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 33.40%
Parameters: {'SVR_linear__C': 0.01, 'SVR_linear__max_iter': 1000}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 34.83%
Parameters: {'Ridge__alpha': 500, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 22.90%
Parameters: {'Lasso__alpha': 0.1, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 36.87%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 250}
**********************************************************************
Mean Squared Error: {'SVR_rbf': 5.375, 'SVR_linear': 7.036, 'Ridge': 7.02, 'Lasso': 8.108, 'RandomForest': 9.475}
代码:
import os
#import contextlib
from operator import itemgetter
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
def get_data(path='.'):
    """Load the train and test files under *path* and stack them together.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table. Train rows come first,
    followed by test rows, with a fresh 0..n-1 index.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X, y)``: the concatenated feature table and column 0 of
        the concatenated target table.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return (pd.concat([X_train, X_test], ignore_index=True),
            pd.concat([y_train, y_test], ignore_index=True)[0])
def get_data_split(path='.', test_size=0.25, random_state=None):
    """Load the combined data set and split it randomly into train and test.

    Args:
        path: Directory containing the data files read by ``get_data``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int to
            get the same split on every run. The default ``None`` keeps
            the previous non-deterministic behaviour.

    Returns:
        The ``train_test_split`` result, unpacked by callers as
        ``X_train, X_test, y_train, y_test``.
    """
    X, y = get_data(path)
    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state)
def tune_models_hyperparams(X, y, models, **common_grid_kwargs):
    """Grid-search the hyper-parameters of every model in *models*.

    Each entry is a dict with the keys ``'name'``, ``'model'`` and
    ``'param_grid'``. The estimator is placed behind a ``StandardScaler``
    in a pipeline whose final step is named after the entry, and the
    fitted search is pickled to ``./<name>.pkl``.

    Args:
        X: Feature table passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        models: Iterable of model descriptions (see above).
        **common_grid_kwargs: Extra keyword arguments forwarded to every
            ``GridSearchCV``.

    Returns:
        Dict mapping each entry name to its fitted ``GridSearchCV``.
    """
    fitted = {}
    for spec in models:
        label = spec['name']
        print('{:-^70}'.format(' [' + label + '] '))
        steps = [("scale", StandardScaler()), (label, spec['model'])]
        search = GridSearchCV(Pipeline(steps),
                              param_grid=spec['param_grid'],
                              **common_grid_kwargs)
        fitted[label] = search.fit(X, y)
        # Persist each fitted search separately.
        joblib.dump(fitted[label], './{}.pkl'.format(label))
    return fitted
def get_best_model(grid, X_test, y_test,
                   metric_func=mean_squared_error):
    """Return the fitted model with the lowest metric value on the test data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values.
        metric_func: Callable ``metric_func(y_true, y_pred)`` returning a
            number where smaller is better.

    Returns:
        The entry of *grid* whose rounded metric value is smallest; on a
        tie the first such entry in *grid* order wins.

    Raises:
        ValueError: If *grid* is empty.
    """
    res = {name: round(metric_func(y_test, model.predict(X_test)), 3)
           for name, model in grid.items()}
    print('Mean Squared Error:', res)
    # Rank by score. Iterating a dict yields its keys, so the previous
    # key=itemgetter(1) compared the second character of each model *name*.
    best_model_name = min(res, key=res.get)
    return grid[best_model_name]
def test_dataset(grid, X_test, y_test):
    """Score every fitted model in *grid* on the given data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values to compare against.

    Returns:
        Dict mapping each name to ``{'MSE': ..., 'R2': ...}``.
    """
    def score(estimator):
        # Predict once and reuse the result for both metrics.
        predicted = estimator.predict(X_test)
        return {'MSE': mean_squared_error(y_test, predicted),
                'R2': r2_score(y_test, predicted)}

    return {label: score(estimator) for label, estimator in grid.items()}
def predict(grid, X_test, model_name):
    """Predict with the model stored in *grid* under *model_name*.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to ``predict``.
        model_name: Key of the model to use.

    Returns:
        Whatever the selected model's ``predict`` returns.

    Raises:
        KeyError: If *model_name* is not a key of *grid*.
    """
    chosen = grid[model_name]
    return chosen.predict(X_test)
def print_grid_results(grids):
    """Print the best score and best parameters of every search in *grids*.

    Args:
        grids: Dict mapping a name to an object exposing ``best_score_``
            and ``best_params_``.
    """
    separator = '*' * 70
    for label in grids:
        search = grids[label]
        banner = ' [' + label + '] '
        print('{:-^70}'.format(banner))
        # The score is rendered as a percentage with two decimals.
        print('Score:\t\t{:.2%}'.format(search.best_score_))
        print('Parameters:\t{}'.format(search.best_params_))
        print(separator)
# Models to tune. 'name' doubles as the pipeline step name, so every
# 'param_grid' key is prefixed with '<name>__'.
models = [
    {
        'name': 'SVR_rbf',
        'model': SVR(),
        'title': "SVR_rbf",
        'param_grid': {
            'SVR_rbf__C': [0.1, 1, 5, 10],
            'SVR_rbf__max_iter': [500],
        },
    },
    {
        'name': 'SVR_linear',
        'model': SVR(kernel='linear'),
        # Was "SVR_rbf": copy-paste slip from the entry above.
        'title': "SVR_linear",
        'param_grid': {
            'SVR_linear__C': [0.01, 0.1, 1, 5],
            'SVR_linear__max_iter': [1000],
        },
    },
    {
        'name': 'Ridge',
        'model': Ridge(),
        'title': "Ridge",
        'param_grid': {
            'Ridge__alpha': [0.1, 0.5, 5, 10, 50, 100, 500],
            'Ridge__max_iter': [200],
        },
    },
    {
        'name': 'Lasso',
        'model': Lasso(),
        'title': "Lasso",
        'param_grid': {
            'Lasso__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'Lasso__max_iter': [1000],
        },
    },
    {
        'name': 'RandomForest',
        'model': RandomForestRegressor(),
        'title': "RandomForest",
        'param_grid': {
            'RandomForest__n_estimators': [50, 250, 500],
            'RandomForest__max_depth': [5],
        },
    },
]
def main(path):
    """Tune all models, pick the best on the test split and save its output.

    Changes the working directory to *path*, splits the combined data
    set, runs a 3-fold grid search for every entry in ``models``, prints
    the search results, selects a model with ``get_best_model`` and writes
    its test-split predictions to ``predicted.csv``.

    Args:
        path: Directory containing the data files.
    """
    os.chdir(str(path))
    # NOTE(review): 127/510 is passed as a fraction of the combined
    # train+test data, not as a sample count - confirm this is intended.
    split = get_data_split(path, test_size=127/510.)
    X_train, X_test, y_train, y_test = split
    searches = tune_models_hyperparams(X_train, y_train, models, cv=3,
                                       verbose=2, n_jobs=-1)
    print_grid_results(searches)
    best = get_best_model(searches, X_test, y_test)
    predictions = best.predict(X_test)
    pd.DataFrame({'predicted': predictions}).to_csv('predicted.csv',
                                                    index=False)
if __name__ == "__main__":
    # Run against the directory that contains this script.
    script_dir = Path(__file__).parent.resolve()
    main(script_dir)
我有一个数据集,其中包含 510 个训练样本和 127 个测试样本,每个样本有 7680 个特征。我想设计一个模型来预测高度(厘米),也就是训练数据中的标签。目前我使用的是 SVM,但它给出的结果非常糟糕。你能看看我的代码并给我一些建议吗?您可以使用数据集(dataset)和可运行代码自行尝试。
import numpy as np
from sklearn.svm import SVR
# Training data (shapes as noted by the author).
train_X = np.loadtxt('trainX.txt')  # 510 x 7680
train_Y = np.loadtxt('trainY.txt')  # 510 x 1

# Held-out data used only for the final comparison.
test_X = np.loadtxt('testX.txt')  # 127 x 7680
test_Y = np.loadtxt('testY.txt')  # 127 x 1

# Fit the regressor directly on the raw features; no kernel argument is
# given, so the library default is used.
my_svr = SVR(C=1000, epsilon=0.2)
my_svr.fit(train_X, train_Y)

# Print the predictions followed by the true labels for a visual check.
p_regression = my_svr.predict(test_X)
print(p_regression)
print(test_Y)
一些结果:
p_regression
[15.67367165 16.35094166 13.10510262 14.03943211 12.7116549 11.45071423
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821 12.42556347 14.54712287 12.25965911 9.04101931 14.03604126
12.41237627 13.51951317 10.36302674 9.86389635 11.41448842 15.67146184
14.74764672 11.22794536 12.04429175 12.48199183 14.29790809 16.21724184
10.94478135 9.68210872 14.8663311 8.62974573 15.17281425 12.97230127
9.46515876 16.24388177 10.35742683 15.65336366 11.04652502 16.35094166
14.03943211 10.29066405 13.27225207 9.44959181 10.45775627 13.23953143
14.95568324 11.35994414 10.69531821 12.42556347 14.54712287 12.25965911
9.04101931 14.03604126 12.41237627 13.51951317 10.36302674 9.86389635
11.41448842 15.67146184 14.74764672 11.22794536 12.04429175 12.48199183
14.29790809 16.21724184 10.94478135 9.68210872 14.8663311 8.62974573
15.17281425 12.97230127 9.46515876 16.24388177 10.35742683 15.65336366
11.04652502 16.35094166 14.03943211 10.29066405 13.27225207 9.44959181
10.45775627 13.23953143 14.95568324 11.35994414 10.69531821 12.42556347
14.54712287 12.25965911 9.04101931 14.03604126 12.41237627 13.51951317
10.36302674 9.86389635 11.41448842 15.67146184 14.74764672 11.22794536
12.04429175 12.48199183 14.29790809 16.21724184 10.94478135 9.68210872
14.8663311 8.62974573 15.17281425 12.97230127 9.46515876 16.24388177
10.35742683 15.65336366 11.04652502 16.35094166 14.03943211 10.29066405
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821]
test_Y
[13. 14. 13. 15. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6. 6. 8.
9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12. 15. 15.
16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6.
6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12.
15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3.
4. 6. 6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14.
13. 12. 15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6.
4.]
根据你的数据集,特征的维度(7680 个)似乎太高了。在使用 SVM 处理之前,最好先用特征分组之类的算法降低特征维度。
我同意——测试集确实"有些不对劲"("there is something "wrong" with the test set")。我得到的 MSE 结果也类似,大约为 21。
我还尝试将训练和测试数据集放在一起并将其提供给 GridSearchCV。
以下是这些尝试的结果:
In [33]: print_grid_results(grid)
----------------------------- [SVR_rbf] ------------------------------
Score: 48.98%
Parameters: {'SVR_rbf__C': 5, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 64.07%
Parameters: {'SVR_linear__C': 0.1, 'SVR_linear__max_iter': 500}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 63.98%
Parameters: {'Ridge__alpha': 100, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 60.36%
Parameters: {'Lasso__alpha': 0.001, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 44.01%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 100}
**********************************************************************
另外,不同的交叉验证划分(split)给出的测试分数差别很大:
In [43]: clf = grid['SVR_linear']
In [44]: {k:v for k,v in clf.cv_results_.items() if k.endswith('_test_score')}
Out[44]:
{'mean_test_score': array([0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 ]),
'rank_test_score': array([1, 4, 7, 1, 4, 7, 1, 4, 7]),
'split0_test_score': array([0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802]),
'split1_test_score': array([0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475]),
'split2_test_score': array([0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214]),
'std_test_score': array([0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954])}
完整代码如下:
import os
#import contextlib
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
def get_data_split(path='.'):
    """Load the pre-split train and test files found under *path*.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the two targets are
        column 0 of their respective files.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return X_train, y_train[0], X_test, y_test[0]
def get_data(path='.'):
    """Load the train and test files under *path* and stack them together.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table. Train rows come first,
    followed by test rows, with a fresh 0..n-1 index.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X, y)``: the concatenated feature table and column 0 of
        the concatenated target table.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return (pd.concat([X_train, X_test], ignore_index=True),
            pd.concat([y_train, y_test], ignore_index=True)[0])
def fit_all_classifiers_grid(X, y, classifiers, **common_grid_kwargs):
    """Grid-search every estimator described in *classifiers*.

    Each entry is a dict with the keys ``'name'``, ``'clf'`` and
    ``'param_grid'``. The estimator is placed behind a ``StandardScaler``
    in a pipeline whose final step is named after the entry, and the
    fitted search is pickled to ``./<name>.pkl``.

    Args:
        X: Feature table passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        classifiers: Iterable of estimator descriptions (see above).
        **common_grid_kwargs: Extra keyword arguments forwarded to every
            ``GridSearchCV``.

    Returns:
        Dict mapping each entry name to its fitted ``GridSearchCV``.
    """
    fitted = {}
    for spec in classifiers:
        label = spec['name']
        print('{:-^70}'.format(' [' + label + '] '))
        steps = [("scale", StandardScaler()), (label, spec['clf'])]
        search = GridSearchCV(Pipeline(steps),
                              param_grid=spec['param_grid'],
                              **common_grid_kwargs)
        fitted[label] = search.fit(X, y)
        # Persist each fitted search separately.
        joblib.dump(fitted[label], './{}.pkl'.format(label))
    return fitted
def test_dataset(grid, X_test, y_test):
    """Score every fitted model in *grid* on the given data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values to compare against.

    Returns:
        Dict mapping each name to ``{'MSE': ..., 'R2': ...}``.
    """
    def score(estimator):
        # Predict once and reuse the result for both metrics.
        predicted = estimator.predict(X_test)
        return {'MSE': mean_squared_error(y_test, predicted),
                'R2': r2_score(y_test, predicted)}

    return {label: score(estimator) for label, estimator in grid.items()}
def print_grid_results(grids):
    """Print the best score and best parameters of every search in *grids*.

    Args:
        grids: Dict mapping a name to an object exposing ``best_score_``
            and ``best_params_``.
    """
    separator = '*' * 70
    for label in grids:
        search = grids[label]
        banner = ' [' + label + '] '
        print('{:-^70}'.format(banner))
        # The score is rendered as a percentage with two decimals.
        print('Score:\t\t{:.2%}'.format(search.best_score_))
        print('Parameters:\t{}'.format(search.best_params_))
        print(separator)
# Estimators to grid-search. 'name' doubles as the pipeline step name, so
# every 'param_grid' key is prefixed with '<name>__'.
classifiers = [
    {
        'name': 'SVR_rbf',
        'clf': SVR(),
        'title': "SVR_rbf",
        'param_grid': {
            'SVR_rbf__C': [0.1, 1, 5],
            'SVR_rbf__max_iter': [500, 1000, 5000],
        },
    },
    {
        'name': 'SVR_linear',
        'clf': SVR(kernel='linear'),
        # Was "SVR_rbf": copy-paste slip from the entry above.
        'title': "SVR_linear",
        'param_grid': {
            'SVR_linear__C': [0.1, 1, 5],
            'SVR_linear__max_iter': [500, 1000, 5000],
        },
    },
    {
        'name': 'Ridge',
        'clf': Ridge(),
        'title': "Ridge",
        'param_grid': {
            'Ridge__alpha': [0.1, 1, 5, 10, 50, 100],
            'Ridge__max_iter': [200, 500],
        },
    },
    {
        'name': 'Lasso',
        'clf': Lasso(),
        'title': "Lasso",
        'param_grid': {
            'Lasso__alpha': [0.001, 0.01, 0.1, 1, 5, 10],
            'Lasso__max_iter': [1000, 5000],
        },
    },
    {
        'name': 'RandomForest',
        'clf': RandomForestRegressor(),
        'title': "RandomForest",
        'param_grid': {
            'RandomForest__n_estimators': [10, 100],
            'RandomForest__max_depth': [3, 5],
        },
    },
]
def main(path):
    """Grid-search all models on the combined train+test data under *path*.

    Changes the working directory to *path*, loads the concatenated data
    set, runs a 3-fold grid search for every entry in ``classifiers`` and
    prints the best score and parameters of each search.

    Args:
        path: Directory containing the data files.
    """
    os.chdir(path)
    features, target = get_data(path)
    search_options = dict(cv=3, verbose=2, n_jobs=-1)
    results = fit_all_classifiers_grid(features, target, classifiers,
                                       **search_options)
    print_grid_results(results)
    # Alternative kept for reference: tune on the original train split
    # only and score on the original test split.
    #X_train, y_train, X_test, y_test = get_data_split(path)
    #grid = fit_all_classifiers_grid(X_train, y_train, classifiers, cv=2, verbose=2, n_jobs=-1)
    #res = test_dataset(grid, X_test, y_test)
    #print(res)
PS:抱歉使用了 classifier 而不是 regressor 这个名称——我只是重用了以前搜索最佳 classifier 时写的旧代码。
这是一个类似的方法。我们把数据集划分为 train 和 test 两部分。train 数据集用于调整超参数并拟合不同的模型;然后我们根据 MSE 选出最佳模型,并用它预测 test 数据集中的值。
所有经过训练(拟合)的模型都会保存为 Pickle 文件,因此之后可以使用 joblib.load() 方法加载它们。
输出:
----------------------------- [SVR_rbf] ------------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
---------------------------- [SVR_linear] ----------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
------------------------------ [Ridge] -------------------------------
Fitting 3 folds for each of 7 candidates, totalling 21 fits
------------------------------ [Lasso] -------------------------------
Fitting 3 folds for each of 6 candidates, totalling 18 fits
--------------------------- [RandomForest] ---------------------------
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------------------------- [SVR_rbf] ------------------------------
Score: 44.88%
Parameters: {'SVR_rbf__C': 10, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 33.40%
Parameters: {'SVR_linear__C': 0.01, 'SVR_linear__max_iter': 1000}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 34.83%
Parameters: {'Ridge__alpha': 500, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 22.90%
Parameters: {'Lasso__alpha': 0.1, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 36.87%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 250}
**********************************************************************
Mean Squared Error: {'SVR_rbf': 5.375, 'SVR_linear': 7.036, 'Ridge': 7.02, 'Lasso': 8.108, 'RandomForest': 9.475}
代码:
import os
#import contextlib
from operator import itemgetter
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
def get_data(path='.'):
    """Load the train and test files under *path* and stack them together.

    For each of the patterns ``trainX.txt*``, ``trainY.txt*``,
    ``testX.txt*`` and ``testY.txt*`` the first matching file is parsed as
    a header-less, whitespace-delimited table. Train rows come first,
    followed by test rows, with a fresh 0..n-1 index.

    Args:
        path: Directory that contains the four data files.

    Returns:
        Tuple ``(X, y)``: the concatenated feature table and column 0 of
        the concatenated target table.

    Raises:
        IndexError: If no file matches one of the patterns.
    """
    base = Path(path)

    def read(pattern):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        return pd.read_csv(list(base.glob(pattern))[0],
                           sep=r'\s+', header=None)

    X_train = read('trainX.txt*')
    y_train = read('trainY.txt*')
    X_test = read('testX.txt*')
    y_test = read('testY.txt*')
    return (pd.concat([X_train, X_test], ignore_index=True),
            pd.concat([y_train, y_test], ignore_index=True)[0])
def get_data_split(path='.', test_size=0.25, random_state=None):
    """Load the combined data set and split it randomly into train and test.

    Args:
        path: Directory containing the data files read by ``get_data``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int to
            get the same split on every run. The default ``None`` keeps
            the previous non-deterministic behaviour.

    Returns:
        The ``train_test_split`` result, unpacked by callers as
        ``X_train, X_test, y_train, y_test``.
    """
    X, y = get_data(path)
    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state)
def tune_models_hyperparams(X, y, models, **common_grid_kwargs):
    """Grid-search the hyper-parameters of every model in *models*.

    Each entry is a dict with the keys ``'name'``, ``'model'`` and
    ``'param_grid'``. The estimator is placed behind a ``StandardScaler``
    in a pipeline whose final step is named after the entry, and the
    fitted search is pickled to ``./<name>.pkl``.

    Args:
        X: Feature table passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        models: Iterable of model descriptions (see above).
        **common_grid_kwargs: Extra keyword arguments forwarded to every
            ``GridSearchCV``.

    Returns:
        Dict mapping each entry name to its fitted ``GridSearchCV``.
    """
    fitted = {}
    for spec in models:
        label = spec['name']
        print('{:-^70}'.format(' [' + label + '] '))
        steps = [("scale", StandardScaler()), (label, spec['model'])]
        search = GridSearchCV(Pipeline(steps),
                              param_grid=spec['param_grid'],
                              **common_grid_kwargs)
        fitted[label] = search.fit(X, y)
        # Persist each fitted search separately.
        joblib.dump(fitted[label], './{}.pkl'.format(label))
    return fitted
def get_best_model(grid, X_test, y_test,
                   metric_func=mean_squared_error):
    """Return the fitted model with the lowest metric value on the test data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values.
        metric_func: Callable ``metric_func(y_true, y_pred)`` returning a
            number where smaller is better.

    Returns:
        The entry of *grid* whose rounded metric value is smallest; on a
        tie the first such entry in *grid* order wins.

    Raises:
        ValueError: If *grid* is empty.
    """
    res = {name: round(metric_func(y_test, model.predict(X_test)), 3)
           for name, model in grid.items()}
    print('Mean Squared Error:', res)
    # Rank by score. Iterating a dict yields its keys, so the previous
    # key=itemgetter(1) compared the second character of each model *name*.
    best_model_name = min(res, key=res.get)
    return grid[best_model_name]
def test_dataset(grid, X_test, y_test):
    """Score every fitted model in *grid* on the given data.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to each ``predict`` call.
        y_test: True target values to compare against.

    Returns:
        Dict mapping each name to ``{'MSE': ..., 'R2': ...}``.
    """
    def score(estimator):
        # Predict once and reuse the result for both metrics.
        predicted = estimator.predict(X_test)
        return {'MSE': mean_squared_error(y_test, predicted),
                'R2': r2_score(y_test, predicted)}

    return {label: score(estimator) for label, estimator in grid.items()}
def predict(grid, X_test, model_name):
    """Predict with the model stored in *grid* under *model_name*.

    Args:
        grid: Dict mapping a name to an object with a ``predict`` method.
        X_test: Features handed to ``predict``.
        model_name: Key of the model to use.

    Returns:
        Whatever the selected model's ``predict`` returns.

    Raises:
        KeyError: If *model_name* is not a key of *grid*.
    """
    chosen = grid[model_name]
    return chosen.predict(X_test)
def print_grid_results(grids):
    """Print the best score and best parameters of every search in *grids*.

    Args:
        grids: Dict mapping a name to an object exposing ``best_score_``
            and ``best_params_``.
    """
    separator = '*' * 70
    for label in grids:
        search = grids[label]
        banner = ' [' + label + '] '
        print('{:-^70}'.format(banner))
        # The score is rendered as a percentage with two decimals.
        print('Score:\t\t{:.2%}'.format(search.best_score_))
        print('Parameters:\t{}'.format(search.best_params_))
        print(separator)
# Models to tune. 'name' doubles as the pipeline step name, so every
# 'param_grid' key is prefixed with '<name>__'.
models = [
    {
        'name': 'SVR_rbf',
        'model': SVR(),
        'title': "SVR_rbf",
        'param_grid': {
            'SVR_rbf__C': [0.1, 1, 5, 10],
            'SVR_rbf__max_iter': [500],
        },
    },
    {
        'name': 'SVR_linear',
        'model': SVR(kernel='linear'),
        # Was "SVR_rbf": copy-paste slip from the entry above.
        'title': "SVR_linear",
        'param_grid': {
            'SVR_linear__C': [0.01, 0.1, 1, 5],
            'SVR_linear__max_iter': [1000],
        },
    },
    {
        'name': 'Ridge',
        'model': Ridge(),
        'title': "Ridge",
        'param_grid': {
            'Ridge__alpha': [0.1, 0.5, 5, 10, 50, 100, 500],
            'Ridge__max_iter': [200],
        },
    },
    {
        'name': 'Lasso',
        'model': Lasso(),
        'title': "Lasso",
        'param_grid': {
            'Lasso__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'Lasso__max_iter': [1000],
        },
    },
    {
        'name': 'RandomForest',
        'model': RandomForestRegressor(),
        'title': "RandomForest",
        'param_grid': {
            'RandomForest__n_estimators': [50, 250, 500],
            'RandomForest__max_depth': [5],
        },
    },
]
def main(path):
    """Tune all models, pick the best on the test split and save its output.

    Changes the working directory to *path*, splits the combined data
    set, runs a 3-fold grid search for every entry in ``models``, prints
    the search results, selects a model with ``get_best_model`` and writes
    its test-split predictions to ``predicted.csv``.

    Args:
        path: Directory containing the data files.
    """
    os.chdir(str(path))
    # NOTE(review): 127/510 is passed as a fraction of the combined
    # train+test data, not as a sample count - confirm this is intended.
    split = get_data_split(path, test_size=127/510.)
    X_train, X_test, y_train, y_test = split
    searches = tune_models_hyperparams(X_train, y_train, models, cv=3,
                                       verbose=2, n_jobs=-1)
    print_grid_results(searches)
    best = get_best_model(searches, X_test, y_test)
    predictions = best.predict(X_test)
    pd.DataFrame({'predicted': predictions}).to_csv('predicted.csv',
                                                    index=False)
if __name__ == "__main__":
    # Run against the directory that contains this script.
    script_dir = Path(__file__).parent.resolve()
    main(script_dir)