Getting negative prediction results from regression models
I have the following sample data for predicting time with a regression algorithm.
I trained models with both LightGBM and XGBoost, but I am getting very poor predictions, and some of the predicted values are negative. I'm not sure what I'm doing wrong here. Thanks.
size channels time
3 3 4.980278
3 16 4.972054
3 64 4.899884
3 256 5.499221
3 512 5.599495
3 1024 5.936933
16 3 5.221653
16 16 5.994821
16 64 6.648254
16 256 7.176828
16 512 8.1707
16 1024 8.651496
64 3 7.801533
64 16 7.398248
64 64 8.395648
64 256 17.49494
64 512 26.43354
64 1024 49.55192
256 3 12.36093
256 16 20.50781
256 64 46.49553
256 256 170.5452
256 512 333.8809
256 1024 675.9459
512 3 22.44313
512 16 53.82643
512 64 164.3493
512 256 659.4345
512 512 1306.881
512 1024 3122.403
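For reference, the X and Y used in the code below are presumably built from this table, along these lines (a minimal sketch; the file name timings.txt is hypothetical):

import pandas as pd

# Hypothetical setup: the table above saved as whitespace-delimited timings.txt
df = pd.read_csv("timings.txt", sep=r"\s+")
X = df[["size", "channels"]].to_numpy()   # features: size, channels
Y = df["time"].to_numpy()                 # target: time
print(X.shape, Y.shape)                   # (30, 2) (30,)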
LightGBM code
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 80/20 split of the 30 samples
x_train, x_val, y_train, y_val = train_test_split(X, Y, train_size=0.8)
print(f"Number of training examples {len(x_train)}")
print(f"Number of testing examples {len(x_val)}")

regressor = lightgbm.LGBMRegressor()
regressor.fit(x_train, y_train)

# RMSE on the training data
train_pred = regressor.predict(x_train)
train_rmse = mean_squared_error(y_train, train_pred) ** 0.5
print(f"Train RMSE is {train_rmse}")

# RMSE and R2 on the held-out validation data;
# note that r2_score expects (y_true, y_pred) in that order
val_pred = regressor.predict(x_val)
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5
print(f"Test RMSE is {val_rmse}")
R_squared = r2_score(y_val, val_pred)
print('R2', R_squared)
Results
Train RMSE is 5385.50, Test RMSE is 1245.1, R2 -2.9991290197894976e+31
XGBoost code tuned with Optuna
import numpy as np
import optuna
import xgboost as xgb
from sklearn import metrics, model_selection

def optimize(trial, x, y, regressor):
    # Search space for the XGBoost hyperparameters
    max_depth = trial.suggest_int("max_depth", 3, 10)
    n_estimators = trial.suggest_int("n_estimators", 5000, 10000)
    max_leaves = trial.suggest_int("max_leaves", 1, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0, 1.0)
    min_child_weight = trial.suggest_uniform('min_child_weight', 1, 3)
    subsample = trial.suggest_uniform('subsample', 0.5, 1)

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        max_leaves=max_leaves,
        subsample=subsample,
    )

    # 5-fold CV; the objective is the mean RMSE across folds
    kf = model_selection.KFold(n_splits=5)
    error = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        fold_err = metrics.mean_squared_error(ytest, y_pred)
        error.append(np.sqrt(fold_err))
    return np.mean(error)
best_params = {'max_depth': 9, 'n_estimators': 9242, 'max_leaves': 7, 'learning_rate': 0.0015809052065858954, 'colsample_bytree': 0.4908644884609704, 'min_child_weight': 2.3502876962874435, 'subsample': 0.5927926099148189}
from functools import partial

def optimize_xgb(X, y):
    # One Optuna study per target column (a single target here)
    list_of_y = ["Target 1"]
    for i, m in zip(range(y.shape[1]), list_of_y):
        print("{} optimized Parameters on MSE Error".format(m))
        optimization_function = partial(optimize, x=X, y=y[:, i], regressor="random_forest")
        study = optuna.create_study(direction="minimize")
        study.optimize(optimization_function, n_trials=50)

optimize_xgb(X_train, y_train)
from sklearn.model_selection import RepeatedKFold, cross_val_score

def modeling(X,
             y,
             optimize="no",
             # the defaults below are accepted but not currently used
             max_depth=50,
             n_estimators=3000,
             max_leaves=30,
             learning_rate=0.01,
             colsample_bytree=1.0,
             gamma=0.0001,
             min_child_weight=2,
             reg_lambda=0.0001):
    # Default model, or the Optuna-tuned hyperparameters from best_params
    if optimize == "no":
        model = xgb.XGBRegressor(objective='reg:squarederror')
    else:
        model = xgb.XGBRegressor(objective='reg:squarederror', **best_params)

    # Fit on the full training data (single-target case)
    if y.shape[1] == 1:
        model_xgb = model.fit(X, y)

    # Repeated 5-fold cross-validated MSE per target column
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = []
    for i in range(y.shape[1]):
        scores.append(np.abs(cross_val_score(model, X, y[:, i],
                                             scoring='neg_mean_squared_error',
                                             cv=cv, n_jobs=-1)))
        print('Mean MSE of the {} target : {} ({})'.format(i, scores[i].mean(), scores[i].std()))
    return model_xgb
from sklearn.metrics import mean_squared_error as mse  # mse below is presumably this alias

model_xgb = modeling(X_train, y_train, optimize="yes")
model_xgb.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = model_xgb.predict(X_test)
MSE = mse(y_test, y_pred)
RMSE = np.sqrt(MSE)
print("TEST MSE", MSE)
R_squared = r2_score(y_test, y_pred)   # r2_score expects (y_true, y_pred)
print("RMSE: ", np.round(RMSE, 2))
print("R-Squared: ", np.round(R_squared, 2))
XGBoost results
TEST MSE 2653915.139388934, RMSE: 1629.08, R-Squared: -1.69
First of all, it has to be said that a gradient boosting model can return values outside the range seen in training. A GB regressor fits each tree on the residuals of the previous stage. So, if the prediction at stage t is larger than the target value, the residual at stage t is negative, and the regression tree at stage t+1 is fitted on that negative value coming from the previous residuals. Since the model is the stage-by-stage sum of these trees, it can end up predicting negative values even though every training target is positive.
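To make this concrete, here is a minimal sketch on synthetic data (not the timing table above) showing that a gradient boosting regressor is not constrained to stay positive even when every training target is:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# Strictly positive, strongly right-skewed synthetic target,
# similar in spirit to the timing data in the question
rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 1, size=(200, 2))
y_demo = np.exp(6 * X_demo[:, 0] * X_demo[:, 1])   # every value > 0

model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1)
model.fit(X_demo, y_demo)

# Each prediction is a sum of stage-wise residual corrections,
# so nothing forces it to remain positive
pred = model.predict(rng.uniform(0, 1, size=(1000, 2)))
print("min training target:", y_demo.min())   # > 0 by construction
print("min prediction:", pred.min())          # may well be negative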
A common technique for handling negative values in a prediction model is a log transform of the target, Y -> log(Y + c), where c is a constant. One usually chooses Y -> log(Y + 0.001) or any other "very small" positive number.
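Applied to the LightGBM model above, this looks roughly as follows (a sketch, assuming X and Y are the features and target from the table, and taking c = 0.001 as suggested):

import numpy as np
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

c = 0.001
y_log = np.log(Y + c)                     # Y -> log(Y + c)

x_train, x_val, yl_train, yl_val = train_test_split(X, y_log, train_size=0.8)

regressor = lightgbm.LGBMRegressor()
regressor.fit(x_train, yl_train)

# Invert the transform: exp(.) is strictly positive, so the
# back-transformed predictions cannot be large negative times
val_pred = np.exp(regressor.predict(x_val)) - c
val_rmse = mean_squared_error(np.exp(yl_val) - c, val_pred) ** 0.5
print(f"Validation RMSE on the original scale: {val_rmse}")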