GaussianProcessRegressor ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size
I am running the following code:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
print(X.shape)
print(X.values[:,list_of_relevant_features].dtype)
print(y.values.dtype)
kernel = DotProduct() + WhiteKernel()
model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
model_gp.fit(X.values[:,list_of_relevant_features], y.values)
print("GP: R2 score: ", model_gp.score(X.values[:,list_of_relevant_features], y.values))
The shape of my input is:
(19142, 21)
The dtypes are both float64.
Added in edit:
X and y are Pandas DataFrames; after .values they are both numpy arrays.
I get the error:
ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.
I can't imagine that a 20000 × 20 dataset is actually too big for a Gaussian process, or am I wrong?
The full error message:
ValueError Traceback (most recent call last)
filepath in
482 kernel = DotProduct() + WhiteKernel()
483 model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
----> 484 model_gp.fit(X.values[:,list_of_relevant_features], y.values)
485 print("GP: R2 score: ", model_gp.score(X.values[:,list_of_relevant_features], y.values))
486
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in fit(self, X, y)
238 optima = [(self._constrained_optimization(obj_func,
239 self.kernel_.theta,
--> 240 self.kernel_.bounds))]
241
242 # Additional runs are performed from log-uniform chosen initial
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in _constrained_optimization(self, obj_func, initial_theta, bounds)
503 opt_res = scipy.optimize.minimize(
504 obj_func, initial_theta, method="L-BFGS-B", jac=True,
--> 505 bounds=bounds)
506 _check_optimize_result("lbfgs", opt_res)
507 theta_opt, func_min = opt_res.x, opt_res.fun
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
616 elif meth == 'l-bfgs-b':
617 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 618 callback=callback, **options)
619 elif meth == 'tnc':
620 return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
306 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
307 bounds=new_bounds,
--> 308 finite_diff_rel_step=finite_diff_rel_step)
309
310 func_and_grad = sf.fun_and_grad
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
260 # calculation reduces overall function evaluations.
261 sf = ScalarFunction(fun, x0, args, grad, hess,
--> 262 finite_diff_rel_step, bounds, epsilon=epsilon)
263
264 return sf
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
74
75 self._update_fun_impl = update_fun
---> 76 self._update_fun()
77
78 # Gradient evaluation
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in _update_fun(self)
164 def _update_fun(self):
165 if not self.f_updated:
--> 166 self._update_fun_impl()
167 self.f_updated = True
168
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in update_fun()
71
72 def update_fun():
---> 73 self.f = fun_wrapped(self.x)
74
75 self._update_fun_impl = update_fun
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in fun_wrapped(x)
68 def fun_wrapped(x):
69 self.nfev += 1
---> 70 return fun(x, *args)
71
72 def update_fun():
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in __call__(self, x, *args)
72 def __call__(self, x, *args):
73 """ returns the the function value """
---> 74 self._compute_if_needed(x, *args)
75 return self._value
76
d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in _compute_if_needed(self, x, *args)
66 if not np.all(x == self.x) or self._value is None or self.jac is None:
67 self.x = np.asarray(x).copy()
---> 68 fg = self.fun(x, *args)
69 self.jac = fg[1]
70 self._value = fg[0]
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in obj_func(theta, eval_gradient)
229 if eval_gradient:
230 lml, grad = self.log_marginal_likelihood(
--> 231 theta, eval_gradient=True, clone_kernel=False)
232 return -lml, -grad
233 else:
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in log_marginal_likelihood(self, theta, eval_gradient, clone_kernel)
460
461 if eval_gradient:
--> 462 K, K_gradient = kernel(self.X_train_, eval_gradient=True)
463 else:
464 K = kernel(self.X_train_)
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
813 if eval_gradient:
--> 814 K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
815 K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
816 return K1 + K2, np.dstack((K1_gradient, K2_gradient))
d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
2110 X = np.atleast_2d(X)
2111 if Y is None:
-> 2112 K = np.inner(X, X) + self.sigma_0 ** 2
2113 else:
2114 if eval_gradient:
<__array_function__ internals> in inner(*args, **kwargs)
ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.
I believe this happens because of the DotProduct kernel: line 2112 in the traceback leads to the numpy inner product. So the memory error you get actually arises in numpy and not in scikit-learn. See also this SO question and this answer, which suggest that when numpy computes the expected size of the array holding the inner-product result, a 32-bit integer overflow can occur in Python. My Python setup is 64-bit, so I cannot test this consistently, but the following snippet runs without error:
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.datasets import load_boston
b = load_boston()
X = [pd.DataFrame(b['data'])]
y = b['target']
for i in range(50):
    X.append(pd.DataFrame(b['data']))
    y = np.append(y, b['target'])
X = pd.concat(X)
X = pd.concat([X,X[X.columns[0:8]]],axis=1)
print(X.values.shape,y.shape)
kernel = DotProduct() + WhiteKernel()
model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
model_gp.fit(X.values, y)
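For a rough sense of scale (my own back-of-the-envelope check, not something confirmed by the traceback): the DotProduct kernel allocates the full n_samples × n_samples kernel matrix, and with the question's 19142 samples that single array already exceeds the roughly 2 GiB of contiguous memory a 32-bit process can address:

import numpy as np

n = 19142  # number of samples in the question
nbytes = n * n * np.dtype(np.float64).itemsize  # size of the n x n kernel matrix K
print(f"{nbytes / 1024**3:.2f} GiB")  # ~2.73 GiB, above the ~2 GiB 32-bit limit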
I suggest running your model with fewer features to see at which array shape the memory error is raised, as sketched below. Alternatively, you could try a different kernel that does not require the inner product of X.
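A minimal sketch of that suggestion, assuming X, y, and list_of_relevant_features from the question are defined:

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Refit with progressively more features to find the shape at which
# the error is raised; X, y, list_of_relevant_features are the question's.
for n_feat in range(1, len(list_of_relevant_features) + 1):
    cols = list_of_relevant_features[:n_feat]
    model = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                     random_state=42)
    try:
        model.fit(X.values[:, cols], y.values)
        print(n_feat, "features: OK")
    except ValueError as err:
        print(n_feat, "features raised:", err)
        break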