用户使用 scikit-learn 定义的 SVM 内核
User defined SVM kernel with scikit-learn
我在scikit-learn中自己定义内核时遇到了问题。
我自己定义了高斯核并且能够拟合 SVM 但没有用它来进行预测。
更准确地说,我有以下代码
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.utils import shuffle
import scipy.sparse as sparse
import numpy as np
digits = load_digits(2)
X, y = shuffle(digits.data, digits.target)
gamma = 1.0
X_train, X_test = X[:100, :], X[100:, :]
y_train, y_test = y[:100], y[100:]
m1 = SVC(kernel='rbf',gamma=1)
m1.fit(X_train, y_train)
m1.predict(X_test)
def my_kernel(x,y):
d = x - y
c = np.dot(d,d.T)
return np.exp(-gamma*c)
m2 = SVC(kernel=my_kernel)
m2.fit(X_train, y_train)
m2.predict(X_test)
m1 和 m2 应该是一样的,但是 m2.predict(X_test) return 错误:
operands could not be broadcast together with shapes (260,64) (100,64)
没看懂问题
此外,如果 x 是一个数据点,m1.predict(x) 给出 +1/-1 结果,如预期的那样,但 m2.predict(x) 给出 +1/-1 的数组-1...
不知道为什么。
错误在 x - y
行。你不能像那样减去两者,因为两者的第一个维度可能不相等。以下是 rbf
内核在 scikit-learn 中的实现方式,摘自 here(仅保留要点):
def row_norms(X, squared=False):
if issparse(X):
norms = csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
[...]
Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
"""
X, Y = check_pairwise_arrays(X, Y)
if Y_norm_squared is not None:
YY = check_array(Y_norm_squared)
if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
if X is Y: # shortcut in the common case euclidean_distances(X, X)
XX = YY.T
else:
XX = row_norms(X, squared=True)[:, np.newaxis]
distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
return distances if squared else np.sqrt(distances, out=distances)
def rbf_kernel(X, Y=None, gamma=None):
X, Y = check_pairwise_arrays(X, Y)
if gamma is None:
gamma = 1.0 / X.shape[1]
K = euclidean_distances(X, Y, squared=True)
K *= -gamma
np.exp(K, K) # exponentiate K in-place
return K
您可能想更深入地研究代码,但请查看 euclidean_distances
函数的注释。您要实现的目标的天真实现是这样的:
def my_kernel(x,y):
d = np.zeros((x.shape[0], y.shape[0]))
for i, row_x in enumerate(x):
for j, row_y in enumerate(y):
d[i, j] = np.exp(-gamma * np.linalg.norm(row_x - row_y))
return d
我在scikit-learn中自己定义内核时遇到了问题。 我自己定义了高斯核并且能够拟合 SVM 但没有用它来进行预测。
更准确地说,我有以下代码
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.utils import shuffle
import scipy.sparse as sparse
import numpy as np
digits = load_digits(2)
X, y = shuffle(digits.data, digits.target)
gamma = 1.0
X_train, X_test = X[:100, :], X[100:, :]
y_train, y_test = y[:100], y[100:]
m1 = SVC(kernel='rbf',gamma=1)
m1.fit(X_train, y_train)
m1.predict(X_test)
def my_kernel(x,y):
d = x - y
c = np.dot(d,d.T)
return np.exp(-gamma*c)
m2 = SVC(kernel=my_kernel)
m2.fit(X_train, y_train)
m2.predict(X_test)
m1 和 m2 应该是一样的,但是 m2.predict(X_test) return 错误:
operands could not be broadcast together with shapes (260,64) (100,64)
没看懂问题
此外,如果 x 是一个数据点,m1.predict(x) 给出 +1/-1 结果,如预期的那样,但 m2.predict(x) 给出 +1/-1 的数组-1... 不知道为什么。
错误在 x - y
行。你不能像那样减去两者,因为两者的第一个维度可能不相等。以下是 rbf
内核在 scikit-learn 中的实现方式,摘自 here(仅保留要点):
def row_norms(X, squared=False):
if issparse(X):
norms = csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
[...]
Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
"""
X, Y = check_pairwise_arrays(X, Y)
if Y_norm_squared is not None:
YY = check_array(Y_norm_squared)
if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
if X is Y: # shortcut in the common case euclidean_distances(X, X)
XX = YY.T
else:
XX = row_norms(X, squared=True)[:, np.newaxis]
distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
return distances if squared else np.sqrt(distances, out=distances)
def rbf_kernel(X, Y=None, gamma=None):
X, Y = check_pairwise_arrays(X, Y)
if gamma is None:
gamma = 1.0 / X.shape[1]
K = euclidean_distances(X, Y, squared=True)
K *= -gamma
np.exp(K, K) # exponentiate K in-place
return K
您可能想更深入地研究代码,但请查看 euclidean_distances
函数的注释。您要实现的目标的天真实现是这样的:
def my_kernel(x,y):
d = np.zeros((x.shape[0], y.shape[0]))
for i, row_x in enumerate(x):
for j, row_y in enumerate(y):
d[i, j] = np.exp(-gamma * np.linalg.norm(row_x - row_y))
return d