Scikit-learn 自定义转换器维度不匹配
Scikit learn Custom Transformer dimension mismatch
我来自 R,所以 scikit API 仍然让我感到困惑。我正在按照本教程 http://michelleful.github.io/code-blog/2015/06/20/pipelines/ 学习管道。因此,让我们创建一个假数据集仅供参考:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
我的目标非常简单:使用来自 x1 和 x2 的各自独立的 tfidf 矩阵,以及来自 x1 和 x2 的一些自定义特征(例如词长等),训练一个预测 y 的线性回归模型。
让我们从仅使用来自 x1 的 tfidf 的更简单的任务开始。这是完整的代码:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    """Return the RMSLE of the predictions, treating negative predictions as 0.

    Parameters
    ----------
    y : array-like of numbers
        True target values (expected to be greater than -1).
    y_pred : array-like of numbers
        Predicted values. Entries below 0 are clipped to 0.0 for the
        computation; the caller's array is left unmodified.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        ``sqrt(sum((log(1 + pred) - log(1 + y)) ** 2) / len(y))``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    y_true = np.asarray(y, dtype=float)
    # np.clip builds a new array, so the predictions passed in by the caller
    # are not overwritten (a masked in-place assignment would modify them).
    clipped = np.clip(np.asarray(y_pred, dtype=float), 0.0, None)
    squared_log_diff = (np.log1p(clipped) - np.log1p(y_true)) ** 2.0
    total = float(squared_log_diff.sum())
    # Plain Python division keeps the ZeroDivisionError on empty input.
    return (total * (1.0 / len(y_true))) ** 0.5
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into TF-IDF n-gram features.

    NOTE: this is the version from the question. ``fit`` learns nothing and
    ``transform`` fits a brand-new ``TfidfVectorizer`` on whatever data it is
    given, so the resulting feature columns depend on the data passed to each
    individual ``transform`` call.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        # colname: name of the DataFrame column to vectorize.
        # tokenizer / ngram_rg: forwarded to TfidfVectorizer in transform().
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Not assigned anywhere else in this version of the class.
        self.tfidf = None

    def fit(self, df, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Fit a fresh vectorizer on ``df[colname]`` and return its TF-IDF matrix."""
        vectorizer = TfidfVectorizer(
            ngram_range=self.ngram_rg,
            tokenizer=self.tokenizer,
        )
        documents = df[self.colname].values
        return vectorizer.fit_transform(documents)
# Wall-clock start; used at the end of the script to report elapsed time.
start = time.time()
# Seed passed to KFold's random_state further down.
seed = 1991
# n-gram range handed to ColumnNgram, which forwards it to TfidfVectorizer.
ngram_rg = (1,2)
# Wrap clip_RMSLE as a scorer for cross_val_score; greater_is_better=False
# marks it as a loss (lower is better).
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    """Split text into lowercase alphabetic tokens of at least two letters.

    Falsy input (``None`` or an empty string) yields an empty list. Otherwise
    the text is lowercased and every run of two or more ``a``-``z`` characters
    is returned, in order of appearance.
    """
    if not text:
        return []
    lowered = text.lower()
    return re.findall('[a-z]{2,}', lowered)
# Load the toy dataset; the regression target is the ``y`` column.
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values

# TF-IDF features from column x1 feeding a linear regression.
pipeline = Pipeline([
    ('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
    ('linear_reg', LinearRegression(n_jobs=1)),
])

# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if a seed is supplied without shuffling.
kfold = KFold(n_splits=2, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())

end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
我收到错误 ValueError: dimension mismatch,可能是因为某些词项没有同时出现在训练折和验证折中。这样做的正确方法是什么?谢谢!
将您的 ColumnNgram 更改为:
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into TF-IDF n-gram features.

    The vectorizer is created and fitted in ``fit``; ``transform`` only
    applies that already-fitted vectorizer to the data it is given.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        # colname: name of the DataFrame column to vectorize.
        # tokenizer / ngram_rg: forwarded to TfidfVectorizer in fit().
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Holds the fitted TfidfVectorizer once fit() has run.
        self.tfidf = None

    def fit(self, df, y=None):
        """Create a TfidfVectorizer, fit it on ``df[colname]``, and return self."""
        self.tfidf = TfidfVectorizer(
            ngram_range=self.ngram_rg,
            tokenizer=self.tokenizer,
        )
        documents = df[self.colname].values
        self.tfidf.fit(documents)
        return self

    def transform(self, df, y=None):
        """Return the TF-IDF matrix of ``df[colname]`` using the fitted vectorizer."""
        documents = df[self.colname].values
        return self.tfidf.transform(documents)
你应该在 fit() 中创建向量化器并从训练数据中学习。目前,您在每次调用 transform() 时都会对传入的数据重新拟合,这显然会在训练集和验证集上返回不同的特征,正如您所猜测的那样。
正确的方法是保留一个 TfidfVectorizer,让它在 fit() 中学习数据,然后在 transform() 中只对新数据做转换,而不是在新数据上重新拟合。
我来自 R,所以 scikit API 仍然让我感到困惑。我正在按照本教程 http://michelleful.github.io/code-blog/2015/06/20/pipelines/ 学习管道。因此,让我们创建一个假数据集仅供参考:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
我的目标非常简单:使用来自 x1 和 x2 的各自独立的 tfidf 矩阵,以及来自 x1 和 x2 的一些自定义特征(例如词长等),训练一个预测 y 的线性回归模型。
让我们从仅使用来自 x1 的 tfidf 的更简单的任务开始。这是完整的代码:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    """Return the RMSLE of the predictions, treating negative predictions as 0.

    Parameters
    ----------
    y : array-like of numbers
        True target values (expected to be greater than -1).
    y_pred : array-like of numbers
        Predicted values. Entries below 0 are clipped to 0.0 for the
        computation; the caller's array is left unmodified.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        ``sqrt(sum((log(1 + pred) - log(1 + y)) ** 2) / len(y))``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    y_true = np.asarray(y, dtype=float)
    # np.clip builds a new array, so the predictions passed in by the caller
    # are not overwritten (a masked in-place assignment would modify them).
    clipped = np.clip(np.asarray(y_pred, dtype=float), 0.0, None)
    squared_log_diff = (np.log1p(clipped) - np.log1p(y_true)) ** 2.0
    total = float(squared_log_diff.sum())
    # Plain Python division keeps the ZeroDivisionError on empty input.
    return (total * (1.0 / len(y_true))) ** 0.5
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into TF-IDF n-gram features.

    NOTE: this is the version from the question. ``fit`` learns nothing and
    ``transform`` fits a brand-new ``TfidfVectorizer`` on whatever data it is
    given, so the resulting feature columns depend on the data passed to each
    individual ``transform`` call.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        # colname: name of the DataFrame column to vectorize.
        # tokenizer / ngram_rg: forwarded to TfidfVectorizer in transform().
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Not assigned anywhere else in this version of the class.
        self.tfidf = None

    def fit(self, df, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Fit a fresh vectorizer on ``df[colname]`` and return its TF-IDF matrix."""
        vectorizer = TfidfVectorizer(
            ngram_range=self.ngram_rg,
            tokenizer=self.tokenizer,
        )
        documents = df[self.colname].values
        return vectorizer.fit_transform(documents)
# Wall-clock start; used at the end of the script to report elapsed time.
start = time.time()
# Seed passed to KFold's random_state further down.
seed = 1991
# n-gram range handed to ColumnNgram, which forwards it to TfidfVectorizer.
ngram_rg = (1,2)
# Wrap clip_RMSLE as a scorer for cross_val_score; greater_is_better=False
# marks it as a loss (lower is better).
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    """Split text into lowercase alphabetic tokens of at least two letters.

    Falsy input (``None`` or an empty string) yields an empty list. Otherwise
    the text is lowercased and every run of two or more ``a``-``z`` characters
    is returned, in order of appearance.
    """
    if not text:
        return []
    lowered = text.lower()
    return re.findall('[a-z]{2,}', lowered)
# Load the toy dataset; the regression target is the ``y`` column.
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values

# TF-IDF features from column x1 feeding a linear regression.
pipeline = Pipeline([
    ('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
    ('linear_reg', LinearRegression(n_jobs=1)),
])

# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if a seed is supplied without shuffling.
kfold = KFold(n_splits=2, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())

end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
我收到错误 ValueError: dimension mismatch,可能是因为某些词项没有同时出现在训练折和验证折中。这样做的正确方法是什么?谢谢!
将您的 ColumnNgram 更改为:
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into TF-IDF n-gram features.

    The vectorizer is created and fitted in ``fit``; ``transform`` only
    applies that already-fitted vectorizer to the data it is given.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        # colname: name of the DataFrame column to vectorize.
        # tokenizer / ngram_rg: forwarded to TfidfVectorizer in fit().
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Holds the fitted TfidfVectorizer once fit() has run.
        self.tfidf = None

    def fit(self, df, y=None):
        """Create a TfidfVectorizer, fit it on ``df[colname]``, and return self."""
        self.tfidf = TfidfVectorizer(
            ngram_range=self.ngram_rg,
            tokenizer=self.tokenizer,
        )
        documents = df[self.colname].values
        self.tfidf.fit(documents)
        return self

    def transform(self, df, y=None):
        """Return the TF-IDF matrix of ``df[colname]`` using the fitted vectorizer."""
        documents = df[self.colname].values
        return self.tfidf.transform(documents)
你应该在 fit() 中创建向量化器并从训练数据中学习。目前,您在每次调用 transform() 时都会对传入的数据重新拟合,这显然会在训练集和验证集上返回不同的特征,正如您所猜测的那样。
正确的方法是保留一个 TfidfVectorizer,让它在 fit() 中学习数据,然后在 transform() 中只对新数据做转换,而不是在新数据上重新拟合。