sklearn 管道不工作
sklearn pipeline is not working
我是 sklearn 管道的新手,正在从 sklearn 文档中学习它。我在 movie review 数据的情感分析中使用了它。数据包含两列:第一列 class 和第二列 text。
# Load the movie-review CSV; the "text" column supplies the samples and the
# "class" column supplies the labels.
input_file_df = pd.read_csv("movie-pang.csv")
x_train = input_file_df["text"] #used complete data as train data
y_train = input_file_df["class"]
我只使用了一个特征:每个句子的情感得分(sentiment score for each sentence)。
我为此编写了自定义转换器:
class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    """Transformer that maps each review string to one sentiment score.

    The score of a review is the product of the non-zero signed sentiment
    scores of its whitespace-separated words. Scores are looked up through
    ``swn.senti_synsets`` (presumably NLTK's SentiWordNet reader -- confirm
    against the file's imports).
    """

    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        """Return the product of the non-zero word scores in ``word_list``.

        For every word only its first synset is used. The word score is the
        positive score, unless the negative score is strictly larger, in
        which case it is the negated negative score. Words without any
        synset, and words scoring exactly 0, leave the product unchanged.
        The result is 1 when no word contributes.
        """
        sentiment_score = 1
        for word in word_list:
            # senti_synsets can return a lazy iterable (a ``filter`` object
            # on Python 3) that has no len(); materialize it first.
            synsets = list(swn.senti_synsets(word))
            if not synsets:
                continue
            first_synset = synsets[0]
            positive = first_synset.pos_score()
            negative = first_synset.neg_score()
            # Ties resolve to the positive score, as in the original branches.
            if negative > positive:
                word_sentiment_score = negative * (-1)
            else:
                word_sentiment_score = positive
            print(word, " ", word_sentiment_score)
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        """Return a NumPy array with one sentiment score per review.

        NOTE(review): the result is 1-D (one entry per review), whereas
        scikit-learn estimators generally expect a 2-D feature matrix --
        verify before feeding this into a classifier.
        """
        sentiment_score_list = [
            self.get_word_level_sentiment(review.split())
            for review in review_list
        ]
        return np.asarray(sentiment_score_list)

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self
我使用的管道是:
# Two-step pipeline: the custom sentiment transformer, then a naive Bayes
# classifier.
# NOTE(review): the transformer can emit negative values (it multiplies in
# negated negative scores); confirm that MultinomialNB accepts such input.
pipeline = Pipeline([
("word_level_sentiment",GetWorldLevelSentiment()),
("clf", MultinomialNB())])
然后在管道上调用 fit:
# Train the pipeline on the review texts and their class labels.
pipeline.fit(x_train, y_train)
但这给我带来了以下错误:
This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
有人可以指导我我做错了什么吗??会有很大的帮助。
这对我有用:
class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    """Transformer that maps each review string to one sentiment score.

    The score of a review is the product of the non-zero signed sentiment
    scores of its whitespace-separated words. Scores are looked up through
    ``swn.senti_synsets`` (presumably NLTK's SentiWordNet reader -- confirm
    against the file's imports).
    """

    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        """Return the product of the non-zero word scores in ``word_list``.

        For every word only its first synset is used. The word score is the
        positive score, unless the negative score is strictly larger, in
        which case it is the negated negative score. Words without any
        synset, and words scoring exactly 0, leave the product unchanged.
        The result is 1 when no word contributes.
        """
        sentiment_score = 1
        for word in word_list:
            # senti_synsets can return a lazy iterable (a ``filter`` object
            # on Python 3) that has no len(); materialize it first.
            synsets = list(swn.senti_synsets(word))
            if not synsets:
                continue
            first_synset = synsets[0]
            positive = first_synset.pos_score()
            negative = first_synset.neg_score()
            # Ties resolve to the positive score, as in the original branches.
            if negative > positive:
                word_sentiment_score = negative * (-1)
            else:
                word_sentiment_score = positive
            print(word, " ", word_sentiment_score)
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        """Return a single-column DataFrame with one score per review.

        Wrapping the scores in a DataFrame yields a 2-D (n_reviews x 1)
        table instead of a flat sequence.
        """
        sentiment_score_list = [
            self.get_word_level_sentiment(review.split())
            for review in review_list
        ]
        # The original read ``pandas.DataFrame(sentiment_score-list)``: a
        # misspelled variable name, and pandas is imported as ``pd``.
        return pd.DataFrame(sentiment_score_list)

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
# Text transformer
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select one text field from the input by key.

    ``transform`` returns ``X[key]`` unchanged, so the next step receives
    only that field (presumably a DataFrame column -- confirm with callers).
    """

    def __init__(self, key):
        # Key used to index the input in transform().
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        """Stateless: ignore all arguments and return ``self``."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
# Numeric transformer
class NumberTransformer(BaseEstimator, TransformerMixin):
    """Select one numeric field from the input by key.

    ``transform`` indexes ``X`` with a one-element list, ``X[[key]]``
    (presumably to keep a 2-D single-column DataFrame -- confirm with
    callers).
    """

    def __init__(self, key):
        # Key used to index the input in transform().
        self.key = key

    def fit(self, X, y=None):
        """Stateless: ignore the data and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        wanted = [self.key]
        return X[wanted]
def fit_predict(model, X_train, X_test, y_train):
    """Fit a text-plus-numeric feature pipeline around ``model`` and predict.

    The 'clear_text' field is vectorized into 2- and 3-gram counts; the
    'word_count', 'posting_day', 'posting_month' and 'theme' fields are
    passed through as they are. All branches are combined by a FeatureUnion
    and fed to ``model``.

    Args:
        model: Estimator used as the final 'clf' step.
        X_train: Training input exposing the fields listed above.
        X_test: Input to predict on, with the same fields.
        y_train: Training targets.

    Returns:
        Tuple ``(preds, pipe_fit)``: the predictions for ``X_test`` and the
        fitted pipeline.
    """
    def numeric_branch(column):
        # One-step pipeline that selects a single numeric field.
        return Pipeline([
            ('transformer', NumberTransformer(key=column))
        ])

    # N-gram count vectorizer (CountVectorizer, not TF-IDF) with
    # document-frequency bounds.
    ngram_counter = CountVectorizer(ngram_range=(2,3), max_df=0.93, min_df=0.05)

    # Text feature: cleaned text -> n-gram counts.
    text_branch = Pipeline([
        ('transformer', TextTransformer(key='clear_text')),
        ('vectorizer', ngram_counter)
    ])

    # Numeric features, in the order they enter the union.
    numeric_fields = (
        ('Num1_Feature', 'word_count'),
        ('Num3_Feature', 'posting_day'),
        ('Num4_Feature', 'posting_month'),
        ('Num6_Feature', 'theme'),
    )

    # Combine all feature branches.
    branches = [('Text_Feature', text_branch)]
    for label, column in numeric_fields:
        branches.append((label, numeric_branch(column)))
    features = FeatureUnion(branches)

    # Chain the features with the classifier, train, then predict.
    pipe = Pipeline([('features', features),
                     ('clf', model)
                     ])
    pipe_fit = pipe.fit(X_train, y_train)
    preds = pipe_fit.predict(X_test)
    return preds, pipe_fit
我是 sklearn 管道的新手,正在从 sklearn 文档中学习它。我在 movie review 数据的情感分析中使用了它。数据包含两列:第一列 class 和第二列 text。
# Load the movie-review CSV; the "text" column supplies the samples and the
# "class" column supplies the labels.
input_file_df = pd.read_csv("movie-pang.csv")
x_train = input_file_df["text"] #used complete data as train data
y_train = input_file_df["class"]
我只使用了一个特征:每个句子的情感得分(sentiment score for each sentence)。
我为此编写了自定义转换器:
class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    """Transformer that maps each review string to one sentiment score.

    The score of a review is the product of the non-zero signed sentiment
    scores of its whitespace-separated words. Scores are looked up through
    ``swn.senti_synsets`` (presumably NLTK's SentiWordNet reader -- confirm
    against the file's imports).
    """

    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        """Return the product of the non-zero word scores in ``word_list``.

        For every word only its first synset is used. The word score is the
        positive score, unless the negative score is strictly larger, in
        which case it is the negated negative score. Words without any
        synset, and words scoring exactly 0, leave the product unchanged.
        The result is 1 when no word contributes.
        """
        sentiment_score = 1
        for word in word_list:
            # senti_synsets can return a lazy iterable (a ``filter`` object
            # on Python 3) that has no len(); materialize it first.
            synsets = list(swn.senti_synsets(word))
            if not synsets:
                continue
            first_synset = synsets[0]
            positive = first_synset.pos_score()
            negative = first_synset.neg_score()
            # Ties resolve to the positive score, as in the original branches.
            if negative > positive:
                word_sentiment_score = negative * (-1)
            else:
                word_sentiment_score = positive
            print(word, " ", word_sentiment_score)
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        """Return a NumPy array with one sentiment score per review.

        NOTE(review): the result is 1-D (one entry per review), whereas
        scikit-learn estimators generally expect a 2-D feature matrix --
        verify before feeding this into a classifier.
        """
        sentiment_score_list = [
            self.get_word_level_sentiment(review.split())
            for review in review_list
        ]
        return np.asarray(sentiment_score_list)

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self
我使用的管道是:
# Two-step pipeline: the custom sentiment transformer, then a naive Bayes
# classifier.
# NOTE(review): the transformer can emit negative values (it multiplies in
# negated negative scores); confirm that MultinomialNB accepts such input.
pipeline = Pipeline([
("word_level_sentiment",GetWorldLevelSentiment()),
("clf", MultinomialNB())])
然后在管道上调用 fit:
# Train the pipeline on the review texts and their class labels.
pipeline.fit(x_train, y_train)
但这给我带来了以下错误:
This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
有人可以指导我我做错了什么吗??会有很大的帮助。
这对我有用:
class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    """Transformer that maps each review string to one sentiment score.

    The score of a review is the product of the non-zero signed sentiment
    scores of its whitespace-separated words. Scores are looked up through
    ``swn.senti_synsets`` (presumably NLTK's SentiWordNet reader -- confirm
    against the file's imports).
    """

    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        """Return the product of the non-zero word scores in ``word_list``.

        For every word only its first synset is used. The word score is the
        positive score, unless the negative score is strictly larger, in
        which case it is the negated negative score. Words without any
        synset, and words scoring exactly 0, leave the product unchanged.
        The result is 1 when no word contributes.
        """
        sentiment_score = 1
        for word in word_list:
            # senti_synsets can return a lazy iterable (a ``filter`` object
            # on Python 3) that has no len(); materialize it first.
            synsets = list(swn.senti_synsets(word))
            if not synsets:
                continue
            first_synset = synsets[0]
            positive = first_synset.pos_score()
            negative = first_synset.neg_score()
            # Ties resolve to the positive score, as in the original branches.
            if negative > positive:
                word_sentiment_score = negative * (-1)
            else:
                word_sentiment_score = positive
            print(word, " ", word_sentiment_score)
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        """Return a single-column DataFrame with one score per review.

        Wrapping the scores in a DataFrame yields a 2-D (n_reviews x 1)
        table instead of a flat sequence.
        """
        sentiment_score_list = [
            self.get_word_level_sentiment(review.split())
            for review in review_list
        ]
        # The original read ``pandas.DataFrame(sentiment_score-list)``: a
        # misspelled variable name, and pandas is imported as ``pd``.
        return pd.DataFrame(sentiment_score_list)

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
# Text transformer
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select one text field from the input by key.

    ``transform`` returns ``X[key]`` unchanged, so the next step receives
    only that field (presumably a DataFrame column -- confirm with callers).
    """

    def __init__(self, key):
        # Key used to index the input in transform().
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        """Stateless: ignore all arguments and return ``self``."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
# Numeric transformer
class NumberTransformer(BaseEstimator, TransformerMixin):
    """Select one numeric field from the input by key.

    ``transform`` indexes ``X`` with a one-element list, ``X[[key]]``
    (presumably to keep a 2-D single-column DataFrame -- confirm with
    callers).
    """

    def __init__(self, key):
        # Key used to index the input in transform().
        self.key = key

    def fit(self, X, y=None):
        """Stateless: ignore the data and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        wanted = [self.key]
        return X[wanted]
def fit_predict(model, X_train, X_test, y_train):
    """Fit a text-plus-numeric feature pipeline around ``model`` and predict.

    The 'clear_text' field is vectorized into 2- and 3-gram counts; the
    'word_count', 'posting_day', 'posting_month' and 'theme' fields are
    passed through as they are. All branches are combined by a FeatureUnion
    and fed to ``model``.

    Args:
        model: Estimator used as the final 'clf' step.
        X_train: Training input exposing the fields listed above.
        X_test: Input to predict on, with the same fields.
        y_train: Training targets.

    Returns:
        Tuple ``(preds, pipe_fit)``: the predictions for ``X_test`` and the
        fitted pipeline.
    """
    def numeric_branch(column):
        # One-step pipeline that selects a single numeric field.
        return Pipeline([
            ('transformer', NumberTransformer(key=column))
        ])

    # N-gram count vectorizer (CountVectorizer, not TF-IDF) with
    # document-frequency bounds.
    ngram_counter = CountVectorizer(ngram_range=(2,3), max_df=0.93, min_df=0.05)

    # Text feature: cleaned text -> n-gram counts.
    text_branch = Pipeline([
        ('transformer', TextTransformer(key='clear_text')),
        ('vectorizer', ngram_counter)
    ])

    # Numeric features, in the order they enter the union.
    numeric_fields = (
        ('Num1_Feature', 'word_count'),
        ('Num3_Feature', 'posting_day'),
        ('Num4_Feature', 'posting_month'),
        ('Num6_Feature', 'theme'),
    )

    # Combine all feature branches.
    branches = [('Text_Feature', text_branch)]
    for label, column in numeric_fields:
        branches.append((label, numeric_branch(column)))
    features = FeatureUnion(branches)

    # Chain the features with the classifier, train, then predict.
    pipe = Pipeline([('features', features),
                     ('clf', model)
                     ])
    pipe_fit = pipe.fit(X_train, y_train)
    preds = pipe_fit.predict(X_test)
    return preds, pipe_fit