如何使用 Tf-idf 特征来训练你的模型?
How to use Tf-idf features for training your model?
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configuration: sublinear (log-scaled) term frequency, L2 row
# normalisation, unigrams and bigrams, built-in English stop-word list, and
# a minimum document frequency of 5.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit the vectorizer on the `Rejoined_Stem` column and keep the resulting
# TF-IDF matrix.
feature1 = tfidf.fit_transform(df.Rejoined_Stem)

# Dense array version of the same matrix.
array_of_feature = feature1.toarray()
我使用上面的代码为我的文本文档提取特征。
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on Lemmatized Text
# Split into train/test sets; no test_size is given, random_state is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# `tfidf` is the vectorizer created earlier; fit_transform re-fits it here on
# the training split, so these are the features the model actually sees.
X_train_counts = tfidf.fit_transform(X_train)
clf = MultinomialNB().fit(X_train_counts, y_train)

# The test split is only transformed with the already-fitted vectorizer.
y_pred = clf.predict(tfidf.transform(X_test))
然后我使用这段代码来训练我的模型。
有人可以解释一下在训练模型时究竟是如何使用上述特征的,因为在训练时没有在任何地方使用 feature1 变量??
不,您没有使用 feature1,因为您又执行了一次转换,得到了 X_train_counts,模型训练用的是后者。
让我们按照逻辑流程检查您的代码,并仅使用在特征提取和模型训练中使用的变量。
# imports used
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# you initiated your transformer to `fit_transform` X_train, and `transform` X_test
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)
X_train_counts = tfidf.fit_transform(X_train)
X_test_counts = tfidf.transform(X_test)

# you initiated your model and fit X_train_counts and y_train
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# you predicted from your transformed features
y_pred = clf.predict(X_test_counts)
有一种更好的方式来使用 Scikit-learn API,它可以消除这种混淆,帮助您避免出错:使用 Pipeline(管道)。
# imports used: see Pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# get the params
tfidf_params = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# create a Pipeline that will do features transformation then pass to the model
clf = Pipeline(steps=[
    ('features', TfidfVectorizer(**tfidf_params)),
    ('model', MultinomialNB()),
])

# Use clf as a model, fit X_train and y_train
clf.fit(X_train, y_train)

# predicted
y_pred = clf.predict(X_test)
管道的作用是:在 .fit 中,先对数据执行 fit_transform,然后把结果传递给模型;在 .predict 中,先对数据执行 transform,再传递给模型。
这种方法的最大好处是您可以轻松地切换模型或转换器。下面是一个模型基线比较的示例:
# collection to store results
from collections import defaultdict

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# models to test
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV

# instantiate our storage: one list per result column
bench_mark = defaultdict(list)

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# get the transformer params
tfidf_params = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# list of models we would like to compare
models = [
    # max_iter must be an integer; tol is a small convergence tolerance
    # (the original 1e3 would stop training almost immediately).
    PassiveAggressiveClassifier(C=1e-1, max_iter=1000, tol=1e-3),
    # NOTE(review): 'roc_auc' scoring only works for a binary target;
    # confirm that `Product` has two classes or pick another scorer.
    RidgeClassifierCV(scoring='roc_auc', cv=10),
    LogisticRegressionCV(cv=5, solver='saga', scoring='accuracy',
                         random_state=1, n_jobs=-1),
    # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and later removed.
    SGDClassifier(loss='log_loss', random_state=1, max_iter=101),
]

# train, test and store each model
for model in models:
    # our pipeline is changed to accept model
    clf = Pipeline(steps=[
        ('features', TfidfVectorizer(**tfidf_params)),
        ('model', model),  # just model, not model(): already instantiated above
    ])
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    fitted_model = clf.named_steps['model']
    model_name = fitted_model.__class__.__name__  # hack to get name
    model_params = fitted_model.get_params()
    print(f'{model_name} Scored: {score:.3f}\n')

    bench_mark['model_name'].append(model_name)
    bench_mark['score'].append(score)
    bench_mark['model'].append(clf)
    bench_mark['used_params'].append(model_params)

# in the end, place the bench_mark to DataFrame
models_df = pd.DataFrame(bench_mark)

# now you have the trained models in a DataFrame, with their scores and parameters.
# You can access and use any model.
logistic_reg = models_df[models_df['model_name'] == 'LogisticRegressionCV']['model'].iloc[0]
y_preds = logistic_reg.predict(X_test)
希望对您有所帮助
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configuration: sublinear (log-scaled) term frequency, L2 row
# normalisation, unigrams and bigrams, built-in English stop-word list, and
# a minimum document frequency of 5.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit the vectorizer on the `Rejoined_Stem` column and keep the resulting
# TF-IDF matrix.
feature1 = tfidf.fit_transform(df.Rejoined_Stem)

# Dense array version of the same matrix.
array_of_feature = feature1.toarray()
我使用上面的代码为我的文本文档提取特征。
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on Lemmatized Text
# Split into train/test sets; no test_size is given, random_state is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# `tfidf` is the vectorizer created earlier; fit_transform re-fits it here on
# the training split, so these are the features the model actually sees.
X_train_counts = tfidf.fit_transform(X_train)
clf = MultinomialNB().fit(X_train_counts, y_train)

# The test split is only transformed with the already-fitted vectorizer.
y_pred = clf.predict(tfidf.transform(X_test))
然后我使用这段代码来训练我的模型。 有人可以解释一下在训练模型时究竟是如何使用上述特征的,因为在训练时没有在任何地方使用 feature1 变量??
不,您没有使用 feature1,因为您又执行了一次转换,得到了 X_train_counts,模型训练用的是后者。
让我们按照逻辑流程检查您的代码,并仅使用在特征提取和模型训练中使用的变量。
# imports used
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# you initiated your transformer to `fit_transform` X_train, and `transform` X_test
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)
X_train_counts = tfidf.fit_transform(X_train)
X_test_counts = tfidf.transform(X_test)

# you initiated your model and fit X_train_counts and y_train
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# you predicted from your transformed features
y_pred = clf.predict(X_test_counts)
有一种更好的方式来使用 Scikit-learn API,它可以消除这种混淆,帮助您避免出错:使用 Pipeline(管道)。
# imports used: see Pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# get the params
tfidf_params = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# create a Pipeline that will do features transformation then pass to the model
clf = Pipeline(steps=[
    ('features', TfidfVectorizer(**tfidf_params)),
    ('model', MultinomialNB()),
])

# Use clf as a model, fit X_train and y_train
clf.fit(X_train, y_train)

# predicted
y_pred = clf.predict(X_test)
管道的作用是:在 .fit 中,先对数据执行 fit_transform,然后把结果传递给模型;在 .predict 中,先对数据执行 transform,再传递给模型。
这种方法的最大好处是您可以轻松地切换模型或转换器。下面是一个模型基线比较的示例:
# collection to store results
from collections import defaultdict

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# models to test
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV

# instantiate our storage: one list per result column
bench_mark = defaultdict(list)

# split data: random_state 0, and the default test_size of 0.25 because no
# test_size was given.
# Pass the text column as a Series (single brackets): the vectorizer iterates
# over its input, and iterating a DataFrame yields column labels, not rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['Rejoined_Lemmatize'], df['Product'], random_state=0
)

# get the transformer params
tfidf_params = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# list of models we would like to compare
models = [
    # max_iter must be an integer; tol is a small convergence tolerance
    # (the original 1e3 would stop training almost immediately).
    PassiveAggressiveClassifier(C=1e-1, max_iter=1000, tol=1e-3),
    # NOTE(review): 'roc_auc' scoring only works for a binary target;
    # confirm that `Product` has two classes or pick another scorer.
    RidgeClassifierCV(scoring='roc_auc', cv=10),
    LogisticRegressionCV(cv=5, solver='saga', scoring='accuracy',
                         random_state=1, n_jobs=-1),
    # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and later removed.
    SGDClassifier(loss='log_loss', random_state=1, max_iter=101),
]

# train, test and store each model
for model in models:
    # our pipeline is changed to accept model
    clf = Pipeline(steps=[
        ('features', TfidfVectorizer(**tfidf_params)),
        ('model', model),  # just model, not model(): already instantiated above
    ])
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    fitted_model = clf.named_steps['model']
    model_name = fitted_model.__class__.__name__  # hack to get name
    model_params = fitted_model.get_params()
    print(f'{model_name} Scored: {score:.3f}\n')

    bench_mark['model_name'].append(model_name)
    bench_mark['score'].append(score)
    bench_mark['model'].append(clf)
    bench_mark['used_params'].append(model_params)

# in the end, place the bench_mark to DataFrame
models_df = pd.DataFrame(bench_mark)

# now you have the trained models in a DataFrame, with their scores and parameters.
# You can access and use any model.
logistic_reg = models_df[models_df['model_name'] == 'LogisticRegressionCV']['model'].iloc[0]
y_preds = logistic_reg.predict(X_test)
希望对您有所帮助