从 Surprise 中的预定义折叠加载数据时如何构建完整的训练集?
How to build a full trainset when loading data from predefined folds in Surprise?
我正在使用 Surprise 来评估各种推荐系统算法。我想计算所有可能的用户和项目排列的预测和预测覆盖率。我的数据是从预定义的拆分中加载的。
我计算预测覆盖率的策略是
- 构建完整的训练集并进行拟合
- 获取所有用户和项目的列表
- 遍历列表并进行预测
- 统计无法给出预测的例外情况,以此计算预测覆盖率。
尝试调用 data.build_full_trainset()
产生以下错误:
AttributeError: 'DatasetUserFolds' object has no attribute 'build_full_trainset'
有没有办法在从预定义的折叠加载数据时构建完整的训练集?
或者,我会尝试将来自 Surprise 的外部数据合并到一个数据框中,然后重做该过程。或者有更好的方法吗?
谢谢。
# %% #https://surprise.readthedocs.io/en/stable/getting_started.html#basic-usage
import random
import pickle
import numpy as np
import pandas as pd
# from survey.data_cleaning import long_ratings
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
# from surprise.model_selection import LeaveOneOut, KFold
from surprise.model_selection import PredefinedKFold
# Seed both the stdlib and NumPy RNGs so results are reproducible across runs.
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)
# Base directory for recommender data files.
path = 'data/recommenders/'
def load_splits():
    """
    Load the predefined train/test splits (created by external colab code
    and stored to files) and return them as a Surprise dataset.
    Used in surprise_recommenders.py.
    """
    # Directory holding the u<i>.base / u<i>.test split files.
    split_dir = 'data/recommenders/splits/'
    # Built-in reader: space-separated "user item rating" lines on a 1-5 scale.
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=0, rating_scale=(1, 5))
    # Pair up the train/test paths for each of the five folds:
    # [(u1.base, u1.test), (u2.base, u2.test), ..., (u5.base, u5.test)]
    fold_paths = [(split_dir + 'u%d.base' % k, split_dir + 'u%d.test' % k)
                  for k in range(1, 6)]
    return Dataset.load_from_folds(fold_paths, reader=reader)
data = load_splits()
pkf = PredefinedKFold()

# Map each algorithm name to its constructor and hyper-parameter grid.
algos = {
    'NormalPredictor': {
        'constructor': NormalPredictor,
        'param_grid': {},
    },
}

key = "stratified_5_fold"
cv_results = {}
print(f"Performing {key} cross validation.")
for algo_name, spec in algos.items():
    print("Working on algorithm: ", algo_name)
    gs = GridSearchCV(spec['constructor'], spec['param_grid'],
                      measures=['rmse', 'mae'], cv=pkf)
    gs.fit(data)
    print(gs.best_score['rmse'])   # best RMSE score
    print(gs.best_params['rmse'])  # parameter combination that achieved it
    # Refit the best estimator on the full dataset using the best-RMSE weights.
    algo = gs.best_estimator['rmse']
    # NOTE: DatasetUserFolds has no build_full_trainset(), so this line raises
    # AttributeError when the data was loaded from predefined folds.
    algo.fit(data.build_full_trainset())
    cv_results[algo_name] = pd.DataFrame.from_dict(gs.cv_results)
TLDR; Surprise 中的 model_selection 文档提到了一种 "refit"(重新拟合)方法,它会在整个训练集上拟合数据,但它明确不适用于预定义的折叠。
另一个主要问题:oyyablokov's comment on this issue 建议您无法用具有 NaN 的数据拟合模型。因此,即使你有一个完整的训练集,如何创建一个完整的预测矩阵来计算诸如预测覆盖率之类的东西,这需要所有用户和项目组合有或没有评级?
我的解决方法是创建 3 个 Surprise 数据集。
- 从预定义的数据集折叠计算best_params
- 评分的完整数据集(在 Surprise 之外合并所有折叠)
- 完整的预测矩阵数据集,包括用户和项目的所有可能组合(有或没有评级)。
通过网格搜索交叉验证找到最佳参数后,您可以通过以下方式找到您的预测和覆盖范围:
import pandas as pd
from surprise import Dataset, Reader
def get_pred_coverage(data_matrix, algo_constructor, best_params, verbose=False):
    """
    Fit an algorithm with the given best params and compute its prediction
    coverage over every (user, item) pair.

    inputs:
        data_matrix: array-like with columns 0, 1, 2 as user, service, rating
                     (rating may be NaN for unrated pairs)
        algo_constructor: the Surprise algorithm constructor to pass the best params into
        best_params: Surprise gs.best_params dict to pass into the algo
        verbose: print diagnostic info when True
    returns: (prediction_coverage, full_predictions)
    """
    reader = Reader(rating_scale=(1, 5))
    full_predictions = []  # list to store prediction results

    df = pd.DataFrame(data_matrix)
    if verbose: print(df.info())

    # Surprise cannot fit on NaN ratings, so train only on rated rows.
    df_no_nan = df.dropna(subset=[2])
    if verbose: print(df_no_nan.head())
    no_nan_dataset = Dataset.load_from_df(df_no_nan[[0, 1, 2]], reader)
    full_dataset = Dataset.load_from_df(df[[0, 1, 2]], reader)

    # Predict on full dataset using the weights that yield the best rmse:
    # pass the best-param dict as keyword arguments to the constructor.
    algo = algo_constructor(**best_params)
    # Create a no-NaN trainset to fit on.
    no_nan_trainset = no_nan_dataset.build_full_trainset()
    algo.fit(no_nan_trainset)
    if verbose: print('Number of trainset users: ', no_nan_trainset.n_users, '\n')
    if verbose: print('Number of trainset items: ', no_nan_trainset.n_items, '\n')

    # The full dataset defines the prediction matrix (all user/item combos).
    pred_set = full_dataset.build_full_trainset()
    if verbose: print('Number of users: ', pred_set.n_users, '\n')
    if verbose: print('Number of items: ', pred_set.n_items, '\n')

    # Convert all inner ids back to the raw ids predict() expects.
    pred_set_raw_iids = [pred_set.to_raw_iid(iid) for iid in pred_set.all_items()]
    pred_set_raw_uids = [pred_set.to_raw_uid(uid) for uid in pred_set.all_users()]

    # Build an O(1) rating lookup once, instead of scanning the whole
    # DataFrame for every pair: the original per-pair df.loc filter made the
    # double loop O(U * I * N), and float() on a Series raises when the
    # filter matches zero or multiple rows.
    rating_lookup = {(u, i): r for u, i, r in zip(df[0], df[1], df[2])}

    for user in pred_set_raw_uids:
        for item in pred_set_raw_iids:
            # None (pair absent) and NaN are both accepted by predict().
            r_ui = rating_lookup.get((user, item))
            prediction = algo.predict(uid=user, iid=item, r_ui=r_ui)
            full_predictions.append(prediction)

    # Prediction.details['was_impossible'] is True when the algorithm could
    # not produce an estimate for that pair.
    impossible_count = sum(p.details['was_impossible'] for p in full_predictions)
    if verbose: print(f"for algo {algo}, impossible_count is {impossible_count} ")

    n_pairs = pred_set.n_users * pred_set.n_items
    prediction_coverage = (n_pairs - impossible_count) / n_pairs
    print(f"prediction_coverage is {prediction_coverage}")
    return prediction_coverage, full_predictions
我正在使用 Surprise 来评估各种推荐系统算法。我想计算所有可能的用户和项目排列的预测和预测覆盖率。我的数据是从预定义的拆分中加载的。
我计算预测覆盖率的策略是
- 构建完整的训练集并进行拟合
- 获取所有用户和项目的列表
- 遍历列表并进行预测
- 统计无法给出预测的例外情况,以此计算预测覆盖率。
尝试调用 data.build_full_trainset()
产生以下错误:
AttributeError: 'DatasetUserFolds' object has no attribute 'build_full_trainset'
有没有办法在从预定义的折叠加载数据时构建完整的训练集?
或者,我会尝试将来自 Surprise 的外部数据合并到一个数据框中,然后重做该过程。或者有更好的方法吗?
谢谢。
# %% #https://surprise.readthedocs.io/en/stable/getting_started.html#basic-usage
import random
import pickle
import numpy as np
import pandas as pd
# from survey.data_cleaning import long_ratings
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
# from surprise.model_selection import LeaveOneOut, KFold
from surprise.model_selection import PredefinedKFold
# Seed both the stdlib and NumPy RNGs so results are reproducible across runs.
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)
# Base directory for recommender data files.
path = 'data/recommenders/'
def load_splits():
    """
    Load the predefined train/test splits (created by external colab code
    and stored to files) and return them as a Surprise dataset.
    Used in surprise_recommenders.py.
    """
    # Directory holding the u<i>.base / u<i>.test split files.
    split_dir = 'data/recommenders/splits/'
    # Built-in reader: space-separated "user item rating" lines on a 1-5 scale.
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=0, rating_scale=(1, 5))
    # Pair up the train/test paths for each of the five folds:
    # [(u1.base, u1.test), (u2.base, u2.test), ..., (u5.base, u5.test)]
    fold_paths = [(split_dir + 'u%d.base' % k, split_dir + 'u%d.test' % k)
                  for k in range(1, 6)]
    return Dataset.load_from_folds(fold_paths, reader=reader)
data = load_splits()
pkf = PredefinedKFold()

# Map each algorithm name to its constructor and hyper-parameter grid.
algos = {
    'NormalPredictor': {
        'constructor': NormalPredictor,
        'param_grid': {},
    },
}

key = "stratified_5_fold"
cv_results = {}
print(f"Performing {key} cross validation.")
for algo_name, spec in algos.items():
    print("Working on algorithm: ", algo_name)
    gs = GridSearchCV(spec['constructor'], spec['param_grid'],
                      measures=['rmse', 'mae'], cv=pkf)
    gs.fit(data)
    print(gs.best_score['rmse'])   # best RMSE score
    print(gs.best_params['rmse'])  # parameter combination that achieved it
    # Refit the best estimator on the full dataset using the best-RMSE weights.
    algo = gs.best_estimator['rmse']
    # NOTE: DatasetUserFolds has no build_full_trainset(), so this line raises
    # AttributeError when the data was loaded from predefined folds.
    algo.fit(data.build_full_trainset())
    cv_results[algo_name] = pd.DataFrame.from_dict(gs.cv_results)
TLDR; Surprise 中的 model_selection 文档提到了一种 "refit"(重新拟合)方法,它会在整个训练集上拟合数据,但它明确不适用于预定义的折叠。
另一个主要问题:oyyablokov's comment on this issue 建议您无法用具有 NaN 的数据拟合模型。因此,即使你有一个完整的训练集,如何创建一个完整的预测矩阵来计算诸如预测覆盖率之类的东西,这需要所有用户和项目组合有或没有评级?
我的解决方法是创建 3 个 Surprise 数据集。
- 从预定义的数据集折叠计算best_params
- 评分的完整数据集(在 Surprise 之外合并所有折叠)
- 完整的预测矩阵数据集,包括用户和项目的所有可能组合(有或没有评级)。
通过网格搜索交叉验证找到最佳参数后,您可以通过以下方式找到您的预测和覆盖范围:
import pandas as pd
from surprise import Dataset, Reader
def get_pred_coverage(data_matrix, algo_constructor, best_params, verbose=False):
    """
    Fit an algorithm with the given best params and compute its prediction
    coverage over every (user, item) pair.

    inputs:
        data_matrix: array-like with columns 0, 1, 2 as user, service, rating
                     (rating may be NaN for unrated pairs)
        algo_constructor: the Surprise algorithm constructor to pass the best params into
        best_params: Surprise gs.best_params dict to pass into the algo
        verbose: print diagnostic info when True
    returns: (prediction_coverage, full_predictions)
    """
    reader = Reader(rating_scale=(1, 5))
    full_predictions = []  # list to store prediction results

    df = pd.DataFrame(data_matrix)
    if verbose: print(df.info())

    # Surprise cannot fit on NaN ratings, so train only on rated rows.
    df_no_nan = df.dropna(subset=[2])
    if verbose: print(df_no_nan.head())
    no_nan_dataset = Dataset.load_from_df(df_no_nan[[0, 1, 2]], reader)
    full_dataset = Dataset.load_from_df(df[[0, 1, 2]], reader)

    # Predict on full dataset using the weights that yield the best rmse:
    # pass the best-param dict as keyword arguments to the constructor.
    algo = algo_constructor(**best_params)
    # Create a no-NaN trainset to fit on.
    no_nan_trainset = no_nan_dataset.build_full_trainset()
    algo.fit(no_nan_trainset)
    if verbose: print('Number of trainset users: ', no_nan_trainset.n_users, '\n')
    if verbose: print('Number of trainset items: ', no_nan_trainset.n_items, '\n')

    # The full dataset defines the prediction matrix (all user/item combos).
    pred_set = full_dataset.build_full_trainset()
    if verbose: print('Number of users: ', pred_set.n_users, '\n')
    if verbose: print('Number of items: ', pred_set.n_items, '\n')

    # Convert all inner ids back to the raw ids predict() expects.
    pred_set_raw_iids = [pred_set.to_raw_iid(iid) for iid in pred_set.all_items()]
    pred_set_raw_uids = [pred_set.to_raw_uid(uid) for uid in pred_set.all_users()]

    # Build an O(1) rating lookup once, instead of scanning the whole
    # DataFrame for every pair: the original per-pair df.loc filter made the
    # double loop O(U * I * N), and float() on a Series raises when the
    # filter matches zero or multiple rows.
    rating_lookup = {(u, i): r for u, i, r in zip(df[0], df[1], df[2])}

    for user in pred_set_raw_uids:
        for item in pred_set_raw_iids:
            # None (pair absent) and NaN are both accepted by predict().
            r_ui = rating_lookup.get((user, item))
            prediction = algo.predict(uid=user, iid=item, r_ui=r_ui)
            full_predictions.append(prediction)

    # Prediction.details['was_impossible'] is True when the algorithm could
    # not produce an estimate for that pair.
    impossible_count = sum(p.details['was_impossible'] for p in full_predictions)
    if verbose: print(f"for algo {algo}, impossible_count is {impossible_count} ")

    n_pairs = pred_set.n_users * pred_set.n_items
    prediction_coverage = (n_pairs - impossible_count) / n_pairs
    print(f"prediction_coverage is {prediction_coverage}")
    return prediction_coverage, full_predictions