我在使用 Netflix 数据时遇到数据准备问题
I'm facing issues with Data Preparation while using Netflix Data
我在使用 Netflix 数据时遇到数据准备问题。我刚刚从 GitHub 克隆了一个仓库,在 Jupyter Notebook 中尝试运行其中的代码时遇到了问题。
%%time
%run ./DeepRecommender/data_utils/netflix_data_convert.py $NF_PRIZE_DATASET $NF_DATA
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
D:\Major Project\Code\RS\DeepRecommender\data_utils\netflix_data_convert.py in <module>
184
185 if __name__ == "__main__":
--> 186 main(sys.argv)
187
D:\Major Project\Code\RS\DeepRecommender\data_utils\netflix_data_convert.py in main(args)
93
94 text_files = [path.join(folder, f)
---> 95 for f in listdir(folder)
96 if path.isfile(path.join(folder, f)) and ('.txt' in f)]
97
FileNotFoundError: [WinError 3] The system cannot find the path specified: '/datadrive\netflix\download\training_set'
Wall time: 162 ms
有人可以帮我吗?
我使用的是 Windows 10 家庭版。这段代码在 Ubuntu 上可以正常运行,是否只是因为我使用 Windows 才出现这个问题?
如果您需要任何其他详细信息,请告诉我。
我在下面附上了 [netflix_data_convert.py][2] 中的代码。
# Copyright (c) 2017 NVIDIA Corporation
from os import listdir, path, makedirs
import random
import sys
import time
import datetime
def print_stats(data):
    """
    Prints the total number of ratings and the number of users in ``data``.
    :param data: mapping of user id -> list of rating records
    :return: None
    """
    total_ratings = sum(len(ratings) for ratings in data.values())
    print("STATS")
    print("Total Ratings: {}".format(total_ratings))
    # Every key counts as a user, including users whose rating list is empty.
    print("Total User count: {}".format(len(data)))
def save_data_to_file(data, filename):
    """
    Writes ratings to ``filename``, one ``user<TAB>item<TAB>rating`` line per record.
    :param data: mapping of user id -> list of records; only record[0] (item)
                 and record[1] (rating) are written, any further fields are ignored
    :param filename: path of the file to create or overwrite
    :return: None
    """
    with open(filename, 'w') as out:
        for userId, records in data.items():
            # Users with an empty record list simply produce no lines.
            out.writelines("{}\t{}\t{}\n".format(userId, record[0], record[1])
                           for record in records)
def create_NETFLIX_data_timesplit(all_data,
                                  train_min,
                                  train_max,
                                  test_min,
                                  test_max,
                                  rng=None):
    """
    Creates time-based split of NETFLIX data into train, and (validation, test)
    :param all_data: mapping of user id -> list of (item, rating, timestamp) tuples
    :param train_min: first day of the training window, "YYYY-MM-DD" (inclusive)
    :param train_max: last day of the training window, "YYYY-MM-DD" (inclusive)
    :param test_min: first day of the held-out window, "YYYY-MM-DD" (inclusive)
    :param test_max: last day of the held-out window, "YYYY-MM-DD" (inclusive)
    :param rng: optional object exposing ``random()`` (e.g. ``random.Random(seed)``)
                used for the validation/test coin flip; defaults to the
                module-level ``random`` generator, as before
    :return: (training_data, validation_data, test_data), each a mapping of
             user id -> list of rating tuples
    """
    def to_timestamp(day):
        # Local-time epoch seconds for midnight of ``day``.
        return time.mktime(datetime.datetime.strptime(day, "%Y-%m-%d").timetuple())

    train_min_ts = to_timestamp(train_min)
    train_max_ts = to_timestamp(train_max)
    test_min_ts = to_timestamp(test_min)
    test_max_ts = to_timestamp(test_max)
    next_random = random.random if rng is None else rng.random

    training_data = dict()
    validation_data = dict()
    test_data = dict()
    train_set_items = set()

    for userId, userRatings in all_data.items():
        # Process oldest ratings first: when the training window precedes the
        # held-out window, a user's training ratings are registered before any
        # of their held-out ratings are examined.
        for rating_item in sorted(userRatings, key=lambda x: x[2]):
            ts = rating_item[2]
            if train_min_ts <= ts <= train_max_ts:
                training_data.setdefault(userId, []).append(rating_item)
                train_set_items.add(rating_item[0])  # keep track of items from training set
            elif test_min_ts <= ts <= test_max_ts:
                if userId not in training_data:  # only include users seen in the training set
                    continue
                # Coin flip: roughly half of the held-out ratings go to validation.
                target = validation_data if next_random() <= 0.5 else test_data
                target.setdefault(userId, []).append(rating_item)

    # Remove items not seen in the training set. Users stay in the mapping
    # even if this leaves them with an empty list.
    for held_out in (test_data, validation_data):
        for userId in held_out:
            held_out[userId] = [rating for rating in held_out[userId]
                                if rating[0] in train_set_items]

    return training_data, validation_data, test_data
def main(args):
    """
    Converts the raw Netflix Prize training set into time-split rating files.

    Reads every ``*.txt`` file in the input folder (first line ``<item>:``,
    then ``user,rating,YYYY-MM-DD`` lines), remaps user and item ids to dense
    0-based ids, and writes train/valid/test files for four time splits
    (full, 3 months, 6 months, 1 year) below the output folder.
    :param args: argument vector; args[1] is the input folder, args[2] is the
                 output folder
    :return: None
    """
    if len(args) < 3:
        raise SystemExit("usage: {} <netflix_training_set_dir> <output_dir>".format(
            args[0] if args else "netflix_data_convert.py"))
    folder = args[1]
    out_folder = args[2]

    # (label, folder prefix, file prefix, train_min, train_max, test_min, test_max)
    splits = [
        ("Netflix full", "NF", "nf", "1999-12-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 3m", "N3M", "n3m", "2005-09-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 6m", "N6M", "n6m", "2005-06-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 1y", "N1Y", "n1y", "2004-06-01", "2005-05-31", "2005-06-01", "2005-06-30"),
    ]
    subsets = ["train", "valid", "test"]

    # create necessary folders *inside the requested output folder*; the
    # files below are written there, so a hard-coded location would leave
    # the targets missing whenever out_folder is not "Netflix".
    for _, dir_prefix, _, _, _, _, _ in splits:
        for subset in subsets:
            makedirs(path.join(out_folder, "{}_{}".format(dir_prefix, subset.upper())),
                     exist_ok=True)

    user2id_map = dict()
    item2id_map = dict()
    all_data = dict()

    # Sorted so that the dense ids do not depend on the OS directory order.
    text_files = [path.join(folder, f)
                  for f in sorted(listdir(folder))
                  if f.endswith('.txt') and path.isfile(path.join(folder, f))]

    for text_file in text_files:
        with open(text_file, 'r') as f:
            print("Processing: {}".format(text_file))
            # Header line looks like "<item>:"; drop surrounding whitespace and the colon.
            item = int(f.readline().strip().rstrip(":"))
            # setdefault evaluates len() before inserting, so new keys get the next free id.
            item_idx = item2id_map.setdefault(item, len(item2id_map))
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                parts = line.split(",")
                user_idx = user2id_map.setdefault(int(parts[0]), len(user2id_map))
                rating = float(parts[1])
                ts = int(time.mktime(datetime.datetime.strptime(parts[2], "%Y-%m-%d").timetuple()))
                all_data.setdefault(user_idx, []).append((item_idx, rating, ts))

    print("STATS FOR ALL INPUT DATA")
    print_stats(all_data)

    for label, dir_prefix, file_prefix, train_min, train_max, test_min, test_max in splits:
        split_data = create_NETFLIX_data_timesplit(all_data,
                                                   train_min,
                                                   train_max,
                                                   test_min,
                                                   test_max)
        # split_data is (train, valid, test), matching the order of ``subsets``.
        for subset, ratings in zip(subsets, split_data):
            print("{} {}".format(label, subset))
            print_stats(ratings)
            save_data_to_file(ratings,
                              path.join(out_folder,
                                        "{}_{}".format(dir_prefix, subset.upper()),
                                        "{}.{}.txt".format(file_prefix, subset)))
# Run the conversion only when executed as a script (e.g. via ``%run``), not on import.
if __name__ == "__main__":
    main(sys.argv)
我试过了,效果很好。
其实我把 $NF_PRIZE_DATASET 换成了 training_set(这是 DeepRecommender 文件夹根目录下的一个文件夹,里面是我从 Netflix 数据集下载的文件),并把 $NF_DATA 换成了 NF_DATA:
%%time
%run ./DeepRecommender/data_utils/netflix_data_convert.py training_set NF_DATA
我在使用 Netflix 数据时遇到数据准备问题。我刚刚从 GitHub 克隆了一个仓库,在 Jupyter Notebook 中尝试运行其中的代码时遇到了问题。
%%time
%run ./DeepRecommender/data_utils/netflix_data_convert.py $NF_PRIZE_DATASET $NF_DATA
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
D:\Major Project\Code\RS\DeepRecommender\data_utils\netflix_data_convert.py in <module>
184
185 if __name__ == "__main__":
--> 186 main(sys.argv)
187
D:\Major Project\Code\RS\DeepRecommender\data_utils\netflix_data_convert.py in main(args)
93
94 text_files = [path.join(folder, f)
---> 95 for f in listdir(folder)
96 if path.isfile(path.join(folder, f)) and ('.txt' in f)]
97
FileNotFoundError: [WinError 3] The system cannot find the path specified: '/datadrive\netflix\download\training_set'
Wall time: 162 ms
有人可以帮我吗?
我使用的是 Windows 10 家庭版。这段代码在 Ubuntu 上可以正常运行,是否只是因为我使用 Windows 才出现这个问题?
如果您需要任何其他详细信息,请告诉我。
我在下面附上了 [netflix_data_convert.py][2] 中的代码。
# Copyright (c) 2017 NVIDIA Corporation
from os import listdir, path, makedirs
import random
import sys
import time
import datetime
def print_stats(data):
    """
    Prints the total number of ratings and the number of users in ``data``.
    :param data: mapping of user id -> list of rating records
    :return: None
    """
    total_ratings = sum(len(ratings) for ratings in data.values())
    print("STATS")
    print("Total Ratings: {}".format(total_ratings))
    # Every key counts as a user, including users whose rating list is empty.
    print("Total User count: {}".format(len(data)))
def save_data_to_file(data, filename):
    """
    Writes ratings to ``filename``, one ``user<TAB>item<TAB>rating`` line per record.
    :param data: mapping of user id -> list of records; only record[0] (item)
                 and record[1] (rating) are written, any further fields are ignored
    :param filename: path of the file to create or overwrite
    :return: None
    """
    with open(filename, 'w') as out:
        for userId, records in data.items():
            # Users with an empty record list simply produce no lines.
            out.writelines("{}\t{}\t{}\n".format(userId, record[0], record[1])
                           for record in records)
def create_NETFLIX_data_timesplit(all_data,
                                  train_min,
                                  train_max,
                                  test_min,
                                  test_max,
                                  rng=None):
    """
    Creates time-based split of NETFLIX data into train, and (validation, test)
    :param all_data: mapping of user id -> list of (item, rating, timestamp) tuples
    :param train_min: first day of the training window, "YYYY-MM-DD" (inclusive)
    :param train_max: last day of the training window, "YYYY-MM-DD" (inclusive)
    :param test_min: first day of the held-out window, "YYYY-MM-DD" (inclusive)
    :param test_max: last day of the held-out window, "YYYY-MM-DD" (inclusive)
    :param rng: optional object exposing ``random()`` (e.g. ``random.Random(seed)``)
                used for the validation/test coin flip; defaults to the
                module-level ``random`` generator, as before
    :return: (training_data, validation_data, test_data), each a mapping of
             user id -> list of rating tuples
    """
    def to_timestamp(day):
        # Local-time epoch seconds for midnight of ``day``.
        return time.mktime(datetime.datetime.strptime(day, "%Y-%m-%d").timetuple())

    train_min_ts = to_timestamp(train_min)
    train_max_ts = to_timestamp(train_max)
    test_min_ts = to_timestamp(test_min)
    test_max_ts = to_timestamp(test_max)
    next_random = random.random if rng is None else rng.random

    training_data = dict()
    validation_data = dict()
    test_data = dict()
    train_set_items = set()

    for userId, userRatings in all_data.items():
        # Process oldest ratings first: when the training window precedes the
        # held-out window, a user's training ratings are registered before any
        # of their held-out ratings are examined.
        for rating_item in sorted(userRatings, key=lambda x: x[2]):
            ts = rating_item[2]
            if train_min_ts <= ts <= train_max_ts:
                training_data.setdefault(userId, []).append(rating_item)
                train_set_items.add(rating_item[0])  # keep track of items from training set
            elif test_min_ts <= ts <= test_max_ts:
                if userId not in training_data:  # only include users seen in the training set
                    continue
                # Coin flip: roughly half of the held-out ratings go to validation.
                target = validation_data if next_random() <= 0.5 else test_data
                target.setdefault(userId, []).append(rating_item)

    # Remove items not seen in the training set. Users stay in the mapping
    # even if this leaves them with an empty list.
    for held_out in (test_data, validation_data):
        for userId in held_out:
            held_out[userId] = [rating for rating in held_out[userId]
                                if rating[0] in train_set_items]

    return training_data, validation_data, test_data
def main(args):
    """
    Converts the raw Netflix Prize training set into time-split rating files.

    Reads every ``*.txt`` file in the input folder (first line ``<item>:``,
    then ``user,rating,YYYY-MM-DD`` lines), remaps user and item ids to dense
    0-based ids, and writes train/valid/test files for four time splits
    (full, 3 months, 6 months, 1 year) below the output folder.
    :param args: argument vector; args[1] is the input folder, args[2] is the
                 output folder
    :return: None
    """
    if len(args) < 3:
        raise SystemExit("usage: {} <netflix_training_set_dir> <output_dir>".format(
            args[0] if args else "netflix_data_convert.py"))
    folder = args[1]
    out_folder = args[2]

    # (label, folder prefix, file prefix, train_min, train_max, test_min, test_max)
    splits = [
        ("Netflix full", "NF", "nf", "1999-12-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 3m", "N3M", "n3m", "2005-09-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 6m", "N6M", "n6m", "2005-06-01", "2005-11-30", "2005-12-01", "2005-12-31"),
        ("Netflix 1y", "N1Y", "n1y", "2004-06-01", "2005-05-31", "2005-06-01", "2005-06-30"),
    ]
    subsets = ["train", "valid", "test"]

    # create necessary folders *inside the requested output folder*; the
    # files below are written there, so a hard-coded location would leave
    # the targets missing whenever out_folder is not "Netflix".
    for _, dir_prefix, _, _, _, _, _ in splits:
        for subset in subsets:
            makedirs(path.join(out_folder, "{}_{}".format(dir_prefix, subset.upper())),
                     exist_ok=True)

    user2id_map = dict()
    item2id_map = dict()
    all_data = dict()

    # Sorted so that the dense ids do not depend on the OS directory order.
    text_files = [path.join(folder, f)
                  for f in sorted(listdir(folder))
                  if f.endswith('.txt') and path.isfile(path.join(folder, f))]

    for text_file in text_files:
        with open(text_file, 'r') as f:
            print("Processing: {}".format(text_file))
            # Header line looks like "<item>:"; drop surrounding whitespace and the colon.
            item = int(f.readline().strip().rstrip(":"))
            # setdefault evaluates len() before inserting, so new keys get the next free id.
            item_idx = item2id_map.setdefault(item, len(item2id_map))
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                parts = line.split(",")
                user_idx = user2id_map.setdefault(int(parts[0]), len(user2id_map))
                rating = float(parts[1])
                ts = int(time.mktime(datetime.datetime.strptime(parts[2], "%Y-%m-%d").timetuple()))
                all_data.setdefault(user_idx, []).append((item_idx, rating, ts))

    print("STATS FOR ALL INPUT DATA")
    print_stats(all_data)

    for label, dir_prefix, file_prefix, train_min, train_max, test_min, test_max in splits:
        split_data = create_NETFLIX_data_timesplit(all_data,
                                                   train_min,
                                                   train_max,
                                                   test_min,
                                                   test_max)
        # split_data is (train, valid, test), matching the order of ``subsets``.
        for subset, ratings in zip(subsets, split_data):
            print("{} {}".format(label, subset))
            print_stats(ratings)
            save_data_to_file(ratings,
                              path.join(out_folder,
                                        "{}_{}".format(dir_prefix, subset.upper()),
                                        "{}.{}.txt".format(file_prefix, subset)))
# Run the conversion only when executed as a script (e.g. via ``%run``), not on import.
if __name__ == "__main__":
    main(sys.argv)
我试过了,效果很好。
其实我把 $NF_PRIZE_DATASET 换成了 training_set(这是 DeepRecommender 文件夹根目录下的一个文件夹,里面是我从 Netflix 数据集下载的文件),并把 $NF_DATA 换成了 NF_DATA:
%%time
%run ./DeepRecommender/data_utils/netflix_data_convert.py training_set NF_DATA