Issue when Re-implementing Matrix Factorization in PyTorch
I am trying to re-implement matrix factorization in PyTorch, as a data extractor and model. The original model is written in MXNet; here I try to use the same idea in PyTorch. Below is my code, which can be run directly in Colab:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import collections
from collections import defaultdict
from IPython import display
import math
from matplotlib import pyplot as plt
import os
import pandas as pd
import random
import re
import shutil
import sys
import tarfile
import time
import requests
import zipfile
import hashlib
# ============data obtained, not change the original code
DATA_HUB= {}
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download(name, cache_dir=os.path.join('..', 'data')):
"""Download a file inserted into DATA_HUB, return the local filename."""
assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
url, sha1_hash = DATA_HUB[name]
os.makedirs(cache_dir, exist_ok=True)
fname = os.path.join(cache_dir, url.split('/')[-1])
if os.path.exists(fname):
sha1 = hashlib.sha1()
with open(fname, 'rb') as f:
while True:
data = f.read(1048576)
if not data:
break
sha1.update(data)
if sha1.hexdigest() == sha1_hash:
return fname # Hit cache
print(f'Downloading {fname} from {url}...')
r = requests.get(url, stream=True, verify=True)
with open(fname, 'wb') as f:
f.write(r.content)
return fname
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_extract(name, folder=None):
"""Download and extract a zip/tar file."""
fname = download(name)
base_dir = os.path.dirname(fname)
data_dir, ext = os.path.splitext(fname)
if ext == '.zip':
fp = zipfile.ZipFile(fname, 'r')
elif ext in ('.tar', '.gz'):
fp = tarfile.open(fname, 'r')
else:
assert False, 'Only zip/tar files can be extracted.'
fp.extractall(base_dir)
return os.path.join(base_dir, folder) if folder else data_dir
#1. obtain dataset
DATA_HUB['ml-100k'] = ('http://files.grouplens.org/datasets/movielens/ml-100k.zip',
'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')
def read_data_ml100k():
data_dir = download_extract('ml-100k')
names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(os.path.join(data_dir, 'u.data'), '\t', names=names,
engine='python')
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]
return data, num_users, num_items
# 2. Split data
#@save
def split_data_ml100k(data, num_users, num_items,
split_mode='random', test_ratio=0.1):
"""Split the dataset in random mode or seq-aware mode."""
if split_mode == 'seq-aware':
train_items, test_items, train_list = {}, {}, []
for line in data.itertuples():
u, i, rating, time = line[1], line[2], line[3], line[4]
train_items.setdefault(u, []).append((u, i, rating, time))
if u not in test_items or test_items[u][-1] < time:
test_items[u] = (i, rating, time)
for u in range(1, num_users + 1):
train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
test_data = [(key, *value) for key, value in test_items.items()]
train_data = [item for item in train_list if item not in test_data]
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)
else:
mask = [True if x == 1 else False for x in np.random.uniform(
0, 1, (len(data))) < 1 - test_ratio]
neg_mask = [not x for x in mask]
train_data, test_data = data[mask], data[neg_mask]
return train_data, test_data
#@save
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
users, items, scores = [], [], []
inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
for line in data.itertuples():
user_index, item_index = int(line[1] - 1), int(line[2] - 1)
score = int(line[3]) if feedback == 'explicit' else 1
users.append(user_index)
items.append(item_index)
scores.append(score)
if feedback == 'implicit':
inter.setdefault(user_index, []).append(item_index)
else:
inter[item_index, user_index] = score
return users, items, scores, inter
#@save
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
test_ratio=0.1, batch_size=256):
data, num_users, num_items = read_data_ml100k()
train_data, test_data = split_data_ml100k(data, num_users, num_items, split_mode, test_ratio)
train_u, train_i, train_r, _ = load_data_ml100k(train_data, num_users, num_items, feedback)
test_u, test_i, test_r, _ = load_data_ml100k(test_data, num_users, num_items, feedback)
# Create Dataset
train_set = MyData(np.array(train_u), np.array(train_i), np.array(train_r))
test_set = MyData(np.array(test_u), np.array(test_i), np.array(test_r))
# Create Dataloader
train_iter = DataLoader(train_set, shuffle=True, batch_size=batch_size)
test_iter = DataLoader(test_set, batch_size=batch_size)
return num_users, num_items, train_iter, test_iter
class MyData(Dataset):
def __init__(self, user, item, score):
self.user = torch.tensor(user)
self.item = torch.tensor(item)
self.score = torch.tensor(score)
def __len__(self):
return len(self.user)
def __getitem__(self, idx):
return self.user[idx], self.item[idx], self.score[idx]
# create a nn class (just-for-fun choice :-)
class RMSELoss(nn.Module):
def __init__(self, eps=1e-6):
'''You should be careful with NaN which will appear if the mse=0, adding self.eps'''
super().__init__()
self.mse = nn.MSELoss()
self.eps = eps
def forward(self,yhat,y):
loss = torch.sqrt(self.mse(yhat,y) + self.eps)
return loss
class MF(nn.Module):
def __init__(self, num_factors, num_users, num_items, **kwargs):
super(MF, self).__init__(**kwargs)
self.P = nn.Embedding(num_embeddings=num_users, embedding_dim=num_factors)
self.Q = nn.Embedding(num_embeddings=num_items, embedding_dim=num_factors)
self.user_bias = nn.Embedding(num_users, 1)
self.item_bias = nn.Embedding(num_items, 1)
def forward(self, user_id, item_id):
P_u = self.P(user_id)
Q_i = self.Q(item_id)
b_u = self.user_bias(user_id)
b_i = self.item_bias(item_id)
outputs = (P_u * Q_i).sum() + b_u.squeeze() + b_i.squeeze()
return outputs
# train
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hyper parameters
num_epochs = 50
batch_size = 512
lr = 0.001
num_users, num_items, train_iter, test_iter = split_and_load_ml100k(test_ratio=0.1, batch_size=batch_size)
model = MF(30, num_users, num_items).to(device)
# Loss and Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
criterion = RMSELoss()
# Train the Model
train_rmse = []
test_rmse = []
for epoch in range(num_epochs):
train_loss = 0
num_train = 0
model.train()
for users, items, scores in train_iter:
users = users.to(device)
items = items.to(device)
scores = scores.float().to(device)
# Forward pass
outputs = model(users, items)
loss = criterion(outputs, scores)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
train_loss += loss.item()
num_train += scores.shape[0]
train_rmse.append(train_loss / num_train)
model.eval()
test_loss = 0
num_test = 0
with torch.no_grad():
for users, items, scores in test_iter:
users = users.to(device)
items = items.to(device)
scores = scores.float().to(device)
outputs = model(users, items)
loss = criterion(outputs, scores)
test_loss += loss.item()
num_test += scores.shape[0]
test_rmse.append(test_loss / num_test)
# plot
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
x = list(range(num_epochs))
fig = plt.figure()
ax = plt.axes()
plt.plot(x, train_rmse, label='train_rmse');
plt.plot(x, test_rmse, label='test_rmse');
leg = ax.legend();
I got the following result, and the MXNet result is shown here for comparison. Why can't I train a nice figure, and why is my train_rmse larger than test_rmse?
I modified your code slightly and got results similar to the MXNet ones. Here is the code in Colab.
- Model. You missed axis=1 in the sum operation:
outputs = (P_u * Q_i).sum(axis=1) + b_u.squeeze() + b_i.squeeze()
By default, sum adds up all the elements in the tensor and produces a scalar. Adding a scalar to a tensor is still valid (it broadcasts), so you never see an error; the short sketch below illustrates this.
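A minimal sketch, with made-up shapes, of why the missing axis=1 goes unnoticed:

import torch
batch_size, num_factors = 4, 30
P_u = torch.randn(batch_size, num_factors)   # user embeddings for one batch
Q_i = torch.randn(batch_size, num_factors)   # item embeddings for one batch
b_u = torch.randn(batch_size)                # user biases after squeeze()
b_i = torch.randn(batch_size)                # item biases after squeeze()
wrong = (P_u * Q_i).sum() + b_u + b_i        # scalar broadcasts against shape (4,)
right = (P_u * Q_i).sum(axis=1) + b_u + b_i  # per-example dot product, shape (4,)
print(wrong.shape, right.shape)              # both torch.Size([4]), so no error is raised,
                                             # but 'wrong' adds the whole-batch sum to every example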
- Optimizer. I use the same optimizer as the MXNet implementation, Adam, and I also add weight decay (wd is the weight-decay coefficient):
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
- Initialization. Initialize the weights with a normal distribution:
nn.init.normal_(self.P.weight, std=0.01)
nn.init.normal_(self.Q.weight, std=0.01)
nn.init.normal_(self.user_bias.weight, std=0.01)
nn.init.normal_(self.item_bias.weight, std=0.01)
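Putting the two model-side fixes together, a sketch of how the corrected MF class might look (the init calls go at the end of __init__):

class MF(nn.Module):
    def __init__(self, num_factors, num_users, num_items, **kwargs):
        super(MF, self).__init__(**kwargs)
        self.P = nn.Embedding(num_embeddings=num_users, embedding_dim=num_factors)
        self.Q = nn.Embedding(num_embeddings=num_items, embedding_dim=num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        # Small-std normal init, matching the MXNet version
        nn.init.normal_(self.P.weight, std=0.01)
        nn.init.normal_(self.Q.weight, std=0.01)
        nn.init.normal_(self.user_bias.weight, std=0.01)
        nn.init.normal_(self.item_bias.weight, std=0.01)
    def forward(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id).squeeze()
        b_i = self.item_bias(item_id).squeeze()
        return (P_u * Q_i).sum(axis=1) + b_u + b_i   # sum over factors, one prediction per example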
Also, you don't need to increase num_train by the batch size. The loss is already averaged over the batch inside MSELoss, so count batches instead:
num_train += 1
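A sketch of the epoch bookkeeping with that change, counting batches rather than examples so the appended value is the mean per-batch RMSE:

train_loss, num_train = 0.0, 0
for users, items, scores in train_iter:
    users, items = users.to(device), items.to(device)
    scores = scores.float().to(device)
    outputs = model(users, items)
    loss = criterion(outputs, scores)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
    num_train += 1                            # count batches, not examples
train_rmse.append(train_loss / num_train)     # average per-batch RMSE for the epoch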