Commit e6d2ae49 by 20200318029

lecture 06

parent 5ec39085
@@ -4,4 +4,5 @@
*.docx
__pycache__/
data/
.DS_Store
\ No newline at end of file
.DS_Store
*.json
\ No newline at end of file
# Neural Collaborative Filtering
PyTorch implementation of the algorithm described in [Xiangnan He et al.,
2017, Neural Collaborative Filtering](https://arxiv.org/pdf/1708.05031.pdf).
A detailed description of the algorithm components and of the training/validation
process can be found in the notebooks.
Examples of how to run it can be found in `run_experiments.sh`. I have tried a
number of different hyperparameter combinations, including [cyclic learning
rates](https://arxiv.org/pdf/1506.01186.pdf).
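
For a quick start, something like the following (a minimal sketch; the flags and
values are copied from `run_experiments.sh`, adjust them as needed) prepares the
data and trains a GMF and an MLP model:

```bash
python prepare_data.py
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 8 --epochs 30
python mlp.py --batch_size 1024 --lr 0.01 --layers "[32, 16, 8]" --epochs 30 --validate_every 2
```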
import numpy as np
import pandas as pd
import pickle
import torch
import gzip
from sklearn.neighbors import NearestNeighbors
from pathlib import Path
from mlp import MLP
from gmf import GMF
def parse(path):
g = gzip.open(path, 'rb')
for l in g:
yield eval(l)
def getDF(path):
i = 0
df = {}
for d in parse(path):
df[i] = d
i += 1
return pd.DataFrame.from_dict(df, orient='index')
DATA_PATH = Path(".")
MODEL_DIR = "models"
asin2id_map = pickle.load(open(DATA_PATH/'item_mappings.p', 'rb'))
id2asin_map = {k:v for v,k in asin2id_map.items()}
df_movies_meta_data = getDF(DATA_PATH/'meta_Movies_and_TV.json.gz')
keep_cols = ['asin', 'title']
df_movies_meta_data = df_movies_meta_data[keep_cols]
df_movies_meta_data = df_movies_meta_data[~df_movies_meta_data.title.isna()]
asin2title_map = dict(df_movies_meta_data.values)
print("number of items with missing title in the core dataset: {}".format(
np.setdiff1d(list(id2asin_map.values()), list(asin2title_map.keys())).shape[0]))
print("number of items with non missing titles in the core dataset: {}".format(
len(id2asin_map) \
- np.setdiff1d(list(id2asin_map.values()), list(asin2title_map.keys())).shape[0]))
id2title_map = {}
for k,v in id2asin_map.items():
try:
id2title_map[k] = asin2title_map[v]
    except KeyError:
        continue
df_results = pd.read_pickle(DATA_PATH/MODEL_DIR/'results_df.p')
best_gmf = (df_results[df_results.modelname.str.contains('GMF')]
.sort_values('best_hr', ascending=False)
.reset_index(drop=True)
).modelname[0]
n_emb_i = int(np.where([s == 'emb' for s in best_gmf.split("_")])[0])+1
n_emb = int(best_gmf.split("_")[n_emb_i])
dataset = np.load(DATA_PATH/'neuralcf_split.npz')
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
gmf_model = GMF(n_users, n_items, n_emb)
gmf_model.load_state_dict(torch.load(DATA_PATH/MODEL_DIR/best_gmf))
item_embeddings = gmf_model.embeddings_item.weight.data.numpy()
knn_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_model.fit(item_embeddings)
def get_movie_titles(input_id, n=20):
"""first movie will be the "query" movie and the remaining n-1 the similar
movies. Similar defined under the functioning of the algorithm, i.e.
leading to the same prediction"""
dist, nnidx = knn_model.kneighbors(
item_embeddings[input_id].reshape(1, -1),
n_neighbors = n)
titles = []
for idx in nnidx[0]:
try:
titles.append(id2title_map[idx])
        except KeyError:
            continue
return titles
similar_movies = get_movie_titles(1234)
# In [13]: similar_movies
# Out[13]:
# ['Ace Ventura: Pet Detective',
# 'Gone in 60 Seconds',
# 'Better Off Dead [VHS]',
# 'Dinosaur [VHS]',
# 'Coming to America [VHS]',
# 'Rush Hour [VHS]',
# 'Tommy Boy [VHS]',
# 'Rush Hour 2 [VHS]',
# "Bill and Ted's Excellent Adventure [VHS]",
# 'Evolution',
# 'Con Air [VHS]',
# 'Liar Liar [VHS]',
# 'The Karate Kid',
# 'Weird Science [VHS]',
# 'The Fast and the Furious',
# 'Jumanji [VHS]',
# 'Open Season [UMD for PSP]',
# 'Ace Ventura: When Nature Calls [VHS]',
# 'Scary Movie',
# 'Die Hard Trilogy']
\ No newline at end of file
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
help="npz file with dataset")
parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
help="train matrix for faster iteration")
parser.add_argument("--epochs", type=int, default=20,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=1024,
help="batch size.")
parser.add_argument("--n_emb", type=int, default=8,
help="embedding size.")
parser.add_argument("--lr", type=float, default=0.01,
help="if lr_scheduler this will be max_lr")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
parser.add_argument("--lr_scheduler", action="store_true",
help="boolean to set the use of CyclicLR during training")
parser.add_argument("--validate_every", type=int, default=1,
help="validate every n epochs")
parser.add_argument("--save_model", type=int, default=1)
parser.add_argument("--n_neg", type=int, default=4,
help="number of negative instances to consider per positive instance")
parser.add_argument("--topk", type=int, default=10,
help="number of items to retrieve for recommendation")
return parser.parse_args()
class GMF(nn.Module):
def __init__(self, n_user, n_item, n_emb=8):
super(GMF, self).__init__()
self.n_emb = n_emb
self.n_user = n_user
self.n_item = n_item
self.embeddings_user = nn.Embedding(n_user, n_emb)
self.embeddings_item = nn.Embedding(n_item, n_emb)
self.out = nn.Linear(in_features=n_emb, out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
elif isinstance(m, nn.Linear):
nn.init.uniform_(m.weight)
def forward(self, users, items):
user_emb = self.embeddings_user(users)
item_emb = self.embeddings_item(items)
# Task-1: complete the proces to compute preds
# preds = '...'
return preds
def train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg):
model.train()
train_dataset = get_train_instances(train_ratings,
negatives,
n_items,
n_neg)
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
num_workers=4,
shuffle=True)
train_steps = (len(train_loader.dataset) // train_loader.batch_size) + 1
running_loss=0
for data in train_loader:
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
        optimizer.zero_grad()
        preds = model(users, items)
        loss = criterion(preds.squeeze(1), labels)
        loss.backward()
        optimizer.step()
        # step the cyclic LR scheduler once per batch, after the optimizer update
        if scheduler:
            scheduler.step()
        running_loss += loss.item()
return running_loss/train_steps
def evaluate(model, test_loader, use_cuda, topk):
model.eval()
scores=[]
with torch.no_grad():
for data in test_loader:
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
preds = model(users, items)
items_cpu = items.cpu().numpy()
preds_cpu = preds.squeeze(1).detach().cpu().numpy()
            # each test user has exactly 100 items (1 positive + 99 sampled negatives),
            # so the batch is split into chunks of 100, one chunk per user
            split_chunks = preds_cpu.shape[0]//100
            litems = np.split(items_cpu, split_chunks)
            lpreds = np.split(preds_cpu, split_chunks)
            scores += [get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)]
hits = [s[0] for s in scores]
ndcgs = [s[1] for s in scores]
return (np.array(hits).mean(),np.array(ndcgs).mean())
def checkpoint(model, modelpath):
torch.save(model.state_dict(), modelpath)
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
train_matrix = args.train_matrix
modeldir = args.modeldir
n_emb = args.n_emb
batch_size = args.batch_size
epochs = args.epochs
learner = args.learner
lr = args.lr
lr_scheduler = args.lr_scheduler
lrs = "wlrs" if lr_scheduler else "wolrs"
validate_every = args.validate_every
save_model = args.save_model
topk = args.topk
n_neg = args.n_neg
modelfname = "GMF" + \
"_".join(["_bs", str(batch_size)]) + \
"_".join(["_lr", str(lr).replace(".", "")]) + \
"_".join(["_n_emb", str(n_emb)]) + \
"_".join(["_lrnr", learner]) + \
"_".join(["_lrs", lrs]) + \
".pt"
if not os.path.exists(modeldir): os.makedirs(modeldir)
modelpath = os.path.join(modeldir, modelfname)
resultsdfpath = os.path.join(modeldir, 'results_df.p')
dataset = np.load(os.path.join(datadir, dataname))
train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
test_ratings, negatives = dataset['test_negative'], dataset['negatives']
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
test_loader = DataLoader(dataset=test_ratings,
batch_size=1000,
shuffle=False
)
model = GMF(n_users, n_items, n_emb=n_emb)
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)
criterion = nn.BCELoss()
training_steps = ((len(train_ratings)+len(train_ratings)*n_neg)//batch_size)+1
step_size = training_steps*3 # one cycle every 6 epochs
cycle_momentum=True
if learner.lower() == "adagrad" or learner.lower()=="adam":
cycle_momentum=False
if lr_scheduler:
scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
cycle_momentum=cycle_momentum)
else:
scheduler = None
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
    best_hr, best_ndcg, best_iter = 0, 0, 0
for epoch in range(1,epochs+1):
t1 = time()
loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg)
t2 = time()
if epoch % validate_every == 0:
(hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
if hr > best_hr:
iter_loss, best_hr, best_ndcg, best_iter, train_time = \
loss, hr, ndcg, epoch, t2-t1
if save_model:
checkpoint(model, modelpath)
print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
if save_model:
print("The best GMF model is saved to {}".format(modelpath))
if save_model:
cols = ["modelname", "iter_loss","best_hr", "best_ndcg", "best_iter","train_time"]
vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
        if not os.path.isfile(resultsdfpath):
            results_df = pd.DataFrame(columns=cols)
        else:
            results_df = pd.read_pickle(resultsdfpath)
        experiment_df = pd.DataFrame(data=[vals], columns=cols)
        # DataFrame.append is deprecated in recent pandas, so use pd.concat
        results_df = pd.concat([results_df, experiment_df], ignore_index=True)
        results_df.to_pickle(resultsdfpath)
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from utils import get_train_instances, get_scores
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="standard_split.npz",
help="npz file with dataset")
parser.add_argument("--epochs", type=int, default=5,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=256,
help="batch size.")
parser.add_argument("--n_emb", type=int, default=8,
help="embedding size.")
parser.add_argument("--lr", type=float, default=0.01,
help="if lr_scheduler this will be max_lr")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
return parser.parse_args()
class GMF(nn.Module):
def __init__(self, n_user, n_item, n_emb=8):
super(GMF, self).__init__()
self.n_emb = n_emb
self.n_user = n_user
self.n_item = n_item
self.embeddings_user = nn.Embedding(n_user, n_emb)
self.embeddings_item = nn.Embedding(n_item, n_emb)
self.out = nn.Linear(in_features=n_emb, out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
elif isinstance(m, nn.Linear):
nn.init.uniform_(m.weight)
def forward(self, users, items):
user_emb = self.embeddings_user(users)
item_emb = self.embeddings_item(items)
prod = user_emb*item_emb
preds = self.out(prod)
return preds
def train(model, train_loader, criterion, optimizer, epoch):
model.train()
train_steps = (len(train_loader.dataset) // train_loader.batch_size) + 1
running_loss=0
with trange(train_steps) as t:
for i, data in zip(t, train_loader):
t.set_description('epoch %i' % epoch)
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
optimizer.zero_grad()
preds = model(users, items)
loss = criterion(preds.squeeze(1), labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
avg_loss = running_loss/(i+1)
t.set_postfix(loss=np.sqrt(avg_loss))
return running_loss/train_steps
def valid(model, data_loader, criterion, mode='valid'):
model.eval()
steps = (len(data_loader.dataset) // data_loader.batch_size) + 1
running_loss=0
with torch.no_grad():
with trange(steps) as t:
for i, data in zip(t, data_loader):
t.set_description(mode)
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
preds = model(users, items)
loss = criterion(preds.squeeze(1), labels)
running_loss += loss.item()
avg_loss = running_loss/(i+1)
t.set_postfix(loss=np.sqrt(avg_loss))
return running_loss/steps
def checkpoint(model, modelpath):
torch.save(model.state_dict(), modelpath)
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
modeldir = args.modeldir
n_emb = args.n_emb
batch_size = args.batch_size
epochs = args.epochs
learner = args.learner
lr = args.lr
# I am going to perform a simple train/valid/test exercise, predicting
# directly the ratings. I will leave it to you to adapt the datasets so
# that we could get some ranking metrics
dataset = np.load(os.path.join(datadir, dataname))
train_dataset, test_dataset = dataset['train'][:, [0,1,3]], dataset['test'][:, [0,1,3]]
train_dataset, valid_dataset = train_test_split(train_dataset, test_size=0.2, stratify=train_dataset[:,2])
    n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
num_workers=4,
shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset,
batch_size=batch_size,
num_workers=4,
shuffle=True)
test_loader = DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False
)
model = GMF(n_users, n_items, n_emb=n_emb)
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)
criterion = nn.MSELoss()
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
# There are better ways of structuring the code, but since I already had
# it from the other experiments I will run it like this:
for epoch in range(1,epochs+1):
tr_loss = train(model, train_loader, criterion, optimizer, epoch)
val_loss = valid(model, valid_loader, criterion, mode='valid')
test_loss = valid(model, test_loader, criterion, mode='test')
print("test loss: {}".format(np.sqrt(test_loss)))
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import train, evaluate, checkpoint
os.environ["CUDA_VISIBLE_DEVICES"] = '1' # assign GPU
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
help="npz file with dataset")
parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
help="train matrix for faster iteration")
parser.add_argument("--epochs", type=int, default=20,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=1024,
help="batch size.")
parser.add_argument("--layers", type=str, default="[64,32,16,8]",
help="layer architecture. The first elements is used for the embedding \
layers and equals n_emb*2")
parser.add_argument("--dropouts", type=str, default="[0,0,0]",
help="dropout per dense layer. len(dropouts) = len(layers)-1")
parser.add_argument("--l2reg", type=float, default=0.,
help="l2 regularization")
parser.add_argument("--lr", type=float, default=0.01,
help="if lr_scheduler this will be max_lr")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
parser.add_argument("--lr_scheduler", action="store_true",
help="use CyclicLR during training")
parser.add_argument("--validate_every", type=int, default=1,
help="validate every n epochs")
parser.add_argument("--save_model", type=int, default=1)
parser.add_argument("--n_neg", type=int, default=4,
help="number of negative instances to consider per positive instance.")
parser.add_argument("--topk", type=int, default=10,
help="number of items to retrieve for recommendation.")
return parser.parse_args()
class MLP(nn.Module):
"""
Concatenate Embeddings that are then passed through a series of Dense layers
"""
def __init__(self, n_user, n_item, layers, dropouts):
super(MLP, self).__init__()
self.layers = layers
self.n_layers = len(layers)
self.dropouts = dropouts
self.n_user = n_user
self.n_item = n_item
self.embeddings_user = nn.Embedding(n_user, int(layers[0]/2))
self.embeddings_item = nn.Embedding(n_item, int(layers[0]/2))
self.mlp = nn.Sequential()
for i in range(1,self.n_layers):
self.mlp.add_module("linear%d" %i, nn.Linear(layers[i-1],layers[i]))
self.mlp.add_module("relu%d" %i, torch.nn.ReLU())
self.mlp.add_module("dropout%d" %i , torch.nn.Dropout(p=dropouts[i-1]))
# Task-2: replace '?' with two proper numbers in below code
# self.out = nn.Linear(in_features='?', out_features='?')
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
def forward(self, users, items):
user_emb = self.embeddings_user(users)
item_emb = self.embeddings_item(items)
emb_vector = torch.cat([user_emb,item_emb], dim=1)
emb_vector = self.mlp(emb_vector)
preds = torch.sigmoid(self.out(emb_vector))
return preds
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
train_matrix = args.train_matrix
modeldir = args.modeldir
layers = eval(args.layers)
ll = str(layers[-1]) #last layer
dropouts = eval(args.dropouts)
dp = "wdp" if dropouts[0]!=0 else "wodp"
l2reg = args.l2reg
n_emb = int(layers[0]/2)
batch_size = args.batch_size
epochs = args.epochs
learner = args.learner
lr = args.lr
lr_scheduler = args.lr_scheduler
lrs = "wlrs" if lr_scheduler else "wolrs"
validate_every = args.validate_every
save_model = args.save_model
topk = args.topk
n_neg = args.n_neg
modelfname = "MLP" + \
"_".join(["_bs", str(batch_size)]) + \
"_".join(["_reg", str(l2reg).replace(".", "")]) + \
"_".join(["_lr", str(lr).replace(".", "")]) + \
"_".join(["_n_emb", str(n_emb)]) + \
"_".join(["_ll", ll]) + \
"_".join(["_dp", dp]) + \
"_".join(["_lrnr", learner]) + \
"_".join(["_lrs", lrs]) + \
".pt"
    if not os.path.exists(modeldir): os.makedirs(modeldir)
    modelpath = os.path.join(modeldir, modelfname)
resultsdfpath = os.path.join(modeldir, 'results_df.p')
dataset = np.load(os.path.join(datadir, dataname))
train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
test_ratings, negatives = dataset['test_negative'], dataset['negatives']
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
test_loader = DataLoader(dataset=test_ratings,
batch_size=1000,
shuffle=False
)
model = MLP(n_users, n_items, layers, dropouts)
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=l2reg)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2reg)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9, nesterov=True)
criterion = nn.BCELoss()
training_steps = ((len(train_ratings)+len(train_ratings)*n_neg)//batch_size)+1
step_size = training_steps*3 #one cycle every 6 epochs
cycle_momentum=True
if learner.lower() == "adagrad" or learner.lower()=="adam":
cycle_momentum=False
if lr_scheduler:
scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
cycle_momentum=cycle_momentum)
else:
scheduler = None
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
    best_hr, best_ndcg, best_iter = 0, 0, 0
for epoch in range(1,epochs+1):
t1 = time()
loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg)
t2 = time()
if epoch % validate_every == 0:
(hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
if hr > best_hr:
iter_loss, best_hr, best_ndcg, best_iter, train_time = \
loss, hr, ndcg, epoch, t2-t1
if save_model:
checkpoint(model, modelpath)
print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
if save_model:
print("The best MLP model is saved to {}".format(modelpath))
if save_model:
cols = ["modelname", "iter_loss","best_hr", "best_ndcg", "best_iter","train_time"]
vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
        if not os.path.isfile(resultsdfpath):
            results_df = pd.DataFrame(columns=cols)
        else:
            results_df = pd.read_pickle(resultsdfpath)
        experiment_df = pd.DataFrame(data=[vals], columns=cols)
        # DataFrame.append is deprecated in recent pandas, so use pd.concat
        results_df = pd.concat([results_df, experiment_df], ignore_index=True)
        results_df.to_pickle(resultsdfpath)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
def highlight_greaterthan(s, threshold, column):
is_max = pd.Series(data=False, index=s.index)
is_max[column] = s.loc[column] >= threshold
return ['background-color: lightgreen' if is_max.any() else '' for v in is_max]
def build_model_df(df):
df_cp = df.copy()
df_cp = df_cp[df_cp.modelname.str.contains("MLP|GMF") & ~df_cp.modelname.str.contains("MSE")]
df_cp['model'] = df_cp.modelname.apply(lambda x: 'GMF' if 'GMF' in x else 'MLP')
df_cp = (df_cp
.sort_values('best_hr', ascending=False)
.reset_index(drop=True)
)
n_emb = [int(mn.split("n_emb_")[1].split("_")[0]) for mn in df_cp.modelname.tolist()]
df_cp['n_emb'] = n_emb
df_cp = (df_cp
.sort_values(by=['model', 'n_emb', 'best_hr'], ascending=[True,True,False])
.reset_index(drop=True)
)
df_cp = df_cp.groupby(['model', 'n_emb']).first().reset_index()
return df_cp
def plot_emb(df):
sns.set(color_codes=True)
sns.set_context("notebook", font_scale=1.)
plt.figure(figsize=(15, 10))
plt.subplot(2,2,1)
plt.subplots_adjust(hspace=0.4)
fig = sns.lineplot(x='n_emb', y='best_hr', hue='model', style='model',
markers=True, markersize=10, linewidth=2, data=df)
fig.set(ylabel="HR@10")
fig.set(xlabel="Number of Embeddings")
plt.xticks(df.n_emb.unique())
plt.subplot(2,2,2)
fig = sns.lineplot(x='n_emb', y='best_ndcg', hue='model', style='model',
markers=True, markersize=10, linewidth=2, data=df)
fig.set(ylabel="NDCG@10")
fig.set(xlabel="Number of Embeddings")
plt.xticks(df.n_emb.unique())
plt.subplot(2,2,3)
fig = sns.lineplot(x='n_emb', y='iter_loss', hue='model', style='model',
markers=True, markersize=10, linewidth=2, data=df)
fig.set(ylabel="BCELoss")
fig.set(xlabel="Number of Embeddings")
plt.xticks(df.n_emb.unique())
def plot_loss(df):
sns.set(color_codes=True)
sns.set_context("notebook", font_scale=1.)
plt.figure(figsize=(15, 10))
plt.subplot(2,2,1)
plt.subplots_adjust(hspace=0.4)
fig = sns.lineplot(x='iter_loss', y='best_hr', hue='model', style='model',
markers=True, markersize=10, linewidth=2, data=df.round(4))
fig.set(ylabel="HR@10")
fig.set(xlabel="BCE Loss")
plt.subplot(2,2,2)
fig = sns.lineplot(x='iter_loss', y='best_ndcg', hue='model', style='model',
markers=True, markersize=10, linewidth=2, data=df.round(4))
fig.set(ylabel="NDCG@10")
fig.set(xlabel="BCE Loss")
import numpy as np
import pandas as pd
import gzip
import pickle
import argparse
import scipy.sparse as sp
from time import time
from pathlib import Path
from scipy.sparse import save_npz
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
def array2mtx(interactions):
num_users = interactions[:,0].max()
num_items = interactions[:,1].max()
mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
for user, item, rating in interactions:
mat[user, item] = rating
return mat.tocsr()
def standard_split(df, data_path):
# Cardinality
n_users = df.user.nunique()
n_items = df.item.nunique()
n_ranks = df['rank'].nunique()
    train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)
# Save
np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())
def neuralcf_split(df, data_path):
# Xiangnan He, et al, 2017 train/test split with implicit negative feedback
# sort by rank
dfc = df.copy()
# Cardinality
n_users = df.user.nunique()
n_items = df.item.nunique()
dfc.sort_values(['user','rank'], ascending=[True,True], inplace=True)
dfc.reset_index(inplace=True, drop=True)
# use last ratings for testing and all the previous for training
test = dfc.groupby('user').tail(1)
train = pd.merge(dfc, test, on=['user','item'],
how='outer', suffixes=('', '_y'))
train = train[train.rating_y.isnull()]
test = test[['user','item','rating']]
train = train[['user','item','rating']]
# select 99 random movies per user that were never rated by that user
all_items = dfc.item.unique()
rated_items = (dfc.groupby("user")['item']
.apply(list)
.reset_index()
).item.tolist()
    def sample_not_rated(item_list, rseed=1, n=99):
        # seed the RNG so that the sampled negatives are reproducible
        np.random.seed(rseed)
        return np.random.choice(np.setdiff1d(all_items, item_list), n)
print("sampling not rated items...")
start = time()
non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
end = time() - start
print("sampling took {} min".format(round(end/60,2)))
negative = pd.DataFrame({'negative':non_rated_items})
negative[['item_n'+str(i) for i in range(99)]] =\
pd.DataFrame(negative.negative.values.tolist(), index= negative.index)
negative.drop('negative', axis=1, inplace=True)
negative = negative.stack().reset_index()
negative = negative.iloc[:, [0,2]]
negative.columns = ['user','item']
negative['rating'] = 0
assert negative.shape[0] == len(non_rated_items)*99
test_negative = (pd.concat([test,negative])
.sort_values('user', ascending=True)
.reset_index(drop=True)
)
# Ensuring that the 1st element every 100 is the rated item. This is
# fundamental for testing
test_negative.sort_values(['user', 'rating'], ascending=[True,False], inplace=True)
assert np.all(test_negative.values[0::100][:,2] != 0)
# Save
np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
test_negative=test_negative.values, negatives=np.array(non_rated_items),
n_users=n_users, n_items=n_items)
# Save training as sparse matrix
print("saving training set as sparse matrix...")
train_mtx = array2mtx(train.values)
save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)
def prepare_amazon(data_path, input_fname):
df = pd.read_json(data_path/input_fname, lines=True)
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']
df = df[keep_cols]
df.columns = new_colnames
# rank of items bought
df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
df.drop("timestamp", axis=1, inplace=True)
# mapping user and item ids to integers
user_mappings = {k:v for v,k in enumerate(df.user.unique())}
item_mappings = {k:v for v,k in enumerate(df.item.unique())}
df['user'] = df['user'].map(user_mappings)
df['item'] = df['item'].map(item_mappings)
df = df[['user','item','rank','rating']].astype(np.int64)
pickle.dump(user_mappings, open(data_path/'user_mappings.p', 'wb'))
pickle.dump(item_mappings, open(data_path/'item_mappings.p', 'wb'))
standard_split(df, data_path)
neuralcf_split(df, data_path)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="prepare Amazon dataset")
parser.add_argument("--input_dir",type=str, default=".")
parser.add_argument("--input_fname",type=str, default="reviews_Movies_and_TV_5.json.gz")
args = parser.parse_args()
DATA_PATH = Path(args.input_dir)
reviews = args.input_fname
prepare_amazon(DATA_PATH, reviews)
# batch size
python gmf.py --batch_size 512 --lr 0.01 --n_emb 8 --epochs 30
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 8 --epochs 30
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 8 --epochs 30 --validate_every 2
# learning rates
python gmf.py --batch_size 1024 --lr 0.001 --n_emb 8 --epochs 30 --validate_every 2
python gmf.py --batch_size 1024 --lr 0.005 --n_emb 8 --epochs 30 --validate_every 2
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --epochs 30 --validate_every 2
# Embeddings
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 16 --epochs 30 --validate_every 2
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 32 --epochs 30 --validate_every 2
python gmf.py --batch_size 1024 --lr 0.01 --n_emb 64 --epochs 30 --validate_every 2
# batch size
python mlp.py --batch_size 512 --lr 0.01 --layers "[32, 16, 8]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.01 --layers "[32, 16, 8]" --epochs 30 --validate_every 2
# learning rates
python mlp.py --batch_size 1024 --lr 0.001 --layers "[32, 16, 8]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.005 --layers "[32, 16, 8]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.01 --layers "[32, 16, 8]" --epochs 30 --lr_scheduler --validate_every 2
# Embeddings
python mlp.py --batch_size 1024 --lr 0.01 --layers "[64, 32, 16]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.01 --layers "[128, 64, 32]" --epochs 30 --validate_every 2
# higher lr and lr_scheduler
python mlp.py --batch_size 1024 --lr 0.03 --layers "[64, 32, 16]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.03 --layers "[128, 64, 32]" --epochs 30 --validate_every 2
python mlp.py --batch_size 1024 --lr 0.03 --layers "[64, 32, 16]" --epochs 30 --lr_scheduler --validate_every 2
python mlp.py --batch_size 1024 --lr 0.03 --layers "[128, 64, 32]" --epochs 30 --lr_scheduler --validate_every 2
# neumf
python neumf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --layers "[32, 16, 8]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_512_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_512_reg_00_lr_001_n_emb_16_ll_8_dp_wodp_lrnr_adam_lrs_wolrs.pt" \
--epochs 1 --learner "SGD"
python neumf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--epochs 20 --learner "SGD" --validate_every 2
python neumf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--epochs 20 --learner "SGD" --validate_every 2
python neumf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--epochs 20 --learner "SGD" --validate_every 2
python neumf.py --batch_size 1024 --lr 0.01 --n_emb 8 --lr_scheduler --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--epochs 20 --validate_every 2
# I repeated this experiment 3 times: with and without momentum and a 3rd time
# with MSE but did not save it
python neumf.py --batch_size 1024 --lr 0.001 --n_emb 8 --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--freeze 1 --epochs 4 --learner "SGD"
python neumf.py --batch_size 1024 --lr 0.001 --n_emb 8 --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--freeze 1 --epochs 4 --learner "SGD"
python neumf.py --batch_size 1024 --lr 0.001 --n_emb 8 --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--freeze 1 --epochs 4 --learner "SGD"
python neumf.py --batch_size 1024 --lr 0.001 --n_emb 8 --layers "[128, 64, 32]" --dropouts "[0.,0.]" \
--mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" \
--mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" \
--freeze 1 --epochs 4
import numpy as np
import pandas as pd
import math
import heapq
def get_train_instances(train, negatives, n_items, n_neg):
user, item, labels = [],[],[]
for (u, i), r in train.items():
# positive instance
user.append(u)
item.append(i)
labels.append(1)
# negative instances: we also need to make sure they are not in the
# negative examples used for testing
for _ in range(n_neg):
j = np.random.randint(n_items)
while ((u, j) in train.keys()) or (j in negatives[u]):
j = np.random.randint(n_items)
user.append(u)
item.append(j)
labels.append(0)
train_w_negative = np.vstack([user,item,labels]).T
assert train_w_negative.shape[0] == (len(train) + len(train)*n_neg)
return train_w_negative.astype(np.int64)
def get_hitratio(ranklist, gtitem):
if gtitem in ranklist: return 1
return 0
def get_ndcg(ranklist, gtitem):
for i in range(len(ranklist)):
item = ranklist[i]
if item == gtitem:
return math.log(2) / math.log(i+2)
return 0
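# Example: if the ground-truth item is ranked 1st in ranklist (i = 0), NDCG = log(2)/log(2) = 1.0;
# if it is ranked 4th (i = 3), NDCG = log(2)/log(5) ≈ 0.43; if it is outside the top-k, NDCG = 0.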
def get_scores(items, preds, topk):
gtitem = items[0]
# the following 3 lines of code ensure that the fact that the 1st item is
# gtitem does not affect the final rank
randidx = np.arange(100)
np.random.shuffle(randidx)
items, preds = items[randidx], preds[randidx]
map_item_score = dict( zip(items, preds) )
ranklist = heapq.nlargest(topk, map_item_score, key=map_item_score.get)
hr = get_hitratio(ranklist, gtitem)
ndcg = get_ndcg(ranklist, gtitem)
return hr, ndcg
<h2>Project: Movie Recommendation Based on Neural Collaborative Filtering</h2>
<br>
<h3>1. Project overview:</h3>
----Based on the WWW 2017 paper (Neural Collaborative Filtering, [paper link](https://arxiv.org/pdf/1708.05031.pdf)), read the paper and understand the three neural-network-based models it proposes: GMF (Generalized Matrix Factorization), MLP (Multi-Layer Perceptron) and NeuMF (a fusion of GMF and MLP).<br>
----Based on your understanding of these models, complete the three missing parts of the provided PyTorch code. Each missing part corresponds to one sub-task; once all three are filled in, you should obtain three recommendation models that run successfully.<br>
<h3>2. Submission:</h3>
----Upload the completed project code together with a short report. The report should contain your completed versions of the missing code, screenshots showing the three models running successfully (as in the figure below), and the final experimental results.<br>
<img src='123.png'><br>
<h3>3. Preparation:</h3>
<h4>---3.1 Environment setup</h4>
&emsp;conda create -n py36_torch14 python==3.6.10<br>
&emsp;conda activate py36_torch14<br>
&emsp;conda install pytorch<br>
&emsp;pip install pandas scipy joblib tqdm<br>
<h4>---3.2 Dataset download</h4>
&emsp;[Dataset](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz) (about 700 MB)<br>
&emsp;[Baidu Netdisk](https://pan.baidu.com/s/1kB4HGyfBjl5ZIGcJPWezNA) (extraction code: bc2y)<br>
&emsp;After downloading, place the dataset in the root directory of the code.<br>
<h3>4. Project steps:</h3>
<h4>---4.1 Step 1: data preparation</h4>
<br>&emsp;Run python prepare_data.py<br>
&emsp;The processed data files are generated in the current directory.<br>
<h4>---4.2 Step 2: run the GMF model (Task 1)</h4>
<br>&emsp;Task 1: following the description of the GMF model in the paper, complete the computation of the output preds in the forward function of the GMF class in gmf.py; 2-3 lines of code are enough.<br>
&emsp;Once completed, run python gmf.py; hyperparameters can be set as needed.<br><br>
<img src='234.png'><br>
<h4>---4.3 Step 3: run the MLP model (Task 2)</h4>
<br>&emsp;Task 2: following the description of the MLP model in the paper, fill in the two numbers that were removed from the line defining the self.out layer in the __init__ function of the MLP class in mlp.py, so that the output layer is correct.<br>
&emsp;Once completed, run python mlp.py; hyperparameters can be set as needed.<br><br>
<img src='345.png'><br>
<h4>---4.4 Step 4: run the NeuMF model (Task 3)</h4>
<br>&emsp;Task 3: following the description of the NeuMF model in the paper, on the line that computes emb_vector in the forward function of the NeuMF class in neumf.py, replace 'tensor1' and 'tensor2' with the two appropriate tensors from the code so that the concatenated vector is correct.<br>
&emsp;Once completed, run python neumf.py; hyperparameters can be set as needed.<br><br>
<img src='567.png'><br>
<h4>---4.5 Step 5: experiments and comparison of results</h4>
<br>&emsp;If you are interested, you can follow the parameters in run_experiments.sh to run further experiments with the three models, tune their hyperparameters, and analyze and visualize the results with results_summary.ipynb; a minimal end-to-end example is given below.<br>
&emsp;
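&emsp;A minimal end-to-end sketch (hyperparameters and pretrained-model filenames are copied from run_experiments.sh; adjust them to the checkpoints you actually trained):<br>
&emsp;python prepare_data.py<br>
&emsp;python gmf.py --batch_size 1024 --lr 0.01 --n_emb 8 --epochs 30<br>
&emsp;python mlp.py --batch_size 1024 --lr 0.03 --layers "[128, 64, 32]" --epochs 30 --lr_scheduler --validate_every 2<br>
&emsp;python neumf.py --batch_size 1024 --lr 0.001 --n_emb 8 --layers "[128, 64, 32]" --dropouts "[0.,0.]" --mf_pretrain "GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt" --mlp_pretrain "MLP_bs_1024_reg_00_lr_003_n_emb_64_ll_32_dp_wodp_lrnr_adam_lrs_wlrs.pt" --freeze 1 --epochs 4 --learner "SGD"<br>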