Commit 49a7b493 by 20200318111

Recommender system assignment

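# gmf.py: Generalized Matrix Factorization (GMF) model with its training and evaluation script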
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
help="npz file with dataset")
parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
help="train matrix for faster iteration")
parser.add_argument("--epochs", type=int, default=20,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=1024,
help="batch size.")
parser.add_argument("--n_emb", type=int, default=8,
help="embedding size.")
parser.add_argument("--lr", type=float, default=0.01,
help="if lr_scheduler this will be max_lr")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
parser.add_argument("--lr_scheduler", action="store_true",
help="boolean to set the use of CyclicLR during training")
parser.add_argument("--validate_every", type=int, default=1,
help="validate every n epochs")
parser.add_argument("--save_model", type=int, default=1)
parser.add_argument("--n_neg", type=int, default=4,
help="number of negative instances to consider per positive instance")
parser.add_argument("--topk", type=int, default=10,
help="number of items to retrieve for recommendation")
return parser.parse_args()
class GMF(nn.Module):
def __init__(self, n_user, n_item, n_emb=8):
super(GMF, self).__init__()
self.n_emb = n_emb
self.n_user = n_user
self.n_item = n_item
self.embeddings_user = nn.Embedding(n_user, n_emb)
self.embeddings_item = nn.Embedding(n_item, n_emb)
self.out = nn.Linear(in_features=n_emb, out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
elif isinstance(m, nn.Linear):
nn.init.uniform_(m.weight)
def forward(self, users, items):
user_emb = self.embeddings_user(users)
item_emb = self.embeddings_item(items)
# Task-1: complete the process to compute preds
# preds = '...'
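# GMF scores a user-item pair with the element-wise product of their embeddings,
# projected to a single logit by self.out and squashed with a sigmoid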
preds = torch.sigmoid(self.out(torch.mul(user_emb, item_emb)))
return preds
def train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg):
model.train()
train_dataset = get_train_instances(train_ratings,
negatives,
n_items,
n_neg)
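# get_train_instances pairs every observed (user, item) interaction (label 1)
# with n_neg sampled unobserved items for that user (label 0)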
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
num_workers=4,
shuffle=True)
train_steps = (len(train_loader.dataset) // train_loader.batch_size) + 1
running_loss=0
for data in train_loader:
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
optimizer.zero_grad()
preds = model(users, items)
loss = criterion(preds.squeeze(1), labels)
loss.backward()
optimizer.step()
# step the CyclicLR scheduler once per batch, after the optimizer update
if scheduler:
scheduler.step()
running_loss += loss.item()
return running_loss/train_steps
def evaluate(model, test_loader, use_cuda, topk):
model.eval()
scores=[]
with torch.no_grad():
for data in test_loader:
users = data[:,0]
items = data[:,1]
labels = data[:,2].float()
if use_cuda:
users, items, labels = users.cuda(), items.cuda(), labels.cuda()
preds = model(users, items)
items_cpu = items.cpu().numpy()
preds_cpu = preds.squeeze(1).detach().cpu().numpy()
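# each user's test block holds the held-out positive followed by 99 sampled
# negatives (100 items), so the scores are ranked per chunk of 100 below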
split_chunks = preds_cpu.shape[0]//100
litems = np.split(items_cpu, split_chunks)
lpreds = np.split(preds_cpu, split_chunks)
scores += [get_scores(it,pr,topk) for it,pr in zip(litems,lpreds)]
hits = [s[0] for s in scores]
ndcgs = [s[1] for s in scores]
return (np.array(hits).mean(),np.array(ndcgs).mean())
def checkpoint(model, modelpath):
torch.save(model.state_dict(), modelpath)
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
train_matrix = args.train_matrix
modeldir = args.modeldir
n_emb = args.n_emb
batch_size = args.batch_size
epochs = args.epochs
learner = args.learner
lr = args.lr
lr_scheduler = args.lr_scheduler
lrs = "wlrs" if lr_scheduler else "wolrs"
validate_every = args.validate_every
save_model = args.save_model
topk = args.topk
n_neg = args.n_neg
modelfname = "GMF" + \
"_".join(["_bs", str(batch_size)]) + \
"_".join(["_lr", str(lr).replace(".", "")]) + \
"_".join(["_n_emb", str(n_emb)]) + \
"_".join(["_lrnr", learner]) + \
"_".join(["_lrs", lrs]) + \
".pt"
if not os.path.exists(modeldir): os.makedirs(modeldir)
modelpath = os.path.join(modeldir, modelfname)
resultsdfpath = os.path.join(modeldir, 'results_df.p')
dataset = np.load(os.path.join(datadir, dataname))
train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
test_ratings, negatives = dataset['test_negative'], dataset['negatives']
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
test_loader = DataLoader(dataset=test_ratings,
batch_size=1000,
shuffle=False
)
model = GMF(n_users, n_items, n_emb=n_emb)
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)
criterion = nn.BCELoss()
training_steps = ((len(train_ratings)+len(train_ratings)*n_neg)//batch_size)+1
step_size = training_steps*3 # one cycle every 6 epochs
cycle_momentum=True
if learner.lower() == "adagrad" or learner.lower()=="adam":
cycle_momentum=False
if lr_scheduler:
scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
cycle_momentum=cycle_momentum)
else:
scheduler = None
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
best_hr, best_ndcg, best_iter = 0, 0, 0
for epoch in range(1,epochs+1):
t1 = time()
loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg)
t2 = time()
if epoch % validate_every == 0:
(hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
if hr > best_hr:
iter_loss, best_hr, best_ndcg, best_iter, train_time = \
loss, hr, ndcg, epoch, t2-t1
if save_model:
checkpoint(model, modelpath)
print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
if save_model:
print("The best GMF model is saved to {}".format(modelpath))
if save_model:
cols = ["modelname", "iter_loss","best_hr", "best_ndcg", "best_iter","train_time"]
vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
if not os.path.isfile(resultsdfpath):
results_df = pd.DataFrame(columns=cols)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
else:
results_df = pd.read_pickle(resultsdfpath)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
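# mlp.py: MLP model (concatenated embeddings passed through dense layers) with its training script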
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import train, evaluate, checkpoint
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
help="npz file with dataset")
parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
help="train matrix for faster iteration")
parser.add_argument("--epochs", type=int, default=20,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=1024,
help="batch size.")
parser.add_argument("--layers", type=str, default="[64,32,16,8]",
help="layer architecture. The first elements is used for the embedding \
layers and equals n_emb*2")
parser.add_argument("--dropouts", type=str, default="[0,0,0]",
help="dropout per dense layer. len(dropouts) = len(layers)-1")
parser.add_argument("--l2reg", type=float, default=0.,
help="l2 regularization")
parser.add_argument("--lr", type=float, default=0.01,
help="if lr_scheduler this will be max_lr")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
parser.add_argument("--lr_scheduler", action="store_true",
help="use CyclicLR during training")
parser.add_argument("--validate_every", type=int, default=1,
help="validate every n epochs")
parser.add_argument("--save_model", type=int, default=1)
parser.add_argument("--n_neg", type=int, default=4,
help="number of negative instances to consider per positive instance.")
parser.add_argument("--topk", type=int, default=10,
help="number of items to retrieve for recommendation.")
return parser.parse_args()
class MLP(nn.Module):
"""
Concatenate Embeddings that are then passed through a series of Dense layers
"""
def __init__(self, n_user, n_item, layers, dropouts):
super(MLP, self).__init__()
self.layers = layers
self.n_layers = len(layers)
self.dropouts = dropouts
self.n_user = n_user
self.n_item = n_item
self.embeddings_user = nn.Embedding(n_user, int(layers[0]/2))
self.embeddings_item = nn.Embedding(n_item, int(layers[0]/2))
self.mlp = nn.Sequential()
for i in range(1,self.n_layers):
self.mlp.add_module("linear%d" %i, nn.Linear(layers[i-1],layers[i]))
self.mlp.add_module("relu%d" %i, torch.nn.ReLU())
self.mlp.add_module("dropout%d" %i , torch.nn.Dropout(p=dropouts[i-1]))
# Task-2: replace '?' with two proper numbers in below code
# self.out = nn.Linear(in_features='?', out_features='?')
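# the output layer maps the activations of the last dense layer (layers[-1] units) to a single score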
self.out = nn.Linear(in_features= layers[-1], out_features= 1)
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
def forward(self, users, items):
user_emb = self.embeddings_user(users)
item_emb = self.embeddings_item(items)
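# concatenate the user and item embeddings (each layers[0]//2 wide) into a layers[0]-dim input for the MLP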
emb_vector = torch.cat([user_emb,item_emb], dim=1)
emb_vector = self.mlp(emb_vector)
preds = torch.sigmoid(self.out(emb_vector))
return preds
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
train_matrix = args.train_matrix
modeldir = args.modeldir
layers = eval(args.layers)
ll = str(layers[-1]) #last layer
dropouts = eval(args.dropouts)
dp = "wdp" if dropouts[0]!=0 else "wodp"
l2reg = args.l2reg
n_emb = int(layers[0]/2)
batch_size = args.batch_size
epochs = args.epochs
learner = args.learner
lr = args.lr
lr_scheduler = args.lr_scheduler
lrs = "wlrs" if lr_scheduler else "wolrs"
validate_every = args.validate_every
save_model = args.save_model
topk = args.topk
n_neg = args.n_neg
modelfname = "MLP" + \
"_".join(["_bs", str(batch_size)]) + \
"_".join(["_reg", str(l2reg).replace(".", "")]) + \
"_".join(["_lr", str(lr).replace(".", "")]) + \
"_".join(["_n_emb", str(n_emb)]) + \
"_".join(["_ll", ll]) + \
"_".join(["_dp", dp]) + \
"_".join(["_lrnr", learner]) + \
"_".join(["_lrs", lrs]) + \
".pt"
modelpath = os.path.join(modeldir, modelfname)
resultsdfpath = os.path.join(modeldir, 'results_df.p')
dataset = np.load(os.path.join(datadir, dataname))
train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
test_ratings, negatives = dataset['test_negative'], dataset['negatives']
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
test_loader = DataLoader(dataset=test_ratings,
batch_size=1000,
shuffle=False
)
model = MLP(n_users, n_items, layers, dropouts)
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=l2reg)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2reg)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9, nesterov=True)
criterion = nn.BCELoss()
training_steps = ((len(train_ratings)+len(train_ratings)*n_neg)//batch_size)+1
step_size = training_steps*3 # one cycle every 6 epochs
cycle_momentum=True
if learner.lower() == "adagrad" or learner.lower()=="adam":
cycle_momentum=False
if lr_scheduler:
scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
cycle_momentum=cycle_momentum)
else:
scheduler = None
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
best_hr, best_ndcg, best_iter = 0, 0, 0
for epoch in range(1,epochs+1):
t1 = time()
loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg)
t2 = time()
if epoch % validate_every == 0:
(hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
if hr > best_hr:
iter_loss, best_hr, best_ndcg, best_iter, train_time = \
loss, hr, ndcg, epoch, t2-t1
if save_model:
checkpoint(model, modelpath)
print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
if save_model:
print("The best MLP model is saved to {}".format(modelpath))
if save_model:
cols = ["modelname", "iter_loss","best_hr", "best_ndcg", "best_iter","train_time"]
vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
if not os.path.isfile(resultsdfpath):
results_df = pd.DataFrame(columns=cols)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
else:
results_df = pd.read_pickle(resultsdfpath)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
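# NeuMF training script: fuses a GMF branch and an MLP branch,
# optionally initialised from the pretrained GMF and MLP models above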
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import GMF, train, evaluate, checkpoint
from mlp import MLP
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
parser = argparse.ArgumentParser()
# dirnames
parser.add_argument("--datadir", type=str, default=".",
help="data directory.")
parser.add_argument("--modeldir", type=str, default="./models",
help="models directory")
parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
help="npz file with dataset")
parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
help="train matrix for faster iteration")
# general parameter
parser.add_argument("--epochs", type=int, default=20,
help="number of epochs.")
parser.add_argument("--batch_size", type=int, default=1024,
help="batch size.")
parser.add_argument("--lr", type=float, default=0.001,
help="learning rate.")
parser.add_argument("--learner", type=str, default="adam",
help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
parser.add_argument("--lr_scheduler", action="store_true",
help="boolean to set the use of CyclicLR during training")
# GMF set up
parser.add_argument("--n_emb", type=int, default=8,
help="embedding size for the GMF part.")
# MLP set up
parser.add_argument("--layers", type=str, default="[64,32,16,8]",
help="layer architecture. The first elements is used for the embedding \
layers for the MLP part and equals n_emb*2")
parser.add_argument("--dropouts", type=str, default="[0.,0.,0.]",
help="dropout per dense layer. len(dropouts) = len(layers)-1")
# regularization
parser.add_argument("--l2reg", type=float, default=0.,
help="l2 regularization.")
# Pretrained model names
parser.add_argument("--freeze", type=int, default=0,
help="freeze all but the last output layer where \
weights are combined")
parser.add_argument("--mf_pretrain", type=str, default="",
help="Specify the pretrain model filename for GMF part. \
If empty, no pretrain will be used")
parser.add_argument("--mlp_pretrain", type=str, default="",
help="Specify the pretrain model filename for MLP part. \
If empty, no pretrain will be used")
# Experiment set up
parser.add_argument("--validate_every", type=int, default=1,
help="validate every n epochs")
parser.add_argument("--save_model", type=int, default=1)
parser.add_argument("--n_neg", type=int, default=4,
help="number of negative instances to consider per positive instance.")
parser.add_argument("--topk", type=int, default=10,
help="number of items to retrieve for recommendation.")
return parser.parse_args()
class NeuMF(nn.Module):
def __init__(self, n_user, n_item, n_emb, layers, dropouts):
super(NeuMF, self).__init__()
self.layers = layers
self.n_layers = len(layers)
self.dropouts = dropouts
self.n_user = n_user
self.n_item = n_item
self.mf_embeddings_user = nn.Embedding(n_user, n_emb)
self.mf_embeddings_item = nn.Embedding(n_item, n_emb)
self.mlp_embeddings_user = nn.Embedding(n_user, layers[0]//2)
self.mlp_embeddings_item = nn.Embedding(n_item, layers[0]//2)
self.mlp = nn.Sequential()
for i in range(1,self.n_layers):
self.mlp.add_module("linear%d" %i, nn.Linear(layers[i-1],layers[i]))
self.mlp.add_module("relu%d" %i, torch.nn.ReLU())
self.mlp.add_module("dropout%d" %i , torch.nn.Dropout(p=dropouts[i-1]))
self.out = nn.Linear(in_features=n_emb+layers[-1], out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
nn.init.normal_(m.weight)
def forward(self, users, items):
mf_user_emb = self.mf_embeddings_user(users)
mf_item_emb = self.mf_embeddings_item(items)
mlp_user_emb = self.mlp_embeddings_user(users)
mlp_item_emb = self.mlp_embeddings_item(items)
mf_emb_vector = mf_user_emb*mf_item_emb
mlp_emb_vector = torch.cat([mlp_user_emb,mlp_item_emb], dim=1)
mlp_emb_vector = self.mlp(mlp_emb_vector)
# Task-3: replace 'tensor1' and 'tensor2' with two proper tensors in below code
# emb_vector = torch.cat(['tensor1','tensor2'], dim=1)
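# NeuMF fuses the GMF branch (element-wise product) and the MLP branch by
# concatenating their representations before the final prediction layer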
emb_vector = torch.cat([mf_emb_vector, mlp_emb_vector], dim=1)
preds = torch.sigmoid(self.out(emb_vector))
return preds
def load_pretrain_model(model, gmf_model, mlp_model):
# MF embeddings
model.mf_embeddings_item.weight = gmf_model.embeddings_item.weight
model.mf_embeddings_user.weight = gmf_model.embeddings_user.weight
# MLP embeddings
model.mlp_embeddings_item.weight = mlp_model.embeddings_item.weight
model.mlp_embeddings_user.weight = mlp_model.embeddings_user.weight
# MLP layers
model_dict = model.state_dict()
mlp_layers_dict = mlp_model.state_dict()
mlp_layers_dict = {k: v for k, v in mlp_layers_dict.items() if 'linear' in k}
model_dict.update(mlp_layers_dict)
model.load_state_dict(model_dict)
# Prediction weights
mf_prediction_weight, mf_prediction_bias = gmf_model.out.weight, gmf_model.out.bias
mlp_prediction_weight, mlp_prediction_bias = mlp_model.out.weight, mlp_model.out.bias
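# combine the two pretrained single-output layers: concatenate their weights and
# give both branches an equal trade-off (the 0.5 factor below)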
new_weight = torch.cat([mf_prediction_weight, mlp_prediction_weight], dim=1)
new_bias = mf_prediction_bias + mlp_prediction_bias
model.out.weight = torch.nn.Parameter(0.5*new_weight)
model.out.bias = torch.nn.Parameter(0.5*new_bias)
return model
if __name__ == '__main__':
args = parse_args()
datadir = args.datadir
dataname = args.dataname
train_matrix = args.train_matrix
modeldir = args.modeldir
epochs = args.epochs
batch_size = args.batch_size
lr = args.lr
learner = args.learner
lr_scheduler = args.lr_scheduler
lrs = "wlrs" if lr_scheduler else "wolrs"
n_emb = args.n_emb
layers = eval(args.layers)
dropouts = eval(args.dropouts)
freeze = bool(args.freeze)
mf_pretrain = os.path.join(modeldir, args.mf_pretrain)
mlp_pretrain = os.path.join(modeldir, args.mlp_pretrain)
with_pretrained = "wpret" if os.path.isfile(mf_pretrain) else "wopret"
is_frozen = "frozen" if freeze else "trainable"
l2reg = args.l2reg
validate_every = args.validate_every
save_model = bool(args.save_model)
n_neg = args.n_neg
topk = args.topk
modelfname = "NeuMF" + \
"_" + with_pretrained + \
"_" + is_frozen + \
"_" + learner + \
"_".join(["_lrs", lrs]) + \
".pt"
modelpath = os.path.join(modeldir, modelfname)
resultsdfpath = os.path.join(modeldir, 'results_df.p')
dataset = np.load(os.path.join(datadir, dataname))
train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
test_ratings, negatives = dataset['test_negative'], dataset['negatives']
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()
test_loader = DataLoader(dataset=test_ratings,
batch_size=1000,
shuffle=False
)
model = NeuMF(n_users, n_items, n_emb, layers, dropouts)
if os.path.isfile(mf_pretrain) and os.path.isfile(mlp_pretrain):
gmf_model = GMF(n_users, n_items, n_emb)
gmf_model.load_state_dict(torch.load(mf_pretrain))
mlp_model = MLP(n_users, n_items, layers, dropouts)
mlp_model.load_state_dict(torch.load(mlp_pretrain))
model = load_pretrain_model(model, gmf_model, mlp_model)
print("Load pretrained GMF {} and MLP {} models done. ".format(mf_pretrain, mlp_pretrain))
use_cuda = torch.cuda.is_available()
if use_cuda:
model = model.cuda()
if freeze:
for name, layer in model.named_parameters():
if not ("out" in name):
layer.requires_grad = False
# or this and pass train_parameters to the optimizer
# train_parameters = model.out.parameters() if freeze else model.parameters()
if learner.lower() == "adagrad":
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=l2reg)
elif learner.lower() == "rmsprop":
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9)
elif learner.lower() == "adam":
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2reg)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=l2reg,
momentum=0.9, nesterov=True)
criterion = nn.BCELoss()
# model_parameters = filter(lambda p: p.requires_grad, model.parameters())
# trainable_params = sum([np.prod(p.size()) for p in model_parameters])
# print(trainable_params)
training_steps = ((len(train_ratings)+len(train_ratings)*n_neg)//batch_size)+1
step_size = training_steps*10
cycle_momentum=True
if learner.lower() == "adagrad" or learner.lower()=="adam":
cycle_momentum=False
if lr_scheduler:
scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
cycle_momentum=cycle_momentum)
else:
scheduler = None
best_hr, best_ndcg, best_iter = 0, 0, 0
for epoch in range(1,epochs+1):
t1 = time()
loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
use_cuda, train_ratings, negatives, n_items, n_neg)
t2 = time()
if epoch % validate_every == 0:
(hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
if hr > best_hr:
iter_loss, best_hr, best_ndcg, best_iter, train_time = \
loss, hr, ndcg, epoch, t2-t1
if save_model:
checkpoint(model, modelpath)
print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
if save_model:
print("The best NeuMF model is saved to {}".format(modelpath))
if save_model:
cols = ["modelname", "iter_loss","best_hr", "best_ndcg", "best_iter","train_time"]
vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
if not os.path.isfile(resultsdfpath):
results_df = pd.DataFrame(columns=cols)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
else:
results_df = pd.read_pickle(resultsdfpath)
experiment_df = pd.DataFrame(data=[vals], columns=cols)
results_df = results_df.append(experiment_df, ignore_index=True)
results_df.to_pickle(resultsdfpath)
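# Data preparation script: builds the random and leave-one-out (He et al., 2017)
# train/test splits from the Amazon Movies & TV reviews dataset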
import numpy as np
import pandas as pd
import gzip
import pickle
import argparse
import scipy.sparse as sp
from time import time
from pathlib import Path
from scipy.sparse import save_npz
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
def array2mtx(interactions):
num_users = interactions[:,0].max()
num_items = interactions[:,1].max()
mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
for user, item, rating in interactions:
mat[user, item] = rating
return mat.tocsr()
def standard_split(df, data_path):
# Cardinality
n_users = df.user.nunique()
n_items = df.item.nunique()
n_ranks = df['rank'].nunique()
train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)
# Save
np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())
def neuralcf_split(df, data_path):
# Xiangnan He, et al, 2017 train/test split with implicit negative feedback
# sort by rank
dfc = df.copy()
# Cardinality
n_users = df.user.nunique()
n_items = df.item.nunique()
dfc.sort_values(['user','rank'], ascending=[True,True], inplace=True)
dfc.reset_index(inplace=True, drop=True)
# use last ratings for testing and all the previous for training
test = dfc.groupby('user').tail(1)
train = pd.merge(dfc, test, on=['user','item'],
how='outer', suffixes=('', '_y'))
train = train[train.rating_y.isnull()]
test = test[['user','item','rating']]
train = train[['user','item','rating']]
# select 99 random movies per user that were never rated by that user
all_items = dfc.item.unique()
rated_items = (dfc.groupby("user")['item']
.apply(list)
.reset_index()
).item.tolist()
def sample_not_rated(item_list, rseed=1, n=99):
np.random.seed(rseed)
return np.random.choice(np.setdiff1d(all_items, item_list), n)
print("sampling not rated items...")
start = time()
non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
end = time() - start
print("sampling took {} min".format(round(end/60,2)))
negative = pd.DataFrame({'negative':non_rated_items})
negative[['item_n'+str(i) for i in range(99)]] =\
pd.DataFrame(negative.negative.values.tolist(), index= negative.index)
negative.drop('negative', axis=1, inplace=True)
negative = negative.stack().reset_index()
negative = negative.iloc[:, [0,2]]
negative.columns = ['user','item']
negative['rating'] = 0
assert negative.shape[0] == len(non_rated_items)*99
test_negative = (pd.concat([test,negative])
.sort_values('user', ascending=True)
.reset_index(drop=True)
)
# Ensuring that the 1st element every 100 is the rated item. This is
# fundamental for testing
test_negative.sort_values(['user', 'rating'], ascending=[True,False], inplace=True)
assert np.all(test_negative.values[0::100][:,2] != 0)
# Save
np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
test_negative=test_negative.values, negatives=np.array(non_rated_items),
n_users=n_users, n_items=n_items)
# Save training as sparse matrix
print("saving training set as sparse matrix...")
train_mtx = array2mtx(train.values)
save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)
def prepare_amazon(data_path, input_fname):
df = pd.read_json(data_path/input_fname, lines=True)
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']
df = df[keep_cols]
df.columns = new_colnames
# rank of items bought
df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
df.drop("timestamp", axis=1, inplace=True)
# mapping user and item ids to integers
user_mappings = {k:v for v,k in enumerate(df.user.unique())}
item_mappings = {k:v for v,k in enumerate(df.item.unique())}
df['user'] = df['user'].map(user_mappings)
df['item'] = df['item'].map(item_mappings)
df = df[['user','item','rank','rating']].astype(np.int64)
pickle.dump(user_mappings, open(data_path/'user_mappings.p', 'wb'))
pickle.dump(item_mappings, open(data_path/'item_mappings.p', 'wb'))
standard_split(df, data_path)
neuralcf_split(df, data_path)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="prepare Amazon dataset")
parser.add_argument("--input_dir",type=str, default=".")
parser.add_argument("--input_fname",type=str, default="reviews_Movies_and_TV_5.json.gz")
args = parser.parse_args()
DATA_PATH = Path(args.input_dir)
reviews = args.input_fname
prepare_amazon(DATA_PATH, reviews)