Commit 49a7b493 by 20200318111

Recommender systems homework

import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--datadir", type=str, default=".",
        help="data directory.")
    parser.add_argument("--modeldir", type=str, default="./models",
        help="models directory")
    parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
        help="npz file with dataset")
    parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
        help="train matrix for faster iteration")
    parser.add_argument("--epochs", type=int, default=20,
        help="number of epochs.")
    parser.add_argument("--batch_size", type=int, default=1024,
        help="batch size.")
    parser.add_argument("--n_emb", type=int, default=8,
        help="embedding size.")
    parser.add_argument("--lr", type=float, default=0.01,
        help="if lr_scheduler this will be max_lr")
    parser.add_argument("--learner", type=str, default="adam",
        help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
    parser.add_argument("--lr_scheduler", action="store_true",
        help="boolean to set the use of CyclicLR during training")
    parser.add_argument("--validate_every", type=int, default=1,
        help="validate every n epochs")
    parser.add_argument("--save_model", type=int, default=1)
    parser.add_argument("--n_neg", type=int, default=4,
        help="number of negative instances to consider per positive instance")
    parser.add_argument("--topk", type=int, default=10,
        help="number of items to retrieve for recommendation")
    return parser.parse_args()
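# Example invocation (illustrative; the module name gmf.py is inferred from the
# `from gmf import ...` statement in the MLP script):
#
#   python gmf.py --datadir ./data --n_emb 8 --batch_size 1024 --lr 0.01 --lr_scheduler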
class GMF(nn.Module):
    def __init__(self, n_user, n_item, n_emb=8):
        super(GMF, self).__init__()

        self.n_emb = n_emb
        self.n_user = n_user
        self.n_item = n_item

        self.embeddings_user = nn.Embedding(n_user, n_emb)
        self.embeddings_item = nn.Embedding(n_item, n_emb)
        self.out = nn.Linear(in_features=n_emb, out_features=1)

        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight)
            elif isinstance(m, nn.Linear):
                nn.init.uniform_(m.weight)

    def forward(self, users, items):
        user_emb = self.embeddings_user(users)
        item_emb = self.embeddings_item(items)
        # Task-1: complete the process to compute preds
        # preds = '...'
        # element-wise product of user and item embeddings, then a sigmoid-activated linear layer
        preds = torch.sigmoid(self.out(torch.mul(user_emb, item_emb)))
        return preds
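# Minimal usage sketch for GMF (illustrative only; the sizes below are made up and
# the snippet is not executed as part of this module):
#
#   model = GMF(n_user=100, n_item=500, n_emb=8)
#   users = torch.randint(0, 100, (4,))
#   items = torch.randint(0, 500, (4,))
#   preds = model(users, items)   # shape (4, 1), values in (0, 1)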
def train(model, criterion, optimizer, scheduler, epoch, batch_size,
          use_cuda, train_ratings, negatives, n_items, n_neg):
    model.train()
    train_dataset = get_train_instances(train_ratings,
                                        negatives,
                                        n_items,
                                        n_neg)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              num_workers=4,
                              shuffle=True)
    train_steps = (len(train_loader.dataset) // train_loader.batch_size) + 1
    running_loss = 0
    for data in train_loader:
        users = data[:, 0]
        items = data[:, 1]
        labels = data[:, 2].float()
        if use_cuda:
            users, items, labels = users.cuda(), items.cuda(), labels.cuda()
        optimizer.zero_grad()
        preds = model(users, items)
        loss = criterion(preds.squeeze(1), labels)
        loss.backward()
        optimizer.step()
        # CyclicLR is a per-batch scheduler; step it after the optimizer update
        if scheduler:
            scheduler.step()
        running_loss += loss.item()
    return running_loss / train_steps
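# `get_train_instances` lives in utils, which is not part of this diff. From how its
# output is consumed above (data[:, 0]=user, data[:, 1]=item, data[:, 2]=label), it is
# assumed to build an (N, 3) array that pairs every observed (user, item) with label 1
# plus `n_neg` randomly sampled unobserved items with label 0, e.g.:
#
#   train_dataset = get_train_instances(train_ratings, negatives, n_items, n_neg)
#   # -> array/tensor of rows [user, item, label]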
def evaluate(model, test_loader, use_cuda, topk):
    model.eval()
    scores = []
    with torch.no_grad():
        for data in test_loader:
            users = data[:, 0]
            items = data[:, 1]
            labels = data[:, 2].float()
            if use_cuda:
                users, items, labels = users.cuda(), items.cuda(), labels.cuda()
            preds = model(users, items)

            items_cpu = items.cpu().numpy()
            preds_cpu = preds.squeeze(1).detach().cpu().numpy()

            # each user contributes exactly 100 test rows (1 positive + 99 negatives)
            split_chunks = preds_cpu.shape[0] // 100
            litems = np.split(items_cpu, split_chunks)
            lpreds = np.split(preds_cpu, split_chunks)
            scores += [get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)]
    hits = [s[0] for s in scores]
    ndcgs = [s[1] for s in scores]
    return (np.array(hits).mean(), np.array(ndcgs).mean())
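# `get_scores` is defined in utils (not shown in this diff). Each 100-item chunk holds
# one positive (the held-out rated item, placed first when the test file is built) and
# 99 sampled negatives, so it presumably ranks the chunk by predicted score and returns
# (hit_ratio@topk, ndcg@topk) for that user, roughly:
#
#   ranked = items_chunk[np.argsort(-preds_chunk)][:topk]
#   hr = int(positive_item in ranked)
#   ndcg = 1. / np.log2(ranked.tolist().index(positive_item) + 2) if hr else 0.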
def checkpoint(model, modelpath):
    torch.save(model.state_dict(), modelpath)
if __name__ == '__main__':

    args = parse_args()

    datadir = args.datadir
    dataname = args.dataname
    train_matrix = args.train_matrix
    modeldir = args.modeldir
    n_emb = args.n_emb
    batch_size = args.batch_size
    epochs = args.epochs
    learner = args.learner
    lr = args.lr
    lr_scheduler = args.lr_scheduler
    lrs = "wlrs" if lr_scheduler else "wolrs"
    validate_every = args.validate_every
    save_model = args.save_model
    topk = args.topk
    n_neg = args.n_neg

    modelfname = "GMF" + \
        "_".join(["_bs", str(batch_size)]) + \
        "_".join(["_lr", str(lr).replace(".", "")]) + \
        "_".join(["_n_emb", str(n_emb)]) + \
        "_".join(["_lrnr", learner]) + \
        "_".join(["_lrs", lrs]) + \
        ".pt"
    if not os.path.exists(modeldir): os.makedirs(modeldir)
    modelpath = os.path.join(modeldir, modelfname)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')

    dataset = np.load(os.path.join(datadir, dataname))
    train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
    test_ratings, negatives = dataset['test_negative'], dataset['negatives']
    n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

    test_loader = DataLoader(dataset=test_ratings,
                             batch_size=1000,
                             shuffle=False
                             )

    model = GMF(n_users, n_items, n_emb=n_emb)

    if learner.lower() == "adagrad":
        optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
    elif learner.lower() == "rmsprop":
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
    elif learner.lower() == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)
    criterion = nn.BCELoss()

    training_steps = ((len(train_ratings) + len(train_ratings)*n_neg) // batch_size) + 1
    step_size = training_steps*3  # one cycle every 6 epochs
    cycle_momentum = True
    if learner.lower() == "adagrad" or learner.lower() == "adam":
        cycle_momentum = False
    if lr_scheduler:
        scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
                             cycle_momentum=cycle_momentum)
    else:
        scheduler = None

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    best_hr, best_ndcg, best_iter = 0, 0, 0
    for epoch in range(1, epochs+1):
        t1 = time()
        loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
                     use_cuda, train_ratings, negatives, n_items, n_neg)
        t2 = time()
        if epoch % validate_every == 0:
            (hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
            print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
                  format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
            if hr > best_hr:
                iter_loss, best_hr, best_ndcg, best_iter, train_time = \
                    loss, hr, ndcg, epoch, t2-t1
                if save_model:
                    checkpoint(model, modelpath)

    print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
    if save_model:
        print("The best GMF model is saved to {}".format(modelpath))

    if save_model:
        cols = ["modelname", "iter_loss", "best_hr", "best_ndcg", "best_iter", "train_time"]
        vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
        if not os.path.isfile(resultsdfpath):
            results_df = pd.DataFrame(columns=cols)
            experiment_df = pd.DataFrame(data=[vals], columns=cols)
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
        else:
            results_df = pd.read_pickle(resultsdfpath)
            experiment_df = pd.DataFrame(data=[vals], columns=cols)
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
import numpy as np
import pandas as pd
import os
import torch
import argparse
import heapq
import pdb
from tqdm import tqdm,trange
from time import time
from scipy.sparse import load_npz
from torch import nn
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import train, evaluate, checkpoint
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--datadir", type=str, default=".",
        help="data directory.")
    parser.add_argument("--modeldir", type=str, default="./models",
        help="models directory")
    parser.add_argument("--dataname", type=str, default="neuralcf_split.npz",
        help="npz file with dataset")
    parser.add_argument("--train_matrix", type=str, default="neuralcf_train_sparse.npz",
        help="train matrix for faster iteration")
    parser.add_argument("--epochs", type=int, default=20,
        help="number of epochs.")
    parser.add_argument("--batch_size", type=int, default=1024,
        help="batch size.")
    parser.add_argument("--layers", type=str, default="[64,32,16,8]",
        help="layer architecture. The first element is used for the embedding \
        layers and equals n_emb*2")
    parser.add_argument("--dropouts", type=str, default="[0,0,0]",
        help="dropout per dense layer. len(dropouts) = len(layers)-1")
    parser.add_argument("--l2reg", type=float, default=0.,
        help="l2 regularization")
    parser.add_argument("--lr", type=float, default=0.01,
        help="if lr_scheduler this will be max_lr")
    parser.add_argument("--learner", type=str, default="adam",
        help="Specify an optimizer: adagrad, adam, rmsprop, sgd")
    parser.add_argument("--lr_scheduler", action="store_true",
        help="use CyclicLR during training")
    parser.add_argument("--validate_every", type=int, default=1,
        help="validate every n epochs")
    parser.add_argument("--save_model", type=int, default=1)
    parser.add_argument("--n_neg", type=int, default=4,
        help="number of negative instances to consider per positive instance.")
    parser.add_argument("--topk", type=int, default=10,
        help="number of items to retrieve for recommendation.")
    return parser.parse_args()
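# Example invocation (illustrative; the file name mlp.py is an assumption, as this
# diff does not show the file headers):
#
#   python mlp.py --layers "[64,32,16,8]" --dropouts "[0,0,0]" --lr 0.01 --lr_scheduler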
class MLP(nn.Module):
    """
    Concatenate Embeddings that are then passed through a series of Dense layers
    """
    def __init__(self, n_user, n_item, layers, dropouts):
        super(MLP, self).__init__()

        self.layers = layers
        self.n_layers = len(layers)
        self.dropouts = dropouts
        self.n_user = n_user
        self.n_item = n_item

        self.embeddings_user = nn.Embedding(n_user, int(layers[0]/2))
        self.embeddings_item = nn.Embedding(n_item, int(layers[0]/2))

        self.mlp = nn.Sequential()
        for i in range(1, self.n_layers):
            self.mlp.add_module("linear%d" % i, nn.Linear(layers[i-1], layers[i]))
            self.mlp.add_module("relu%d" % i, torch.nn.ReLU())
            self.mlp.add_module("dropout%d" % i, torch.nn.Dropout(p=dropouts[i-1]))

        # Task-2: replace '?' with two proper numbers in the code below
        # self.out = nn.Linear(in_features='?', out_features='?')
        self.out = nn.Linear(in_features=layers[-1], out_features=1)

        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight)

    def forward(self, users, items):
        user_emb = self.embeddings_user(users)
        item_emb = self.embeddings_item(items)
        emb_vector = torch.cat([user_emb, item_emb], dim=1)
        emb_vector = self.mlp(emb_vector)
        preds = torch.sigmoid(self.out(emb_vector))
        return preds
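# Minimal usage sketch for MLP (illustrative only; sizes are made up and the snippet
# is not executed as part of this module). With layers=[64, 32, 16, 8] each embedding
# has size 32, the concatenated vector has size 64, and the output is a single
# sigmoid-activated score:
#
#   model = MLP(n_user=100, n_item=500, layers=[64, 32, 16, 8], dropouts=[0, 0, 0])
#   users = torch.randint(0, 100, (4,))
#   items = torch.randint(0, 500, (4,))
#   preds = model(users, items)   # shape (4, 1)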
if __name__ == '__main__':

    args = parse_args()

    datadir = args.datadir
    dataname = args.dataname
    train_matrix = args.train_matrix
    modeldir = args.modeldir
    layers = eval(args.layers)
    ll = str(layers[-1])  # last layer
    dropouts = eval(args.dropouts)
    dp = "wdp" if dropouts[0] != 0 else "wodp"
    l2reg = args.l2reg
    n_emb = int(layers[0]/2)
    batch_size = args.batch_size
    epochs = args.epochs
    learner = args.learner
    lr = args.lr
    lr_scheduler = args.lr_scheduler
    lrs = "wlrs" if lr_scheduler else "wolrs"
    validate_every = args.validate_every
    save_model = args.save_model
    topk = args.topk
    n_neg = args.n_neg

    modelfname = "MLP" + \
        "_".join(["_bs", str(batch_size)]) + \
        "_".join(["_reg", str(l2reg).replace(".", "")]) + \
        "_".join(["_lr", str(lr).replace(".", "")]) + \
        "_".join(["_n_emb", str(n_emb)]) + \
        "_".join(["_ll", ll]) + \
        "_".join(["_dp", dp]) + \
        "_".join(["_lrnr", learner]) + \
        "_".join(["_lrs", lrs]) + \
        ".pt"
    modelpath = os.path.join(modeldir, modelfname)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')

    dataset = np.load(os.path.join(datadir, dataname))
    train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
    test_ratings, negatives = dataset['test_negative'], dataset['negatives']
    n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

    test_loader = DataLoader(dataset=test_ratings,
                             batch_size=1000,
                             shuffle=False
                             )

    model = MLP(n_users, n_items, layers, dropouts)

    if learner.lower() == "adagrad":
        optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=l2reg)
    elif learner.lower() == "rmsprop":
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=l2reg,
                                        momentum=0.9)
    elif learner.lower() == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2reg)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=l2reg,
                                    momentum=0.9, nesterov=True)
    criterion = nn.BCELoss()

    training_steps = ((len(train_ratings) + len(train_ratings)*n_neg) // batch_size) + 1
    step_size = training_steps*3  # one cycle every 6 epochs
    cycle_momentum = True
    if learner.lower() == "adagrad" or learner.lower() == "adam":
        cycle_momentum = False
    if lr_scheduler:
        scheduler = CyclicLR(optimizer, step_size_up=step_size, base_lr=lr/10., max_lr=lr,
                             cycle_momentum=cycle_momentum)
    else:
        scheduler = None

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    best_hr, best_ndcg, best_iter = 0, 0, 0
    for epoch in range(1, epochs+1):
        t1 = time()
        loss = train(model, criterion, optimizer, scheduler, epoch, batch_size,
                     use_cuda, train_ratings, negatives, n_items, n_neg)
        t2 = time()
        if epoch % validate_every == 0:
            (hr, ndcg) = evaluate(model, test_loader, use_cuda, topk)
            print("Epoch: {} {:.2f}s, LOSS = {:.4f}, HR = {:.4f}, NDCG = {:.4f}, validated in {:.2f}s".
                  format(epoch, t2-t1, loss, hr, ndcg, time()-t2))
            if hr > best_hr:
                iter_loss, best_hr, best_ndcg, best_iter, train_time = \
                    loss, hr, ndcg, epoch, t2-t1
                if save_model:
                    checkpoint(model, modelpath)

    print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
    if save_model:
        print("The best MLP model is saved to {}".format(modelpath))

    if save_model:
        cols = ["modelname", "iter_loss", "best_hr", "best_ndcg", "best_iter", "train_time"]
        vals = [modelfname, iter_loss, best_hr, best_ndcg, best_iter, train_time]
        if not os.path.isfile(resultsdfpath):
            results_df = pd.DataFrame(columns=cols)
            experiment_df = pd.DataFrame(data=[vals], columns=cols)
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
        else:
            results_df = pd.read_pickle(resultsdfpath)
            experiment_df = pd.DataFrame(data=[vals], columns=cols)
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
import numpy as np
import pandas as pd
import gzip
import pickle
import argparse
import scipy.sparse as sp
from time import time
from pathlib import Path
from scipy.sparse import save_npz
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
def array2mtx(interactions):
    num_users = interactions[:, 0].max()
    num_items = interactions[:, 1].max()
    mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
    for user, item, rating in interactions:
        mat[user, item] = rating
    return mat.tocsr()
def standard_split(df, data_path):
    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    n_ranks = df['rank'].nunique()
    train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)
    # Save
    np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
        n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())
def neuralcf_split(df, data_path):
    # Xiangnan He, et al., 2017 train/test split with implicit negative feedback
    # sort by rank
    dfc = df.copy()

    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()

    dfc.sort_values(['user', 'rank'], ascending=[True, True], inplace=True)
    dfc.reset_index(inplace=True, drop=True)

    # use the last rating per user for testing and all previous ones for training
    test = dfc.groupby('user').tail(1)
    train = pd.merge(dfc, test, on=['user', 'item'],
        how='outer', suffixes=('', '_y'))
    train = train[train.rating_y.isnull()]
    test = test[['user', 'item', 'rating']]
    train = train[['user', 'item', 'rating']]

    # select 99 random movies per user that were never rated by that user
    all_items = dfc.item.unique()
    rated_items = (dfc.groupby("user")['item']
        .apply(list)
        .reset_index()
        ).item.tolist()

    def sample_not_rated(item_list, rseed=1, n=99):
        np.random.seed(rseed)
        return np.random.choice(np.setdiff1d(all_items, item_list), n)

    print("sampling not rated items...")
    start = time()
    non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
    end = time() - start
    print("sampling took {} min".format(round(end/60, 2)))

    negative = pd.DataFrame({'negative': non_rated_items})
    negative[['item_n'+str(i) for i in range(99)]] =\
        pd.DataFrame(negative.negative.values.tolist(), index=negative.index)
    negative.drop('negative', axis=1, inplace=True)
    negative = negative.stack().reset_index()
    negative = negative.iloc[:, [0, 2]]
    negative.columns = ['user', 'item']
    negative['rating'] = 0
    assert negative.shape[0] == len(non_rated_items)*99
    test_negative = (pd.concat([test, negative])
        .sort_values('user', ascending=True)
        .reset_index(drop=True)
        )
    # Ensuring that the 1st element every 100 is the rated item. This is
    # fundamental for testing
    test_negative.sort_values(['user', 'rating'], ascending=[True, False], inplace=True)
    assert np.all(test_negative.values[0::100][:, 2] != 0)

    # Save
    np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
        test_negative=test_negative.values, negatives=np.array(non_rated_items),
        n_users=n_users, n_items=n_items)

    # Save training set as a sparse matrix
    print("saving training set as sparse matrix...")
    train_mtx = array2mtx(train.values)
    save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)
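# Resulting layout of `test_negative` (consumed by the GMF/MLP evaluation code, which
# splits predictions into chunks of 100): every user contributes exactly 100 rows, the
# first being the held-out positive and the remaining 99 the sampled negatives,
# e.g. (illustrative values):
#
#   user  item  rating
#      0   123       5    <- held-out positive
#      0    42       0    \
#      0   ...       0     | 99 sampled negatives
#      0   777       0    /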
def prepare_amazon(data_path, input_fname):
    df = pd.read_json(data_path/input_fname, lines=True)
    keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
    new_colnames = ['user', 'item', 'timestamp', 'rating']
    df = df[keep_cols]
    df.columns = new_colnames

    # rank of items bought
    df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
    df.drop("timestamp", axis=1, inplace=True)

    # mapping user and item ids to integers
    user_mappings = {k: v for v, k in enumerate(df.user.unique())}
    item_mappings = {k: v for v, k in enumerate(df.item.unique())}
    df['user'] = df['user'].map(user_mappings)
    df['item'] = df['item'].map(item_mappings)
    df = df[['user', 'item', 'rank', 'rating']].astype(np.int64)

    pickle.dump(user_mappings, open(data_path/'user_mappings.p', 'wb'))
    pickle.dump(item_mappings, open(data_path/'item_mappings.p', 'wb'))

    standard_split(df, data_path)
    neuralcf_split(df, data_path)
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="prepare Amazon dataset")
    parser.add_argument("--input_dir", type=str, default=".")
    parser.add_argument("--input_fname", type=str, default="reviews_Movies_and_TV_5.json.gz")
    args = parser.parse_args()

    DATA_PATH = Path(args.input_dir)
    reviews = args.input_fname

    prepare_amazon(DATA_PATH, reviews)
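# Example invocation (illustrative; the script's file name is not shown in this diff,
# prepare_data.py is a hypothetical name):
#
#   python prepare_data.py --input_dir ./data --input_fname reviews_Movies_and_TV_5.json.gz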