Commit cd5aec2c by 20200318029

homework6

parent c9476408
......@@ -5,4 +5,7 @@
__pycache__/
data/
.DS_Store
*.json
\ No newline at end of file
*.json.gz
models/
*.p
*.npz
......@@ -76,6 +76,8 @@ class GMF(nn.Module):
# Task-1: complete the process to compute preds
# preds = '...'
preds = self.out(user_emb * item_emb)
preds = torch.sigmoid(preds)
return preds
......
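For reference, here is a minimal sketch of the full GMF forward pass that Task-1 completes (class and attribute names are assumptions for illustration; the repository's `gmf.py` is authoritative). The sigmoid matters because the targets are implicit 0/1 interactions, so the model is trained with binary cross-entropy:

```python
import torch
import torch.nn as nn

class GMFSketch(nn.Module):
    """GMF: element-wise product of user/item embeddings -> linear -> sigmoid."""
    def __init__(self, n_users, n_items, n_emb=8):
        super().__init__()
        self.emb_user = nn.Embedding(n_users, n_emb)
        self.emb_item = nn.Embedding(n_items, n_emb)
        self.out = nn.Linear(n_emb, 1)

    def forward(self, users, items):
        user_emb = self.emb_user(users)   # (batch, n_emb)
        item_emb = self.emb_item(items)   # (batch, n_emb)
        # Task-1: score the interaction and squash it into (0, 1)
        preds = self.out(user_emb * item_emb)
        preds = torch.sigmoid(preds)
        return preds
```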
......@@ -15,7 +15,7 @@ from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import train, evaluate, checkpoint
os.environ["CUDA_VISIBLE_DEVICES"] = '1' # assign GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
......@@ -81,6 +81,7 @@ class MLP(nn.Module):
# Task-2: replace '?' with two proper numbers in the code below
# self.out = nn.Linear(in_features='?', out_features='?')
self.out = nn.Linear(in_features=layers[-1], out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
......
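To see why those two numbers are `layers[-1]` and `1`: the MLP tower's last hidden layer emits a vector of width `layers[-1]`, and the head must map it to a single interaction score. A minimal sketch, assuming illustrative tower widths of `[64, 32, 16, 8]` (not taken from the repository):

```python
import torch
import torch.nn as nn

layers = [64, 32, 16, 8]                      # assumed tower widths
tower = nn.Sequential(*[
    nn.Sequential(nn.Linear(inp, out), nn.ReLU())
    for inp, out in zip(layers[:-1], layers[1:])
])
# Task-2: the head consumes the tower's last activation (layers[-1] = 8
# features) and emits one logit per user-item pair.
out = nn.Linear(in_features=layers[-1], out_features=1)

x = torch.randn(4, layers[0])                 # batch of 4 concatenated embeddings
assert out(tower(x)).shape == (4, 1)
```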
......@@ -15,7 +15,7 @@ from utils import get_train_instances, get_scores
from gmf import GMF, train, evaluate, checkpoint
from mlp import MLP
os.environ["CUDA_VISIBLE_DEVICES"] = '2' # assign GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
......@@ -120,6 +120,7 @@ class NeuMF(nn.Module):
# Task-3: replace 'tensor1' and 'tensor2' with two proper tensors in the code below
# emb_vector = torch.cat(['tensor1','tensor2'], dim=1)
emb_vector = torch.cat([mf_emb_vector, mlp_emb_vector], dim=1)
preds = torch.sigmoid(self.out(emb_vector))
return preds
......
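A small sketch of what Task-3's concatenation does, with assumed illustrative shapes (factor sizes are not taken from the repository):

```python
import torch

mf_emb_vector = torch.randn(4, 8)    # GMF branch: element-wise product of embeddings
mlp_emb_vector = torch.randn(4, 8)   # MLP branch: last hidden-layer activation
# Task-3: fuse the two branches along the feature dimension, so the final
# linear layer sees both the GMF and the MLP representations.
emb_vector = torch.cat([mf_emb_vector, mlp_emb_vector], dim=1)
assert emb_vector.shape == (4, 16)
```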
......@@ -23,121 +23,121 @@ def array2mtx(interactions):
def standard_split(df, data_path):
    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    n_ranks = df['rank'].nunique()
    train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)
    # Save
    np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
        n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())
def neuralcf_split(df, data_path):
    # Xiangnan He et al., 2017 train/test split with implicit negative feedback:
    # sort by rank
    dfc = df.copy()
    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    dfc.sort_values(['user','rank'], ascending=[True,True], inplace=True)
    dfc.reset_index(inplace=True, drop=True)
    # use the last rating for testing and all the previous ones for training
    test = dfc.groupby('user').tail(1)
    train = pd.merge(dfc, test, on=['user','item'],
        how='outer', suffixes=('', '_y'))
    train = train[train.rating_y.isnull()]
    test = test[['user','item','rating']]
    train = train[['user','item','rating']]
    # select 99 random movies per user that were never rated by that user
    all_items = dfc.item.unique()
    rated_items = (dfc.groupby("user")['item']
        .apply(list)
        .reset_index()
        ).item.tolist()

    def sample_not_rated(item_list, rseed=1, n=99):
        np.random.seed(rseed)
        return np.random.choice(np.setdiff1d(all_items, item_list), n)

    print("sampling not rated items...")
    start = time()
    non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
    end = time() - start
    print("sampling took {} min".format(round(end/60, 2)))
    negative = pd.DataFrame({'negative': non_rated_items})
    negative[['item_n'+str(i) for i in range(99)]] =\
        pd.DataFrame(negative.negative.values.tolist(), index=negative.index)
    negative.drop('negative', axis=1, inplace=True)
    negative = negative.stack().reset_index()
    negative = negative.iloc[:, [0,2]]
    negative.columns = ['user','item']
    negative['rating'] = 0
    assert negative.shape[0] == len(non_rated_items)*99
    test_negative = (pd.concat([test, negative])
        .sort_values('user', ascending=True)
        .reset_index(drop=True)
        )
    # Ensure that the 1st element of every 100 is the rated item. This is
    # fundamental for testing
    test_negative.sort_values(['user', 'rating'], ascending=[True, False], inplace=True)
    assert np.all(test_negative.values[0::100][:, 2] != 0)
    # Save
    np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
        test_negative=test_negative.values, negatives=np.array(non_rated_items),
        n_users=n_users, n_items=n_items)
    # Save the training set as a sparse matrix
    print("saving training set as sparse matrix...")
    train_mtx = array2mtx(train.values)
    save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)
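The "rated item first in every block of 100" layout is what makes evaluation cheap: each user is scored on 1 positive plus 99 sampled negatives, and HR@10 / NDCG@10 reduce to the positive's rank within its block. A minimal sketch of that computation (the repository's actual logic lives in `utils.get_scores`; this is only an illustration under the assumed layout):

```python
import numpy as np

def hr_ndcg_at_k(scores, k=10):
    """scores: (n_users, 100) array; column 0 holds each user's rated item."""
    hrs, ndcgs = [], []
    for row in scores:
        rank = int((row > row[0]).sum())          # 0 = positive ranked first
        hrs.append(1.0 if rank < k else 0.0)      # hit if positive is in the top-k
        ndcgs.append(1.0 / np.log2(rank + 2) if rank < k else 0.0)
    return float(np.mean(hrs)), float(np.mean(ndcgs))
```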
def prepare_amazon(data_path, input_fname):
    df = pd.read_json(data_path/input_fname, lines=True)
    keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
    new_colnames = ['user', 'item', 'timestamp', 'rating']
    df = df[keep_cols]
    df.columns = new_colnames
    # rank of items bought
    df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
    df.drop("timestamp", axis=1, inplace=True)
    # map user and item ids to integers
    user_mappings = {k: v for v, k in enumerate(df.user.unique())}
    item_mappings = {k: v for v, k in enumerate(df.item.unique())}
    df['user'] = df['user'].map(user_mappings)
    df['item'] = df['item'].map(item_mappings)
    df = df[['user', 'item', 'rank', 'rating']].astype(np.int64)
    pickle.dump(user_mappings, open(data_path/'user_mappings.p', 'wb'))
    pickle.dump(item_mappings, open(data_path/'item_mappings.p', 'wb'))
    standard_split(df, data_path)
    neuralcf_split(df, data_path)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="prepare Amazon dataset")
    parser.add_argument("--input_dir", type=str, default=".")
    parser.add_argument("--input_fname", type=str, default="reviews_Movies_and_TV_5.json.gz")
    args = parser.parse_args()
    DATA_PATH = Path(args.input_dir)
    reviews = args.input_fname
    prepare_amazon(DATA_PATH, reviews)
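Assuming the script above is saved as `prepare_data.py` (the commit view does not show the filename), the splits can be generated with:

```sh
python prepare_data.py --input_dir data --input_fname reviews_Movies_and_TV_5.json.gz
```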
## 1. gmf
```sh
python gmf.py
```
Epoch: 1 262.31s, LOSS = 0.5026, HR = 0.1032, NDCG = 0.0475, validated in 153.72s
Epoch: 2 260.43s, LOSS = 0.4978, HR = 0.2571, NDCG = 0.1365, validated in 153.29s
Epoch: 3 259.84s, LOSS = 0.4085, HR = 0.5034, NDCG = 0.3083, validated in 152.75s
Epoch: 4 259.75s, LOSS = 0.3646, HR = 0.5244, NDCG = 0.3200, validated in 152.57s
Epoch: 5 260.03s, LOSS = 0.3398, HR = 0.5335, NDCG = 0.3260, validated in 153.25s
Epoch: 6 259.52s, LOSS = 0.3151, HR = 0.5613, NDCG = 0.3494, validated in 152.72s
Epoch: 7 259.14s, LOSS = 0.2942, HR = 0.5828, NDCG = 0.3666, validated in 152.93s
Epoch: 8 259.48s, LOSS = 0.2780, HR = 0.5941, NDCG = 0.3766, validated in 152.79s
Epoch: 9 259.25s, LOSS = 0.2656, HR = 0.6149, NDCG = 0.3928, validated in 152.74s
Epoch: 10 259.59s, LOSS = 0.2564, HR = 0.6297, NDCG = 0.4014, validated in 152.70s
Epoch: 11 258.89s, LOSS = 0.2489, HR = 0.6407, NDCG = 0.4122, validated in 152.90s
Epoch: 12 259.50s, LOSS = 0.2430, HR = 0.6533, NDCG = 0.4193, validated in 152.77s
Epoch: 13 259.17s, LOSS = 0.2380, HR = 0.6564, NDCG = 0.4272, validated in 152.98s
Epoch: 14 259.66s, LOSS = 0.2340, HR = 0.6651, NDCG = 0.4315, validated in 153.17s
Epoch: 15 259.46s, LOSS = 0.2307, HR = 0.6707, NDCG = 0.4370, validated in 152.97s
Epoch: 16 259.71s, LOSS = 0.2275, HR = 0.6770, NDCG = 0.4404, validated in 152.91s
Epoch: 17 258.68s, LOSS = 0.2250, HR = 0.6843, NDCG = 0.4477, validated in 153.00s
Epoch: 18 259.66s, LOSS = 0.2227, HR = 0.6832, NDCG = 0.4486, validated in 153.61s
Epoch: 19 260.77s, LOSS = 0.2207, HR = 0.6868, NDCG = 0.4464, validated in 154.02s
Epoch: 20 260.41s, LOSS = 0.2190, HR = 0.6865, NDCG = 0.4457, validated in 152.90s
End. Best Iteration 19: HR = 0.6868, NDCG = 0.4464.
The best GMF model is saved to ./models\GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt
## 2. mlp
```sh
python mlp.py
```
Epoch: 1 280.25s, LOSS = 0.3882, HR = 0.5336, NDCG = 0.3288, validated in 158.24s
Epoch: 2 281.01s, LOSS = 0.3617, HR = 0.5443, NDCG = 0.3328, validated in 158.10s
Epoch: 3 281.09s, LOSS = 0.3460, HR = 0.5555, NDCG = 0.3412, validated in 158.71s
Epoch: 4 280.95s, LOSS = 0.3286, HR = 0.5663, NDCG = 0.3546, validated in 158.17s
Epoch: 5 280.30s, LOSS = 0.3099, HR = 0.5758, NDCG = 0.3649, validated in 156.86s
Epoch: 6 296.81s, LOSS = 0.2927, HR = 0.5834, NDCG = 0.3747, validated in 157.70s
Epoch: 7 281.39s, LOSS = 0.2769, HR = 0.5877, NDCG = 0.3719, validated in 157.40s
Epoch: 8 281.28s, LOSS = 0.2629, HR = 0.5885, NDCG = 0.3789, validated in 156.90s
Epoch: 9 280.25s, LOSS = 0.2502, HR = 0.5923, NDCG = 0.3811, validated in 157.00s
Epoch: 10 280.88s, LOSS = 0.2395, HR = 0.6002, NDCG = 0.3915, validated in 156.67s
Epoch: 11 279.24s, LOSS = 0.2305, HR = 0.6000, NDCG = 0.3959, validated in 157.01s
Epoch: 12 280.54s, LOSS = 0.2224, HR = 0.6062, NDCG = 0.4018, validated in 159.55s
Epoch: 13 298.20s, LOSS = 0.2154, HR = 0.6044, NDCG = 0.3970, validated in 157.81s
Epoch: 14 296.25s, LOSS = 0.2091, HR = 0.6030, NDCG = 0.3994, validated in 157.01s
Epoch: 15 296.10s, LOSS = 0.2037, HR = 0.5994, NDCG = 0.3991, validated in 157.08s
Epoch: 16 295.99s, LOSS = 0.1988, HR = 0.6101, NDCG = 0.4035, validated in 157.52s
Epoch: 17 296.45s, LOSS = 0.1946, HR = 0.5959, NDCG = 0.3957, validated in 156.45s
Epoch: 18 295.52s, LOSS = 0.1908, HR = 0.6023, NDCG = 0.4023, validated in 156.69s
Epoch: 19 296.37s, LOSS = 0.1874, HR = 0.6086, NDCG = 0.4045, validated in 156.69s
Epoch: 20 295.67s, LOSS = 0.1842, HR = 0.5976, NDCG = 0.3979, validated in 157.77s
End. Best Iteration 16: HR = 0.6101, NDCG = 0.4035.
The best MLP model is saved to ./models\MLP_bs_1024_reg_00_lr_001_n_emb_32_ll_8_dp_wodp_lrnr_adam_lrs_wolrs.pt
## 3. neumf
```sh
python neumf.py
```
Epoch: 1 170.91s, LOSS = 0.4144, HR = 0.5345, NDCG = 0.3307, validated in 139.97s
Epoch: 2 168.71s, LOSS = 0.3779, HR = 0.5356, NDCG = 0.3307, validated in 142.42s
Epoch: 3 170.15s, LOSS = 0.3680, HR = 0.5377, NDCG = 0.3316, validated in 139.07s
Epoch: 4 168.91s, LOSS = 0.3612, HR = 0.5372, NDCG = 0.3317, validated in 139.91s
Epoch: 5 165.80s, LOSS = 0.3558, HR = 0.5384, NDCG = 0.3315, validated in 142.21s
Epoch: 6 167.40s, LOSS = 0.3513, HR = 0.5380, NDCG = 0.3291, validated in 143.98s
Epoch: 7 168.12s, LOSS = 0.3464, HR = 0.5378, NDCG = 0.3292, validated in 141.11s
Epoch: 8 167.75s, LOSS = 0.3412, HR = 0.5403, NDCG = 0.3280, validated in 140.05s
Epoch: 9 168.61s, LOSS = 0.3361, HR = 0.5363, NDCG = 0.3254, validated in 139.48s
Epoch: 10 167.73s, LOSS = 0.3308, HR = 0.5338, NDCG = 0.3225, validated in 138.01s
Epoch: 11 169.24s, LOSS = 0.3249, HR = 0.5286, NDCG = 0.3182, validated in 137.74s
Epoch: 12 170.76s, LOSS = 0.3189, HR = 0.5295, NDCG = 0.3182, validated in 138.17s
Epoch: 13 167.33s, LOSS = 0.3125, HR = 0.5294, NDCG = 0.3185, validated in 140.21s
Epoch: 14 168.14s, LOSS = 0.3057, HR = 0.5198, NDCG = 0.3111, validated in 140.29s
Epoch: 15 169.59s, LOSS = 0.2985, HR = 0.5278, NDCG = 0.3193, validated in 138.76s
Epoch: 16 169.99s, LOSS = 0.2914, HR = 0.5207, NDCG = 0.3136, validated in 139.12s
Epoch: 17 167.52s, LOSS = 0.2839, HR = 0.5235, NDCG = 0.3161, validated in 139.70s
Epoch: 18 170.53s, LOSS = 0.2763, HR = 0.5184, NDCG = 0.3105, validated in 137.88s
Epoch: 19 169.11s, LOSS = 0.2692, HR = 0.5240, NDCG = 0.3172, validated in 140.03s
Epoch: 20 165.96s, LOSS = 0.2620, HR = 0.5169, NDCG = 0.3116, validated in 139.67s
End. Best Iteration 8: HR = 0.5403, NDCG = 0.3280.
The best NeuMF model is saved to ./models\NeuMF_wopret_trainable_adam_lrs_wolrs.pt
\ No newline at end of file