Commit cd5aec2c by 20200318029

homework6

parent c9476408
......@@ -5,4 +5,7 @@
__pycache__/
data/
.DS_Store
*.json
\ No newline at end of file
*.json.gz
models/
*.p
*.npz
......@@ -76,6 +76,8 @@ class GMF(nn.Module):
# Task-1: complete the process to compute preds
# preds = '...'
preds = self.out(user_emb * item_emb)
preds = torch.sigmoid(preds)
return preds
......
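For reference, here is a minimal sketch of the full GMF forward pass that Task-1 completes (class and attribute names are assumptions for illustration; the repository's `gmf.py` is authoritative). The sigmoid matters because the targets are implicit 0/1 interactions, so the model is trained with binary cross-entropy:

```python
import torch
import torch.nn as nn

class GMFSketch(nn.Module):
    """GMF: element-wise product of user/item embeddings -> linear -> sigmoid."""
    def __init__(self, n_users, n_items, n_emb=8):
        super().__init__()
        self.emb_user = nn.Embedding(n_users, n_emb)
        self.emb_item = nn.Embedding(n_items, n_emb)
        self.out = nn.Linear(n_emb, 1)

    def forward(self, users, items):
        user_emb = self.emb_user(users)   # (batch, n_emb)
        item_emb = self.emb_item(items)   # (batch, n_emb)
        # Task-1: score the interaction and squash it into (0, 1)
        preds = self.out(user_emb * item_emb)
        preds = torch.sigmoid(preds)
        return preds
```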
......@@ -15,7 +15,7 @@ from torch.utils.data import DataLoader, Dataset
from utils import get_train_instances, get_scores
from gmf import train, evaluate, checkpoint
os.environ["CUDA_VISIBLE_DEVICES"] = '1' # assign GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
......@@ -81,6 +81,7 @@ class MLP(nn.Module):
# Task-2: replace '?' with two proper numbers in the code below
# self.out = nn.Linear(in_features='?', out_features='?')
self.out = nn.Linear(in_features=layers[-1], out_features=1)
for m in self.modules():
if isinstance(m, nn.Embedding):
......
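To see why those two numbers are `layers[-1]` and `1`: the MLP tower's last hidden layer emits a vector of width `layers[-1]`, and the head must map it to a single interaction score. A minimal sketch, assuming illustrative tower widths of `[64, 32, 16, 8]` (not taken from the repository):

```python
import torch
import torch.nn as nn

layers = [64, 32, 16, 8]                      # assumed tower widths
tower = nn.Sequential(*[
    nn.Sequential(nn.Linear(inp, out), nn.ReLU())
    for inp, out in zip(layers[:-1], layers[1:])
])
# Task-2: the head consumes the tower's last activation (layers[-1] = 8
# features) and emits one logit per user-item pair.
out = nn.Linear(in_features=layers[-1], out_features=1)

x = torch.randn(4, layers[0])                 # batch of 4 concatenated embeddings
assert out(tower(x)).shape == (4, 1)
```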
......@@ -15,7 +15,7 @@ from utils import get_train_instances, get_scores
from gmf import GMF, train, evaluate, checkpoint
from mlp import MLP
os.environ["CUDA_VISIBLE_DEVICES"] = '2' # assign GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # assign GPU
def parse_args():
......@@ -120,6 +120,7 @@ class NeuMF(nn.Module):
# Task-3: replace 'tensor1' and 'tensor2' with two proper tensors in the code below
# emb_vector = torch.cat(['tensor1','tensor2'], dim=1)
emb_vector = torch.cat([mf_emb_vector, mlp_emb_vector], dim=1)
preds = torch.sigmoid(self.out(emb_vector))
return preds
......
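A small sketch of what Task-3's concatenation does, with assumed illustrative shapes (factor sizes are not taken from the repository):

```python
import torch

mf_emb_vector = torch.randn(4, 8)    # GMF branch: element-wise product of embeddings
mlp_emb_vector = torch.randn(4, 8)   # MLP branch: last hidden-layer activation
# Task-3: fuse the two branches along the feature dimension, so the final
# linear layer sees both the GMF and the MLP representations.
emb_vector = torch.cat([mf_emb_vector, mlp_emb_vector], dim=1)
assert emb_vector.shape == (4, 16)
```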
......@@ -23,121 +23,121 @@ def array2mtx(interactions):
def standard_split(df, data_path):
    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    n_ranks = df['rank'].nunique()
    train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)
    # Save
    np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
        n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())
def neuralcf_split(df, data_path):
    # Xiangnan He et al., 2017 train/test split with implicit negative feedback:
    # sort by rank
    dfc = df.copy()
    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    dfc.sort_values(['user','rank'], ascending=[True,True], inplace=True)
    dfc.reset_index(inplace=True, drop=True)
    # use the last rating for testing and all the previous ones for training
    test = dfc.groupby('user').tail(1)
    train = pd.merge(dfc, test, on=['user','item'],
        how='outer', suffixes=('', '_y'))
    train = train[train.rating_y.isnull()]
    test = test[['user','item','rating']]
    train = train[['user','item','rating']]
    # select 99 random movies per user that were never rated by that user
    all_items = dfc.item.unique()
    rated_items = (dfc.groupby("user")['item']
        .apply(list)
        .reset_index()
        ).item.tolist()

    def sample_not_rated(item_list, rseed=1, n=99):
        np.random.seed(rseed)
        return np.random.choice(np.setdiff1d(all_items, item_list), n)

    print("sampling not rated items...")
    start = time()
    non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
    end = time() - start
    print("sampling took {} min".format(round(end/60, 2)))
    negative = pd.DataFrame({'negative': non_rated_items})
    negative[['item_n'+str(i) for i in range(99)]] =\
        pd.DataFrame(negative.negative.values.tolist(), index=negative.index)
    negative.drop('negative', axis=1, inplace=True)
    negative = negative.stack().reset_index()
    negative = negative.iloc[:, [0,2]]
    negative.columns = ['user','item']
    negative['rating'] = 0
    assert negative.shape[0] == len(non_rated_items)*99
    test_negative = (pd.concat([test, negative])
        .sort_values('user', ascending=True)
        .reset_index(drop=True)
        )
    # Ensure that the 1st element of every 100 is the rated item. This is
    # fundamental for testing
    test_negative.sort_values(['user', 'rating'], ascending=[True, False], inplace=True)
    assert np.all(test_negative.values[0::100][:, 2] != 0)
    # Save
    np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
        test_negative=test_negative.values, negatives=np.array(non_rated_items),
        n_users=n_users, n_items=n_items)
    # Save the training set as a sparse matrix
    print("saving training set as sparse matrix...")
    train_mtx = array2mtx(train.values)
    save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)
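The "rated item first in every block of 100" layout is what makes evaluation cheap: each user is scored on 1 positive plus 99 sampled negatives, and HR@10 / NDCG@10 reduce to the positive's rank within its block. A minimal sketch of that computation (the repository's actual logic lives in `utils.get_scores`; this is only an illustration under the assumed layout):

```python
import numpy as np

def hr_ndcg_at_k(scores, k=10):
    """scores: (n_users, 100) array; column 0 holds each user's rated item."""
    hrs, ndcgs = [], []
    for row in scores:
        rank = int((row > row[0]).sum())          # 0 = positive ranked first
        hrs.append(1.0 if rank < k else 0.0)      # hit if positive is in the top-k
        ndcgs.append(1.0 / np.log2(rank + 2) if rank < k else 0.0)
    return float(np.mean(hrs)), float(np.mean(ndcgs))
```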
def prepare_amazon(data_path, input_fname):
    df = pd.read_json(data_path/input_fname, lines=True)
    keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
    new_colnames = ['user', 'item', 'timestamp', 'rating']
    df = df[keep_cols]
    df.columns = new_colnames
    # rank of items bought
    df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
    df.drop("timestamp", axis=1, inplace=True)
    # map user and item ids to integers
    user_mappings = {k: v for v, k in enumerate(df.user.unique())}
    item_mappings = {k: v for v, k in enumerate(df.item.unique())}
    df['user'] = df['user'].map(user_mappings)
    df['item'] = df['item'].map(item_mappings)
    df = df[['user', 'item', 'rank', 'rating']].astype(np.int64)
    pickle.dump(user_mappings, open(data_path/'user_mappings.p', 'wb'))
    pickle.dump(item_mappings, open(data_path/'item_mappings.p', 'wb'))
    standard_split(df, data_path)
    neuralcf_split(df, data_path)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="prepare Amazon dataset")
    parser.add_argument("--input_dir", type=str, default=".")
    parser.add_argument("--input_fname", type=str, default="reviews_Movies_and_TV_5.json.gz")
    args = parser.parse_args()
    DATA_PATH = Path(args.input_dir)
    reviews = args.input_fname
    prepare_amazon(DATA_PATH, reviews)
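Assuming the script above is saved as `prepare_data.py` (the commit view does not show the filename), the splits can be generated with:

```sh
python prepare_data.py --input_dir data --input_fname reviews_Movies_and_TV_5.json.gz
```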
## 1. gmf
```sh
python gmf.py
```
Epoch: 1 262.31s, LOSS = 0.5026, HR = 0.1032, NDCG = 0.0475, validated in 153.72s
Epoch: 2 260.43s, LOSS = 0.4978, HR = 0.2571, NDCG = 0.1365, validated in 153.29s
Epoch: 3 259.84s, LOSS = 0.4085, HR = 0.5034, NDCG = 0.3083, validated in 152.75s
Epoch: 4 259.75s, LOSS = 0.3646, HR = 0.5244, NDCG = 0.3200, validated in 152.57s
Epoch: 5 260.03s, LOSS = 0.3398, HR = 0.5335, NDCG = 0.3260, validated in 153.25s
Epoch: 6 259.52s, LOSS = 0.3151, HR = 0.5613, NDCG = 0.3494, validated in 152.72s
Epoch: 7 259.14s, LOSS = 0.2942, HR = 0.5828, NDCG = 0.3666, validated in 152.93s
Epoch: 8 259.48s, LOSS = 0.2780, HR = 0.5941, NDCG = 0.3766, validated in 152.79s
Epoch: 9 259.25s, LOSS = 0.2656, HR = 0.6149, NDCG = 0.3928, validated in 152.74s
Epoch: 10 259.59s, LOSS = 0.2564, HR = 0.6297, NDCG = 0.4014, validated in 152.70s
Epoch: 11 258.89s, LOSS = 0.2489, HR = 0.6407, NDCG = 0.4122, validated in 152.90s
Epoch: 12 259.50s, LOSS = 0.2430, HR = 0.6533, NDCG = 0.4193, validated in 152.77s
Epoch: 13 259.17s, LOSS = 0.2380, HR = 0.6564, NDCG = 0.4272, validated in 152.98s
Epoch: 14 259.66s, LOSS = 0.2340, HR = 0.6651, NDCG = 0.4315, validated in 153.17s
Epoch: 15 259.46s, LOSS = 0.2307, HR = 0.6707, NDCG = 0.4370, validated in 152.97s
Epoch: 16 259.71s, LOSS = 0.2275, HR = 0.6770, NDCG = 0.4404, validated in 152.91s
Epoch: 17 258.68s, LOSS = 0.2250, HR = 0.6843, NDCG = 0.4477, validated in 153.00s
Epoch: 18 259.66s, LOSS = 0.2227, HR = 0.6832, NDCG = 0.4486, validated in 153.61s
Epoch: 19 260.77s, LOSS = 0.2207, HR = 0.6868, NDCG = 0.4464, validated in 154.02s
Epoch: 20 260.41s, LOSS = 0.2190, HR = 0.6865, NDCG = 0.4457, validated in 152.90s
End. Best Iteration 19: HR = 0.6868, NDCG = 0.4464.
The best GMF model is saved to ./models\GMF_bs_1024_lr_001_n_emb_8_lrnr_adam_lrs_wolrs.pt
## 2. mlp
```sh
python mlp.py
```
Epoch: 1 280.25s, LOSS = 0.3882, HR = 0.5336, NDCG = 0.3288, validated in 158.24s
Epoch: 2 281.01s, LOSS = 0.3617, HR = 0.5443, NDCG = 0.3328, validated in 158.10s
Epoch: 3 281.09s, LOSS = 0.3460, HR = 0.5555, NDCG = 0.3412, validated in 158.71s
Epoch: 4 280.95s, LOSS = 0.3286, HR = 0.5663, NDCG = 0.3546, validated in 158.17s
Epoch: 5 280.30s, LOSS = 0.3099, HR = 0.5758, NDCG = 0.3649, validated in 156.86s
Epoch: 6 296.81s, LOSS = 0.2927, HR = 0.5834, NDCG = 0.3747, validated in 157.70s
Epoch: 7 281.39s, LOSS = 0.2769, HR = 0.5877, NDCG = 0.3719, validated in 157.40s
Epoch: 8 281.28s, LOSS = 0.2629, HR = 0.5885, NDCG = 0.3789, validated in 156.90s
Epoch: 9 280.25s, LOSS = 0.2502, HR = 0.5923, NDCG = 0.3811, validated in 157.00s
Epoch: 10 280.88s, LOSS = 0.2395, HR = 0.6002, NDCG = 0.3915, validated in 156.67s
Epoch: 11 279.24s, LOSS = 0.2305, HR = 0.6000, NDCG = 0.3959, validated in 157.01s
Epoch: 12 280.54s, LOSS = 0.2224, HR = 0.6062, NDCG = 0.4018, validated in 159.55s
Epoch: 13 298.20s, LOSS = 0.2154, HR = 0.6044, NDCG = 0.3970, validated in 157.81s
Epoch: 14 296.25s, LOSS = 0.2091, HR = 0.6030, NDCG = 0.3994, validated in 157.01s
Epoch: 15 296.10s, LOSS = 0.2037, HR = 0.5994, NDCG = 0.3991, validated in 157.08s
Epoch: 16 295.99s, LOSS = 0.1988, HR = 0.6101, NDCG = 0.4035, validated in 157.52s
Epoch: 17 296.45s, LOSS = 0.1946, HR = 0.5959, NDCG = 0.3957, validated in 156.45s
Epoch: 18 295.52s, LOSS = 0.1908, HR = 0.6023, NDCG = 0.4023, validated in 156.69s
Epoch: 19 296.37s, LOSS = 0.1874, HR = 0.6086, NDCG = 0.4045, validated in 156.69s
Epoch: 20 295.67s, LOSS = 0.1842, HR = 0.5976, NDCG = 0.3979, validated in 157.77s
End. Best Iteration 16: HR = 0.6101, NDCG = 0.4035.
The best MLP model is saved to ./models\MLP_bs_1024_reg_00_lr_001_n_emb_32_ll_8_dp_wodp_lrnr_adam_lrs_wolrs.pt
## 3. neumf
```sh
python neumf.py
```
Epoch: 1 170.91s, LOSS = 0.4144, HR = 0.5345, NDCG = 0.3307, validated in 139.97s
Epoch: 2 168.71s, LOSS = 0.3779, HR = 0.5356, NDCG = 0.3307, validated in 142.42s
Epoch: 3 170.15s, LOSS = 0.3680, HR = 0.5377, NDCG = 0.3316, validated in 139.07s
Epoch: 4 168.91s, LOSS = 0.3612, HR = 0.5372, NDCG = 0.3317, validated in 139.91s
Epoch: 5 165.80s, LOSS = 0.3558, HR = 0.5384, NDCG = 0.3315, validated in 142.21s
Epoch: 6 167.40s, LOSS = 0.3513, HR = 0.5380, NDCG = 0.3291, validated in 143.98s
Epoch: 7 168.12s, LOSS = 0.3464, HR = 0.5378, NDCG = 0.3292, validated in 141.11s
Epoch: 8 167.75s, LOSS = 0.3412, HR = 0.5403, NDCG = 0.3280, validated in 140.05s
Epoch: 9 168.61s, LOSS = 0.3361, HR = 0.5363, NDCG = 0.3254, validated in 139.48s
Epoch: 10 167.73s, LOSS = 0.3308, HR = 0.5338, NDCG = 0.3225, validated in 138.01s
Epoch: 11 169.24s, LOSS = 0.3249, HR = 0.5286, NDCG = 0.3182, validated in 137.74s
Epoch: 12 170.76s, LOSS = 0.3189, HR = 0.5295, NDCG = 0.3182, validated in 138.17s
Epoch: 13 167.33s, LOSS = 0.3125, HR = 0.5294, NDCG = 0.3185, validated in 140.21s
Epoch: 14 168.14s, LOSS = 0.3057, HR = 0.5198, NDCG = 0.3111, validated in 140.29s
Epoch: 15 169.59s, LOSS = 0.2985, HR = 0.5278, NDCG = 0.3193, validated in 138.76s
Epoch: 16 169.99s, LOSS = 0.2914, HR = 0.5207, NDCG = 0.3136, validated in 139.12s
Epoch: 17 167.52s, LOSS = 0.2839, HR = 0.5235, NDCG = 0.3161, validated in 139.70s
Epoch: 18 170.53s, LOSS = 0.2763, HR = 0.5184, NDCG = 0.3105, validated in 137.88s
Epoch: 19 169.11s, LOSS = 0.2692, HR = 0.5240, NDCG = 0.3172, validated in 140.03s
Epoch: 20 165.96s, LOSS = 0.2620, HR = 0.5169, NDCG = 0.3116, validated in 139.67s
End. Best Iteration 8: HR = 0.5403, NDCG = 0.3280.
The best NeuMF model is saved to ./models\NeuMF_wopret_trainable_adam_lrs_wolrs.pt
\ No newline at end of file