# prepare_data.py
import numpy as np
import pandas as pd
import pickle
import argparse
import scipy.sparse as sp

from time import time
from pathlib import Path
from scipy.sparse import save_npz
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split


def array2mtx(interactions):
    """Build a CSR user-item matrix from an array of (user, item, rating) rows."""
    num_users = interactions[:,0].max()
    num_items = interactions[:,1].max()
    mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
    for user, item, rating in interactions:
        mat[user, item] = rating
    return mat.tocsr()


def standard_split(df, data_path):
    """Random 80/20 train/test split; saves the splits and cardinalities to disk."""

    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()
    n_ranks = df['rank'].nunique()
    train, test = train_test_split(df.values.astype(np.int64), test_size=0.2, random_state=1)

    # Save
    np.savez(data_path/"standard_split.npz", train=train, test=test, n_users=n_users,
        n_items=n_items, n_ranks=n_ranks, columns=df.columns.tolist())


def neuralcf_split(df, data_path):
    """Train/test split with implicit negative feedback, following Xiangnan He
    et al., 2017: each user's last interaction is held out for testing and 99
    items the user never rated are sampled as negatives.
    """
    dfc = df.copy()

    # Cardinality
    n_users = df.user.nunique()
    n_items = df.item.nunique()

    # sort by user and, within each user, by rank (chronological order)
    dfc.sort_values(['user','rank'], ascending=[True,True], inplace=True)
    dfc.reset_index(inplace=True, drop=True)

    # use last ratings for testing and all the previous for training
    test = dfc.groupby('user').tail(1)
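    # anti-join: after the outer merge, rows with a null rating_y are the
    # interactions that are not in the test set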
    train = pd.merge(dfc, test, on=['user','item'],
        how='outer', suffixes=('', '_y'))
    train = train[train.rating_y.isnull()]
    test = test[['user','item','rating']]
    train = train[['user','item','rating']]

    # select 99 random movies per user that were never rated by that user
    all_items = dfc.item.unique()
    rated_items = (dfc.groupby("user")['item']
        .apply(list)
        .reset_index()
        ).item.tolist()
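    # rated_items[u] is the list of items rated by user u (groupby sorts keys,
    # and user ids are the contiguous integers 0..n_users-1)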

    def sample_not_rated(item_list, rseed=1, n=99):
        np.random.seed(rseed)
        # sample n items this user never rated, without replacement
        return np.random.choice(np.setdiff1d(all_items, item_list), n, replace=False)

    print("sampling not rated items...")
    start = time()
    non_rated_items = Parallel(n_jobs=4)(delayed(sample_not_rated)(ri) for ri in rated_items)
    runtime = time() - start
    print("sampling took {} min".format(round(runtime/60, 2)))

    # expand each user's list of 99 sampled negatives into columns, then
    # stack back to long (user, item) format
    negative = pd.DataFrame({'negative':non_rated_items})
    negative[['item_n'+str(i) for i in range(99)]] =\
        pd.DataFrame(negative.negative.values.tolist(), index=negative.index)
    negative.drop('negative', axis=1, inplace=True)
    negative = negative.stack().reset_index()
    negative = negative.iloc[:, [0,2]]
    negative.columns = ['user','item']
    negative['rating'] = 0
    assert negative.shape[0] == len(non_rated_items)*99
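    # append each user's held-out positive to their 99 negatives (100 rows per user)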
    test_negative = (pd.concat([test,negative])
        .sort_values('user', ascending=True)
        .reset_index(drop=True)
        )
    # Ensuring that the 1st element every 100 is the rated item. This is
    # fundamental for testing
    test_negative.sort_values(['user', 'rating'], ascending=[True,False], inplace=True)
    assert np.all(test_negative.values[0::100][:,2] != 0)

    # Save
    np.savez(data_path/"neuralcf_split.npz", train=train.values, test=test.values,
        test_negative=test_negative.values, negatives=np.array(non_rated_items),
        n_users=n_users, n_items=n_items)

    # Save training as sparse matrix
    print("saving training set as sparse matrix...")
    train_mtx = array2mtx(train.values)
    save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)


def prepare_amazon(data_path, input_fname):
    """Read the raw reviews, map ids to integers and generate both splits."""

    # pandas infers gzip compression from the ".gz" extension
    df = pd.read_json(data_path/input_fname, lines=True)

    keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
    new_colnames = ['user', 'item', 'timestamp', 'rating']
    df = df[keep_cols]
    df.columns = new_colnames

    # per-user rank of items bought, in chronological order ('dense' assigns
    # consecutive ranks, with ties sharing the same rank)
    df['rank'] = df.groupby("user")["timestamp"].rank(ascending=True, method='dense')
    df.drop("timestamp", axis=1, inplace=True)

    # mapping user and item ids to integers
    user_mappings = {k:v for v,k in enumerate(df.user.unique())}
    item_mappings = {k:v for v,k in enumerate(df.item.unique())}
    df['user'] = df['user'].map(user_mappings)
    df['item'] = df['item'].map(item_mappings)
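    # pandas' rank() returns floats, so cast everything to int64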
    df = df[['user','item','rank','rating']].astype(np.int64)

    with open(data_path/'user_mappings.p', 'wb') as f:
        pickle.dump(user_mappings, f)
    with open(data_path/'item_mappings.p', 'wb') as f:
        pickle.dump(item_mappings, f)

    standard_split(df, data_path)
    neuralcf_split(df, data_path)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="prepare Amazon dataset")

    parser.add_argument("--input_dir",type=str, default=".")
    parser.add_argument("--input_fname",type=str, default="reviews_Movies_and_TV_5.json.gz")
    args = parser.parse_args()

    DATA_PATH = Path(args.input_dir)
    reviews = args.input_fname

    prepare_amazon(DATA_PATH, reviews)
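
# Example usage, with the defaults above:
#   python prepare_data.py --input_dir . --input_fname reviews_Movies_and_TV_5.json.gz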