Commit a14e52dc by 20200801011

project2 commit

parent 3d717334
{
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 训练"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T11:16:41.915123Z",
"start_time": "2020-08-23T11:13:25.882168Z"
},
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\jerry\\anaconda3\\envs\\tf\\lib\\site-packages\\ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\jerry\\anaconda3\\envs\\tf\\lib\\site-packages\\ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"num_user: 943 num_movie: 1682\n",
"train: 90000 test: 10000\n",
"__________________________________________________________________________________________________\n",
"Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
"input_1 (InputLayer) (None, None) 0 \n",
"__________________________________________________________________________________________________\n",
"input_2 (InputLayer) (None, None) 0 \n",
"__________________________________________________________________________________________________\n",
"embedding_1 (Embedding) (None, 1, 4) 3776 input_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"embedding_2 (Embedding) (None, 1, 4) 6732 input_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"reshape_1 (Reshape) (None, 4) 0 embedding_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"reshape_2 (Reshape) (None, 4) 0 embedding_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"dot_1 (Dot) (None, 1) 0 reshape_1[0][0] \n",
" reshape_2[0][0] \n",
"==================================================================================================\n",
"Total params: 10,508\n",
"Trainable params: 10,508\n",
"Non-trainable params: 0\n",
"__________________________________________________________________________________________________\n",
"Epoch 1/30\n",
"90000/90000 [==============================] - 1s 9us/step - loss: 13.1581 - acc: 2.8889e-04\n",
"Epoch 2/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 7.6275 - acc: 0.0553\n",
"Epoch 3/30\n",
"90000/90000 [==============================] - 1s 8us/step - loss: 3.0855 - acc: 0.2012\n",
"Epoch 4/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 1.7874 - acc: 0.3039\n",
"Epoch 5/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 1.3459 - acc: 0.3516\n",
"Epoch 6/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 1.1382 - acc: 0.3794\n",
"Epoch 7/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 1.0283 - acc: 0.3940\n",
"Epoch 8/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.9659 - acc: 0.4039\n",
"Epoch 9/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.9283 - acc: 0.4104\n",
"Epoch 10/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.9049 - acc: 0.4151\n",
"Epoch 11/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8896 - acc: 0.4176\n",
"Epoch 12/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8794 - acc: 0.4186\n",
"Epoch 13/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8720 - acc: 0.4191\n",
"Epoch 14/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8667 - acc: 0.4212\n",
"Epoch 15/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8628 - acc: 0.4206\n",
"Epoch 16/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8596 - acc: 0.4209\n",
"Epoch 17/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8573 - acc: 0.4204\n",
"Epoch 18/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8551 - acc: 0.4210\n",
"Epoch 19/30\n",
"90000/90000 [==============================] - 1s 8us/step - loss: 0.8535 - acc: 0.4214\n",
"Epoch 20/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8521 - acc: 0.4214\n",
"Epoch 21/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8507 - acc: 0.4217\n",
"Epoch 22/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8497 - acc: 0.4222\n",
"Epoch 23/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8489 - acc: 0.4216\n",
"Epoch 24/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8479 - acc: 0.4225\n",
"Epoch 25/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8472 - acc: 0.4224\n",
"Epoch 26/30\n",
"90000/90000 [==============================] - 1s 8us/step - loss: 0.8464 - acc: 0.4227\n",
"Epoch 27/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8456 - acc: 0.4226\n",
"Epoch 28/30\n",
"90000/90000 [==============================] - 1s 8us/step - loss: 0.8452 - acc: 0.4221\n",
"Epoch 29/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8443 - acc: 0.4226\n",
"Epoch 30/30\n",
"90000/90000 [==============================] - 1s 7us/step - loss: 0.8435 - acc: 0.4224A: 0s - loss: 0.8431 -\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from keras import Model, regularizers\n",
"import keras.backend as K\n",
"from keras.layers import Embedding, Reshape, Input, Dense, Dot\n",
"from keras.models import load_model\n",
"from keras import initializers\n",
"from sklearn.utils import shuffle\n",
"\n",
"# Clear the Keras backend session before the model is built\n",
"# (presumably so state left by earlier runs of this cell is dropped -- TODO confirm).\n",
"K.clear_session()\n",
"\n",
"\n",
"def Recommend_model(num_user, num_movie, embedding_size):\n",
"    \"\"\"Build, compile and summarize a dot-product rating model.\n",
"\n",
"    Each user id and each movie id is looked up in its own embedding table;\n",
"    the model output is the dot product of the two looked-up vectors.\n",
"\n",
"    Args:\n",
"        num_user: largest user id; the user table has `num_user + 1` rows.\n",
"        num_movie: largest movie id; the movie table has `num_movie + 1` rows.\n",
"        embedding_size: length of every user / movie vector.\n",
"\n",
"    Returns:\n",
"        The compiled Keras model taking `[user_ids, movie_ids]` as inputs.\n",
"        The model summary is printed as a side effect.\n",
"    \"\"\"\n",
"    # Input layers: integer ids.\n",
"    user_in = Input(shape=[None, ], dtype=\"int32\")\n",
"    movie_in = Input(shape=[None, ], dtype=\"int32\")\n",
"\n",
"    def latent_vector(ids, table_rows):\n",
"        # Embedding layer. input_dim is the vocabulary size, i.e. the largest\n",
"        # integer index + 1 (see the notes file on common Keras APIs for the\n",
"        # full list of Embedding arguments).\n",
"        # A regularized variant would additionally pass\n",
"        # embeddings_initializer=initializers.random_normal(stddev=0.01) and\n",
"        # embeddings_regularizer=regularizers.l2(0.01).\n",
"        looked_up = Embedding(table_rows, embedding_size, input_length=1)(ids)\n",
"        # Drop the length-1 sequence axis: (batch, 1, size) -> (batch, size).\n",
"        return Reshape((embedding_size,))(looked_up)\n",
"\n",
"    # The user branch is built before the movie branch so the automatically\n",
"    # numbered layer names keep their order.\n",
"    user_vec = latent_vector(user_in, num_user + 1)\n",
"    movie_vec = latent_vector(movie_in, num_movie + 1)\n",
"\n",
"    # Output layer: predicted rating = <user vector, movie vector>.\n",
"    rating = Dot(1)([user_vec, movie_vec])\n",
"\n",
"    net = Model(inputs=[user_in, movie_in], outputs=rating)\n",
"    net.compile(loss='mse', optimizer='Adam', metrics=['accuracy'])\n",
"    net.summary()\n",
"    return net\n",
"\n",
"\n",
"def split_data(df, random_state=None):\n",
"    \"\"\"Split ratings chronologically into shuffled train and test frames.\n",
"\n",
"    Rows whose `time` is below the 0.9 quantile of `time` form the training\n",
"    set; rows at or above it form the test set. Each part is ordered by\n",
"    (`user`, `time`) and then shuffled.\n",
"\n",
"    Only new frames are created: neither `df` nor a slice of it is sorted in\n",
"    place, so the caller's frame is left untouched and pandas has no reason\n",
"    to emit a SettingWithCopyWarning.\n",
"\n",
"    Args:\n",
"        df: DataFrame with at least the columns `user` and `time`.\n",
"        random_state: optional seed forwarded to `shuffle` for a reproducible\n",
"            split; the default None keeps the shuffle unseeded.\n",
"\n",
"    Returns:\n",
"        Tuple `(train, test)` of shuffled DataFrames.\n",
"    \"\"\"\n",
"    boundary = df['time'].quantile(.9)  # chronological cut-off\n",
"    sort_keys = ['user', 'time']\n",
"    # sort_values without inplace=True returns a fresh frame for each part.\n",
"    train = df[df['time'] < boundary].sort_values(by=sort_keys, axis=0)\n",
"    test = df[df['time'] >= boundary].sort_values(by=sort_keys, axis=0)\n",
"    # Shuffle both parts.\n",
"    return shuffle(train, random_state=random_state), shuffle(test, random_state=random_state)\n",
"\n",
"\n",
"def load_data(path):\n",
"    \"\"\"Read the ratings file and prepare model inputs for both splits.\n",
"\n",
"    The file is read as tab-separated text without a header row; its columns\n",
"    are named `user`, `item`, `rating` and `time`.\n",
"\n",
"    Args:\n",
"        path: location of the ratings file.\n",
"\n",
"    Returns:\n",
"        Tuple `(num_user, num_movie, train_x, train_y, test_x, test_y)` where\n",
"        `train_x` / `test_x` are `[user_ids, movie_ids]` lists of arrays and\n",
"        `train_y` / `test_y` are the matching rating arrays.\n",
"    \"\"\"\n",
"    dformat = ['user', 'item', 'rating', 'time']\n",
"    rating = pd.read_csv(path, sep=\"\\t\", header=None, names=dformat)\n",
"    train_rating, test_rating = split_data(rating)\n",
"\n",
"    # Take the largest ids over ALL ratings, not only the training split: the\n",
"    # split is chronological, so the test part may contain ids larger than any\n",
"    # id seen in training, and those must still be valid embedding indices.\n",
"    num_user = np.max(rating[\"user\"])\n",
"    num_movie = np.max(rating[\"item\"])\n",
"    print(\"num_user: {} num_movie: {}\".format(num_user, num_movie))\n",
"    print(\"train: {} test: {}\".format(len(train_rating), len(test_rating)))\n",
"\n",
"    train_x = [train_rating['user'].values, train_rating['item'].values]\n",
"    train_y = train_rating['rating'].values\n",
"    test_x = [test_rating['user'].values, test_rating['item'].values]\n",
"    test_y = test_rating['rating'].values\n",
"    return num_user, num_movie, train_x, train_y, test_x, test_y\n",
"\n",
"\n",
"def train(num_user, num_movie, train_x, train_y, model_save_path, batch_size=128, epochs=30, embedding_size=4):\n",
"    \"\"\"Build the recommender, fit it on the training data and save it.\n",
"\n",
"    Args:\n",
"        num_user: largest user id, forwarded to `Recommend_model`.\n",
"        num_movie: largest movie id, forwarded to `Recommend_model`.\n",
"        train_x: `[user_ids, movie_ids]` training inputs.\n",
"        train_y: training ratings.\n",
"        model_save_path: file the fitted model is saved to.\n",
"        batch_size: batch size passed to `fit`.\n",
"        epochs: number of epochs passed to `fit`.\n",
"        embedding_size: length of the user / movie vectors.\n",
"\n",
"    Returns:\n",
"        The fitted model.\n",
"    \"\"\"\n",
"    import os  # local import keeps this block self-contained\n",
"\n",
"    # Create the target directory up front so a missing folder cannot make\n",
"    # the save step fail after all epochs have already been trained.\n",
"    save_dir = os.path.dirname(model_save_path)\n",
"    if save_dir:\n",
"        os.makedirs(save_dir, exist_ok=True)\n",
"\n",
"    model = Recommend_model(num_user, num_movie, embedding_size)\n",
"    model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs)\n",
"    model.save(model_save_path)\n",
"    return model\n",
"\n",
"\n",
"def evaluate(model, test_x, test_y):\n",
"    \"\"\"Print the model's loss and accuracy on the given test data.\n",
"\n",
"    Args:\n",
"        model: object whose `evaluate(x, y, verbose=0)` returns a sequence\n",
"            with the loss first and the accuracy second.\n",
"        test_x: test inputs passed through to `model.evaluate`.\n",
"        test_y: test targets passed through to `model.evaluate`.\n",
"    \"\"\"\n",
"    scores = model.evaluate(test_x, test_y, verbose=0)\n",
"    loss = scores[0]\n",
"    accuracy_percent = scores[1] * 100\n",
"    print(\"Evaluation on test data: loss = %0.6f accuracy = %0.2f%%\" % (loss, accuracy_percent))\n",
"\n",
"\n",
"def predict(model, input_x):\n",
"    \"\"\"Run `model.predict` on `input_x` and print the result.\n",
"\n",
"    Args:\n",
"        model: object exposing `predict(inputs)`.\n",
"        input_x: inputs handed to `model.predict` unchanged.\n",
"    \"\"\"\n",
"    print(\"pred = {}\".format(model.predict(input_x)))\n",
"\n",
"\n",
"# Driver: load the ratings, then train and save the model. The names bound\n",
"# here (model, test_x, test_y, model_save_path) are reused by later cells.\n",
"if __name__ == '__main__':\n",
" # Ratings file read by load_data.\n",
" data_path = \"./data/u.data\"\n",
" # File the trained model is saved to by train().\n",
" model_save_path = './model/mf_model.h5'\n",
"\n",
" num_user, num_movie, train_x, train_y, test_x, test_y = load_data(data_path)\n",
"\n",
" # Uses train()'s default batch_size, epochs and embedding_size.\n",
" model = train(num_user, num_movie, train_x, train_y, model_save_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 验证"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T11:13:06.085270Z",
"start_time": "2020-08-23T11:13:04.509692Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation on test data: loss = 10.496347 accuracy = 10.65%\n"
]
}
],
"source": [
"# Print loss and accuracy of the trained model on the test split.\n",
"evaluate(model, test_x, test_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 加载模型"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T10:05:38.148759Z",
"start_time": "2020-08-23T10:05:37.641417Z"
}
},
"outputs": [],
"source": [
"# Reload the model from the file that train() saved it to.\n",
"model = load_model(model_save_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 预测"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T10:05:39.865348Z",
"start_time": "2020-08-23T10:05:39.856793Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[array([353, 145, 146, ..., 753, 102, 56], dtype=int64),\n",
" array([327, 347, 272, ..., 435, 175, 597], dtype=int64)]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test inputs as returned by load_data: [user_ids, movie_ids].\n",
"test_x"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T10:05:40.788430Z",
"start_time": "2020-08-23T10:05:40.782967Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# True rating of the last test sample.\n",
"test_y[-1]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T10:05:41.880114Z",
"start_time": "2020-08-23T10:05:41.873859Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# True rating of the second-to-last test sample.\n",
"test_y[-2]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T02:34:11.143251Z",
"start_time": "2020-08-23T02:34:11.138074Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pred = [[0.04973275]]\n"
]
}
],
"source": [
"# Predict the rating for the last test sample: one user id (test_x[0])\n",
"# paired with the matching movie id (test_x[1]).\n",
"input_x = [np.array([test_x[0][-1]]), np.array([test_x[1][-1]])]\n",
"predict(model, input_x)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-23T02:34:36.012906Z",
"start_time": "2020-08-23T02:34:36.008214Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pred = [[2.4741406]]\n"
]
}
],
"source": [
"# Predict the rating for the second-to-last test sample. The movie id must\n",
"# come from test_x[1]; taking it from test_x[0] would feed a user id into\n",
"# the movie input and score an unrelated (user, movie) pair.\n",
"input_x = [np.array([test_x[0][-2]]), np.array([test_x[1][-2]])]\n",
"predict(model, input_x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "tf",
"language": "python",
"name": "tf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment