project2 commit

a14e52dc · 20200801011 · 3d717334 · a14e52dc · a14e52dc · a14e52dc
Commit a14e52dc authored Sep 04, 2020 by 20200801011
Hide whitespace changes
Inline Side-by-side

Showing with 442 additions and 0 deletions

project2/Keras实现MF.ipynb
+442 -0

project2/data/u.data
+0 -0

project2/model/mf_model.h5
+0 -0

No files found.
--- a/project2/Keras实现MF.ipynb
+++ b/project2/Keras实现MF.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 训练"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T11:16:41.915123Z",
+     "start_time": "2020-08-23T11:13:25.882168Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\jerry\\anaconda3\\envs\\tf\\lib\\site-packages\\ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "C:\\Users\\jerry\\anaconda3\\envs\\tf\\lib\\site-packages\\ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "num_user: 943 num_movie: 1682\n",
+      "train: 90000 test: 10000\n",
+      "__________________________________________________________________________________________________\n",
+      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
+      "==================================================================================================\n",
+      "input_1 (InputLayer)            (None, None)         0                                            \n",
+      "__________________________________________________________________________________________________\n",
+      "input_2 (InputLayer)            (None, None)         0                                            \n",
+      "__________________________________________________________________________________________________\n",
+      "embedding_1 (Embedding)         (None, 1, 4)         3776        input_1[0][0]                    \n",
+      "__________________________________________________________________________________________________\n",
+      "embedding_2 (Embedding)         (None, 1, 4)         6732        input_2[0][0]                    \n",
+      "__________________________________________________________________________________________________\n",
+      "reshape_1 (Reshape)             (None, 4)            0           embedding_1[0][0]                \n",
+      "__________________________________________________________________________________________________\n",
+      "reshape_2 (Reshape)             (None, 4)            0           embedding_2[0][0]                \n",
+      "__________________________________________________________________________________________________\n",
+      "dot_1 (Dot)                     (None, 1)            0           reshape_1[0][0]                  \n",
+      "                                                                 reshape_2[0][0]                  \n",
+      "==================================================================================================\n",
+      "Total params: 10,508\n",
+      "Trainable params: 10,508\n",
+      "Non-trainable params: 0\n",
+      "__________________________________________________________________________________________________\n",
+      "Epoch 1/30\n",
+      "90000/90000 [==============================] - 1s 9us/step - loss: 13.1581 - acc: 2.8889e-04\n",
+      "Epoch 2/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 7.6275 - acc: 0.0553\n",
+      "Epoch 3/30\n",
+      "90000/90000 [==============================] - 1s 8us/step - loss: 3.0855 - acc: 0.2012\n",
+      "Epoch 4/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 1.7874 - acc: 0.3039\n",
+      "Epoch 5/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 1.3459 - acc: 0.3516\n",
+      "Epoch 6/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 1.1382 - acc: 0.3794\n",
+      "Epoch 7/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 1.0283 - acc: 0.3940\n",
+      "Epoch 8/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.9659 - acc: 0.4039\n",
+      "Epoch 9/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.9283 - acc: 0.4104\n",
+      "Epoch 10/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.9049 - acc: 0.4151\n",
+      "Epoch 11/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8896 - acc: 0.4176\n",
+      "Epoch 12/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8794 - acc: 0.4186\n",
+      "Epoch 13/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8720 - acc: 0.4191\n",
+      "Epoch 14/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8667 - acc: 0.4212\n",
+      "Epoch 15/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8628 - acc: 0.4206\n",
+      "Epoch 16/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8596 - acc: 0.4209\n",
+      "Epoch 17/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8573 - acc: 0.4204\n",
+      "Epoch 18/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8551 - acc: 0.4210\n",
+      "Epoch 19/30\n",
+      "90000/90000 [==============================] - 1s 8us/step - loss: 0.8535 - acc: 0.4214\n",
+      "Epoch 20/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8521 - acc: 0.4214\n",
+      "Epoch 21/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8507 - acc: 0.4217\n",
+      "Epoch 22/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8497 - acc: 0.4222\n",
+      "Epoch 23/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8489 - acc: 0.4216\n",
+      "Epoch 24/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8479 - acc: 0.4225\n",
+      "Epoch 25/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8472 - acc: 0.4224\n",
+      "Epoch 26/30\n",
+      "90000/90000 [==============================] - 1s 8us/step - loss: 0.8464 - acc: 0.4227\n",
+      "Epoch 27/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8456 - acc: 0.4226\n",
+      "Epoch 28/30\n",
+      "90000/90000 [==============================] - 1s 8us/step - loss: 0.8452 - acc: 0.4221\n",
+      "Epoch 29/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8443 - acc: 0.4226\n",
+      "Epoch 30/30\n",
+      "90000/90000 [==============================] - 1s 7us/step - loss: 0.8435 - acc: 0.4224\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from keras import Model, regularizers\n",
+    "import keras.backend as K\n",
+    "from keras.layers import Embedding, Reshape, Input, Dense, Dot\n",
+    "from keras.models import load_model\n",
+    "from keras import initializers\n",
+    "from sklearn.utils import shuffle\n",
+    "\n",
+    "K.clear_session()\n",
+    "\n",
+    "\n",
+    "def Recommend_model(num_user, num_movie, embedding_size):\n",
+    "    # 输入层\n",
+    "    input_user = Input(shape=[None, ], dtype=\"int32\")\n",
+    "    input_movie = Input(shape=[None, ], dtype=\"int32\")\n",
+    "    \n",
+    "    # 嵌入层\n",
+    "    # input_dim: int > 0。词汇表大小， 即，最大整数 index + 1。\n",
+    "    # 看keras常见API那个文件，里面介绍了这个Embedding这个的详细参数\n",
+    "    user_embedding = Embedding(num_user + 1, embedding_size, input_length=1)(input_user)\n",
+    "    # 加正则\n",
+    "    #user_embedding = Embedding(num_user + 1, embedding_size, input_length=1, embeddings_initializer=initializers.random_normal(stddev=0.01), embeddings_regularizer=regularizers.l2(0.01))(input_user)    \n",
+    "    user_embedding = Reshape((embedding_size,))(user_embedding)\n",
+    "    \n",
+    "    movie_embedding = Embedding(num_movie + 1, embedding_size, input_length=1)(input_movie)\n",
+    "    movie_embedding = Reshape((embedding_size,))(movie_embedding)\n",
+    "    \n",
+    "    # 输出层\n",
+    "    out = Dot(1)([user_embedding, movie_embedding])\n",
+    "    \n",
+    "    model = Model(inputs=[input_user, input_movie], outputs=out)\n",
+    "    model.compile(loss='mse', optimizer='Adam', metrics=['accuracy'])\n",
+    "    model.summary()\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "def split_data(df):\n",
+    "    df.sort_values(by=['time'], inplace=True)  # 按时间排序\n",
+    "    boundary = df['time'].quantile(.9)  # 按时间划分 分界线\n",
+    "    train = df[df['time'] < boundary]\n",
+    "    train.sort_values(by=['user', 'time'], axis=0, inplace=True)\n",
+    "    test = df[df['time'] >= boundary]\n",
+    "    test.sort_values(by=['user', 'time'], axis=0, inplace=True)\n",
+    "    # shuffle 数据集\n",
+    "    return shuffle(train), shuffle(test)\n",
+    "\n",
+    "\n",
+    "def load_data(path):\n",
+    "    dformat = ['user', 'item', 'rating', 'time']\n",
+    "    rating = pd.read_csv(path, sep=\"\\t\", header=None, names=dformat)\n",
+    "    train_rating, test_rating = split_data(rating)\n",
+    "    num_user = np.max(train_rating[\"user\"])\n",
+    "    num_movie = np.max(train_rating[\"item\"])\n",
+    "    print(\"num_user: {} num_movie: {}\".format(num_user, num_movie))\n",
+    "    print(\"train: {} test: {}\".format(len(train_rating), len(test_rating)))\n",
+    "\n",
+    "    train_user, train_movie = train_rating['user'].values, train_rating['item'].values\n",
+    "    test_user, test_movie = test_rating['user'].values, test_rating['item'].values\n",
+    "\n",
+    "    train_x = [train_user, train_movie]\n",
+    "    train_y = train_rating['rating'].values\n",
+    "    test_x = [test_user, test_movie]\n",
+    "    test_y = test_rating['rating'].values\n",
+    "    return num_user, num_movie, train_x, train_y, test_x, test_y\n",
+    "\n",
+    "\n",
+    "def train(num_user, num_movie, train_x, train_y, model_save_path, batch_size=128, epochs=30, embedding_size=4):\n",
+    "    model = Recommend_model(num_user, num_movie, embedding_size)\n",
+    "    model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs)\n",
+    "    model.save(model_save_path)\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "def evaluate(model, test_x, test_y):\n",
+    "    eval_ = model.evaluate(test_x, test_y, verbose=0)\n",
+    "    print(\"Evaluation on test data: loss = %0.6f accuracy = %0.2f%%\" % (eval_[0], eval_[1] * 100))\n",
+    "\n",
+    "\n",
+    "def predict(model, input_x):\n",
+    "    pred = model.predict(input_x)\n",
+    "    print(\"pred = {}\".format(pred))\n",
+    "\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    data_path = \"./data/u.data\"\n",
+    "    model_save_path = './model/mf_model.h5'\n",
+    "\n",
+    "    num_user, num_movie, train_x, train_y, test_x, test_y = load_data(data_path)\n",
+    "\n",
+    "    model = train(num_user, num_movie, train_x, train_y, model_save_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 验证"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T11:13:06.085270Z",
+     "start_time": "2020-08-23T11:13:04.509692Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluation on test data: loss = 10.496347 accuracy = 10.65%\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate(model, test_x, test_y)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 加载模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T10:05:38.148759Z",
+     "start_time": "2020-08-23T10:05:37.641417Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model = load_model(model_save_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 预测"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T10:05:39.865348Z",
+     "start_time": "2020-08-23T10:05:39.856793Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[array([353, 145, 146, ..., 753, 102,  56], dtype=int64),\n",
+       " array([327, 347, 272, ..., 435, 175, 597], dtype=int64)]"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T10:05:40.788430Z",
+     "start_time": "2020-08-23T10:05:40.782967Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_y[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T10:05:41.880114Z",
+     "start_time": "2020-08-23T10:05:41.873859Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_y[-2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T02:34:11.143251Z",
+     "start_time": "2020-08-23T02:34:11.138074Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pred = [[0.04973275]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "input_x = [np.array([test_x[0][-1]]), np.array([test_x[1][-1]])]\n",
+    "predict(model, input_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-08-23T02:34:36.012906Z",
+     "start_time": "2020-08-23T02:34:36.008214Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pred = [[2.4741406]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "input_x = [np.array([test_x[0][-2]]), np.array([test_x[1][-2]])]\n",
+    "predict(model, input_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tf",
+   "language": "python",
+   "name": "tf"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/project2/data/u.data
+++ b/project2/data/u.data
--- a/project2/model/mf_model.h5
+++ b/project2/model/mf_model.h5