tfidf search

4a0ee0c4 · zbh · dd84525e · 4a0ee0c4
Commit 4a0ee0c4 authored Nov 06, 2020 by zbh
Hide whitespace changes
Inline Side-by-side

Showing with 315 additions and 67 deletions

Project1/starter_code.ipynb
+315 -67

No files found.
--- a/Project1/starter_code.ipynb
+++ b/Project1/starter_code.ipynb
@@ -101,7 +101,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -110,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
@@ -119,7 +119,7 @@
       "(86821, 86821)"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -164,7 +164,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
@@ -174,7 +174,7 @@
       " ['England'])"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -197,7 +197,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
@@ -226,7 +226,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -240,7 +240,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
@@ -276,7 +276,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -303,7 +303,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -339,34 +339,45 @@
    "    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]\n",
    "    return unicodedata.normalize('NFC', ''.join(chars))\n",
    "\n",
-    "for i in range(len(qlist)):\n",
+    "def qlist_preprocess(qlist):  # normalize + tokenize every question string in place; returns the same list\n",
+    "    for i in range(len(qlist)):\n",
    "\n",
-    "    ques = re.sub('\\d+', '#number', qlist[i])  # 数字变成统一字符\n",
+    "        ques = re.sub('\\d+', '#number', qlist[i])  # collapse every digit run into one placeholder token\n",
-    "    ques = re.sub(\"\\'.\", '', ques)  # 's, 'm 等过滤\n",
+    "        ques = re.sub(r\"(?<=\\w)'\\w*\", '', ques)  # drop 's, 'm, 're ... only when attached to a word, so an opening quote no longer eats the next letter\n",
-    "    ques = ques.replace('-', ' ')\n",
+    "        ques = ques.replace('-', ' ')\n",
-    "    \n",
-    "    # ques = word_tokenize(re.sub('[!.?,]+', '', ques))  # 去除标点, 分词\n",
-    "    ques = re.sub('[!.?,\\\"]+', '', ques).split()  # 去除标点, 分词\n",
-    "    \n",
-    "    ques = [i.lower() for i in ques]  # 转小写\n",
    "\n",
-    "    # lemmatization\n",
+    "        # strip sentence punctuation and any leftover quote marks, then split on whitespace\n",
-    "    tagged_sent = pos_tag(ques)\n",
+    "        ques = re.sub(\"[!.?,\\\"']+\", '', ques).split()  # remove punctuation, tokenize\n",
-    "    ques = [wnl.lemmatize(tag[0], pos=get_wordnet_pos(tag[1])) for tag in tagged_sent]\n",
    "\n",
-    "    ques = [i for i in ques if i not in stop_words]  # 去停用词\n",
+    "        ques = [w.lower() for w in ques]  # lowercase\n",
-    "    \n",
+    "\n",
-    "    # deaccent\n",
+    "        # lemmatize each token using its POS tag\n",
-    "    ques = [strip_accents(i) for i in ques]\n",
+    "        tagged_sent = pos_tag(ques)\n",
+    "        ques = [wnl.lemmatize(tag[0], pos=get_wordnet_pos(tag[1])) for tag in tagged_sent]\n",
    "\n",
-    "    qlist[i] = ques\n",
+    "        ques = [w for w in ques if w not in stop_words]  # drop stop words\n",
    "\n",
-    "# qlist =     # 更新后的问题列表"
+    "        # deaccent\n",
+    "        ques = [strip_accents(w) for w in ques]\n",
+    "\n",
+    "        qlist[i] = ques  # overwrite the raw question with its token list\n",
+    "    return qlist\n",
+    "\n",
+    "# NOTE: qlist_preprocess mutates its argument and returns that same list object"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qlist = qlist_preprocess(qlist)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -397,20 +408,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 211,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 467600 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 211,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "# TODO \n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer = TfidfVectorizer() # 定义一个tf-idf的vectorizer\n",
-    "X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
+    "X = X_tfidf = vectorizer.fit_transform([' '.join(i) for i in qlist])  # bind the result: later cells read X_tfidf\n",
-    "X_tfidf = X  # 结果存放在X矩阵里"
+    "X_tfidf  # echo the sparse tf-idf matrix as the cell output"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 212,
   "metadata": {},
   "outputs": [
    {
@@ -419,7 +442,7 @@
       "(86821, 30608)"
      ]
     },
-     "execution_count": 20,
+     "execution_count": 212,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -429,6 +452,26 @@
   ]
  },
  {
+   "cell_type": "code",
+   "execution_count": 215,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.9943043238027367"
+      ]
+     },
+     "execution_count": 215,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.sum(X_tfidf[0].A)"
+   ]
+  },
+  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -570,7 +613,9 @@
  {
   "cell_type": "code",
   "execution_count": 135,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
   "outputs": [
    {
     "name": "stdout",
@@ -704,88 +749,290 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO 基于BERT的句子向量计算\n",
+    "\n",
+    "X_bert = results # 每一个句子的向量结果存放在X_bert矩阵里。行数为句子的总个数，列数为一个句子embedding大小。 "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 第三部分： 相似度匹配以及搜索\n",
+    "在这部分里，我们需要把用户每一个输入跟知识库里的每一个问题做一个相似度计算，从而得出最相似的问题。但对于这个问题，时间复杂度其实很高，所以我们需要结合倒排表来获取相似度最高的问题，从而获得答案。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.1 tf-idf + 余弦相似度\n",
+    "我们可以直接基于计算出来的``tf-idf``向量，计算用户最新问题与库中存储的问题之间的相似度，从而选择相似度最高的问题的答案。这个方法的复杂度为``O(N)``， ``N``是库中问题的个数。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 203,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"'beyonce', 'start', 'become, popular?\"\n",
+    "q_list = qlist_preprocess([query])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 204,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['eyonce', 'tart', 'ecome', 'popular']]"
+      ]
+     },
+     "execution_count": 204,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "q_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ques_tfidf = vectorizer.transform([' '.join(i) for i in q_list])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 206,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "(86821, 768)"
+       "array([[0., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
-     "execution_count": 136,
+     "execution_count": 206,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "results.shape"
+    "ques_tfidf.A"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 207,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "array([[ 0.3757278 , -0.36750692,  0.44403127, ..., -0.29200172,\n",
+       "array([[0., 0., 0., ..., 0., 0., 0.],\n",
-       "         0.22539857, -0.57711858],\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "       [ 0.08952965, -0.33501336,  0.476886  , ..., -0.01217865,\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "        -0.0762298 , -0.18508096],\n",
-       "       [ 0.25395563, -0.22788545,  0.83195764, ..., -0.30400619,\n",
-       "         0.0784178 , -0.79656309],\n",
       "       ...,\n",
-       "       [-0.02081348, -0.13692667, -0.27600896, ..., -0.26923919,\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "         0.11144835, -0.08454084],\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "       [-0.47825322,  0.26795244,  0.15068491, ..., -0.79847699,\n",
+       "       [0., 0., 0., ..., 0., 0., 0.]])"
-       "         0.37375435, -0.31773022],\n",
-       "       [-0.43065107,  0.05200777, -0.23492971, ..., -0.10056096,\n",
-       "         0.03602708,  0.12411974]])"
      ]
     },
-     "execution_count": 137,
+     "execution_count": 207,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "results"
+    "X_tfidf.A"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 170,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 467600 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 170,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_tfidf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# TODO 基于BERT的句子向量计算\n",
+    "arr = ques_tfidf.A\n",
-    "\n",
+    "brr = X_tfidf.A\n",
-    "X_bert =   # 每一个句子的向量结果存放在X_bert矩阵里。行数为句子的总个数，列数为一个句子embedding大小。 "
+    "def vector_matrix(arr, brr):  # cosine similarity of the single vector `arr` against every row of `brr`\n",
+    "    return arr.dot(brr.T) / np.maximum(np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)), np.finfo(float).eps)  # clamp the norm product so an all-zero vector scores 0 instead of nan"
   ]
  },
  {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 194,
   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/zbh/anaconda3/envs/py37/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide\n",
+      "  after removing the cwd from sys.path.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[nan, nan, nan, ..., nan, nan, nan]])"
+      ]
+     },
+     "execution_count": 194,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "### 第三部分： 相似度匹配以及搜索\n",
+    "vector_matrix(arr, brr)"
-    "在这部分里，我们需要把用户每一个输入跟知识库里的每一个问题做一个相似度计算，从而得出最相似的问题。但对于这个问题，时间复杂度其实很高，所以我们需要结合倒排表来获取相似度最高的问题，从而获得答案。"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 192,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0., 0., 0., ..., 0., 0., 0.]])"
+      ]
+     },
+     "execution_count": 192,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "arr.A"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 185,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<1x86821 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 0 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 185,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ques_tfidf.dot(X_tfidf.T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 183,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<1x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 0 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 183,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": []
  },
  {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 173,
   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[84, 84, 36, 54]])"
+      ]
+     },
+     "execution_count": 173,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "#### 3.1 tf-idf + 余弦相似度\n",
+    "a = np.array([[1, 2, 1, 2, 3, 5, 6, 2]])\n",
-    "我们可以直接基于计算出来的``tf-idf``向量，计算用户最新问题与库中存储的问题之间的相似度，从而选择相似度最高的问题的答案。这个方法的复杂度为``O(N)``， ``N``是库中问题的个数。"
+    "a.dot(b.T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 177,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[1., 2., 1., 2., 3., 5., 6., 2.],\n",
+       "       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
+       "       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
+       "       [1., 2., 1., 2., 3., 5., 6., 2.]])"
+      ]
+     },
+     "execution_count": 177,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.ones((4,1)).dot(a)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = TfidfVectorizer() # 定义一个tf-idf的vectorizer\n",
+    "X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
+    "X_tfidf = X  # 结果存放在X矩阵里"
   ]
  },
  {
@@ -803,6 +1050,7 @@
    "    3. 找出相似度最高的top5问题的答案\n",
    "    \"\"\"\n",
    "    \n",
+    "    \n",
    "    top_idxs = []  # top_idxs存放相似度最高的（存在qlist里的）问题的下标 \n",
    "                   # hint: 请使用 priority queue来找出top results. 思考为什么可以这么做？ \n",
    "    \n",