Commit fecd2fd3 by zbh

inverted idx

parent 4a0ee0c4
@@ -408,32 +408,20 @@
 },
 {
 "cell_type": "code",
-"execution_count": 211,
+"execution_count": 230,
 "metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 467600 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 211,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
 "# TODO \n",
 "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-"vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer\n",
-"vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
-"# X_tfidf = X  # the result is stored in matrix X"
+"vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)  # define a tf-idf vectorizer\n",
+"X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
+"X_tfidf = X  # the result is stored in matrix X"
 ]
},
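
Editor's note on the change above: with `norm=None` the rows of `X_tfidf` are no longer unit-length, so the plain dot product used later in `get_top_results_tfidf_noindex` (`sim = (X_tfidf * q_vector.T).toarray()`) ranks by raw inner product rather than cosine similarity, even though its comment still assumes the default l2 norm. A minimal sketch of restoring the cosine ranking; `docs` is a stand-in for `qlist`:

```python
# A minimal sketch (not part of the commit): with norm=None the rows are
# not unit vectors, so normalize explicitly before treating a dot product
# as a cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

docs = ["when did beyonce start becoming popular",
        "where did george come from"]                  # stand-in for qlist
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
X = vectorizer.fit_transform(docs)

X_norm = normalize(X)                                  # l2-normalize each row
q_norm = normalize(vectorizer.transform(["when did beyonce become popular"]))
cosine = (X_norm @ q_norm.T).toarray()                 # a true cosine similarity
print(cosine)
```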
 {
 "cell_type": "code",
-"execution_count": 212,
+"execution_count": 231,
 "metadata": {},
 "outputs": [
 {
@@ -442,7 +430,7 @@
 "(86821, 30608)"
 ]
 },
-"execution_count": 212,
+"execution_count": 231,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -452,26 +440,6 @@
 ]
 },
 {
-"cell_type": "code",
-"execution_count": 215,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"1.9943043238027367"
-]
-},
-"execution_count": 215,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"np.sum(X_tfidf[0].A)"
-]
-},
-{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
@@ -776,268 +744,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 203,
-"metadata": {},
-"outputs": [],
-"source": [
-"query = \"'beyonce', 'start', 'become, popular?\"\n",
-"q_list = qlist_preprocess([query])"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 204,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"[['eyonce', 'tart', 'ecome', 'popular']]"
-]
-},
-"execution_count": 204,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"q_list"
-]
-},
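
Editor's note on the two scratch cells above: `qlist_preprocess` (not shown in this diff) turns `'beyonce', 'start', 'become, popular?` into `[['eyonce', 'tart', 'ecome', 'popular']]`, losing the first letter of each quoted word — most likely its punctuation handling deletes a character adjacent to the apostrophe (a hypothesis; the function body is not part of this commit). Extracting word characters, instead of deleting around punctuation, avoids the problem:

```python
# Editor's sketch (not part of the commit): a tokenizer that treats quotes
# and commas purely as separators, so no letters are chewed off the words.
import re

def tokenize(text):
    # keep alphanumeric runs; all punctuation acts as a delimiter
    return re.findall(r"[a-z0-9]+", text.lower())

print(tokenize("'beyonce', 'start', 'become, popular?"))
# ['beyonce', 'start', 'become', 'popular']
```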
-{
-"cell_type": "code",
-"execution_count": 205,
-"metadata": {},
-"outputs": [],
-"source": [
-"ques_tfidf = vectorizer.transform([' '.join(i) for i in q_list])"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 206,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 206,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"ques_tfidf.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 207,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       ...,\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 207,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"X_tfidf.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 170,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 467600 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 170,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"X_tfidf"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 193,
-"metadata": {},
-"outputs": [],
-"source": [
-"arr = ques_tfidf.A\n",
-"brr = X_tfidf.A\n",
-"def vector_matrix(arr, brr):\n",
-"    return arr.dot(brr.T) / (np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 194,
-"metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/Users/zbh/anaconda3/envs/py37/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide\n",
-"  after removing the cwd from sys.path.\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"array([[nan, nan, nan, ..., nan, nan, nan]])"
-]
-},
-"execution_count": 194,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"vector_matrix(arr, brr)"
-]
-},
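
Editor's note: the `RuntimeWarning` and the all-`nan` row above follow directly from the zero query vector — its l2 norm is 0, so the cosine division is 0/0. A hedged variant of the same `vector_matrix` with a guarded denominator:

```python
# A minimal sketch (not part of the commit): guard the denominator so a
# zero query vector (or a zero document row) yields 0-ish similarity
# instead of nan.
import numpy as np

def vector_matrix(arr, brr, eps=1e-12):
    # cosine similarity between one row vector `arr` and each row of `brr`
    denom = np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))
    return arr.dot(brr.T) / np.maximum(denom, eps)
```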
-{
-"cell_type": "code",
-"execution_count": 192,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 192,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"arr.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 185,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<1x86821 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 0 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 185,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"ques_tfidf.dot(X_tfidf.T)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 183,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<1x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 0 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 183,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": []
-},
-{
-"cell_type": "code",
-"execution_count": 173,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[84, 84, 36, 54]])"
-]
-},
-"execution_count": 173,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"a = np.array([[1, 2, 1, 2, 3, 5, 6, 2]])\n",
-"a.dot(b.T)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 177,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.]])"
-]
-},
-"execution_count": 177,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"np.ones((4,1)).dot(a)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer\n",
-"X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
-"X_tfidf = X  # the result is stored in matrix X"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
+"execution_count": 566,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1050,22 +757,51 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
+"    q_vector = vectorizer.transform([' '.join(qlist_preprocess([query])[0])])\n",
+"    # compute cosine similarity; tf-idf defaults to the l2 norm; matrix multiplication\n",
+"    sim = (X_tfidf * q_vector.T).toarray()\n",
+"\n",
+"    \n",
+"    # res = np.argsort(sim)\n",
+"    # query = 'when beyonce start become popular?'\n",
+"    # ans: array([[43410, 57532, 57531, ..., 39267,   145,     0]])\n",
+"    \n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
 "    # hint: use a priority queue to find the top results. Think about why this works. \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
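
Editor's note on the top-5 selection above: each score is pushed onto the queue and, once it holds more than five entries, the smallest is evicted, so only the five largest similarities can survive — O(n log 5) work instead of sorting all n scores. `queue.PriorityQueue` adds thread-safety locking the notebook does not need; a plain `heapq` sketch of the same idea:

```python
# A minimal sketch (not part of the commit): bounded top-k selection with
# heapq, which is what queue.PriorityQueue wraps minus the locking.
import heapq

def top_k(scores, k=5):
    heap = []                          # min-heap of (score, index)
    for idx, score in enumerate(scores):
        heapq.heappush(heap, (score, idx))
        if len(heap) > k:
            heapq.heappop(heap)        # evict the current smallest
    return sorted(heap, reverse=True)  # best first

print(top_k([0.1, 0.9, 0.3, 0.7, 0.2, 0.8], k=3))
# [(0.9, 1), (0.8, 5), (0.7, 3)]
```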
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 567,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[['in the late 1990s'], ['Dangerously in Love Tour'], ['Particularly since the 1950s, pro wrestling events have frequently been responsible for sellout crowds at large arenas'], ['her mother'], ['Germany, the Netherlands, Switzerland, Latvia, Estonia and Hungary']]\n",
+"[['in the later 19th century'], ['Tags'], ['Ashoka'], ['economic, social, and cultural'], ['water buffalo']]\n"
+]
+}
+],
 "source": [
 "# TODO: write a few test cases and print the results\n",
-"print (get_top_results_tfidf_noindex(\"\"))\n",
-"print (get_top_results_tfidf_noindex(\"\"))"
+"print(get_top_results_tfidf_noindex(\"when beyonce start become popular?\"))\n",
+"print(get_top_results_tfidf_noindex(\"where jordge come from\"))"
 ]
},
{
@@ -1085,19 +821,26 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 531,
 "metadata": {},
 "outputs": [],
 "source": [
 "# TODO create the inverted index\n",
-"inverted_idx = {}  # define a simple inverted index as a map structure; one pass over qlist is enough"
+"inverted_idx = {}  # define a simple inverted index as a map structure; one pass over qlist is enough\n",
+"\n",
+"for i, ques in enumerate(qlist):\n",
+"    for word in ques:\n",
+"        if word in inverted_idx.keys():\n",
+"            inverted_idx[word].add(i)\n",
+"        else:\n",
+"            inverted_idx[word] = set([i])"
 ]
},
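
Editor's note: the membership test `word in inverted_idx.keys()` works (dict views have O(1) lookup in Python 3), but `collections.defaultdict` expresses the same index without the branch. A sketch with a toy `qlist`, followed by the typical candidate lookup the index exists for:

```python
# A minimal sketch (not part of the commit): the same inverted index with
# defaultdict, plus a candidate lookup as the union of posting sets.
from collections import defaultdict

qlist = [["when", "beyonce", "popular"], ["where", "george", "come"]]  # toy stand-in

inverted_idx = defaultdict(set)
for i, ques in enumerate(qlist):
    for word in ques:
        inverted_idx[word].add(i)

candidates = set().union(*(inverted_idx[w] for w in ["beyonce", "come"]))
print(sorted(candidates))  # [0, 1]
```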
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"#### 3.3 Semantic similarity\n",
+"#### 3.3 related_words.txt\n",
 "There is still one problem to solve here: semantic similarity. Think of it this way: two words such as car and auto look different but are still similar in meaning. With only an inverted index we cannot account for this kind of similarity between words, so if the search sentence contains ``car`` we have no way to retrieve all the documents that contain auto. We therefore want to store this information as well. How do we solve this? It is actually not hard: build the similarity relations in advance. For a word like ``car``, find the words closest to it in meaning up front, say the top 10, and mark them as ``related words``. In the end we can create a ``map`` that stores the ``related words``, so that calling ``related_words['car']`` retrieves the TOP 10 words closest in meaning to ``car``. \n",
 "\n",
 "How is this ``related_words`` built? Here we again use the ``Glove`` vectors and compute the pairwise (cosine) similarities. Then, for each word, we store the top 10 words closest to it, and the final result is saved in ``related_words``. This computation needs to happen offline, because it is expensive: the complexity is ``O(V*V)``, where V is the total number of words. \n",
@@ -1113,7 +856,15 @@
 "source": [
 "# TODO load the semantically related words\n",
 "def get_related_words(file):\n",
+"    f = open('related_words.txt')\n",
+"    related_words = {}\n",
 "    \n",
+"    for line in f:\n",
+"        items = line.strip().split()\n",
+"        word = items[0]\n",
+"        sim_words_10 = items[1].split(',')\n",
+"        related_words[word] = sim_words_10\n",
+"    f.close()\n",
 "    return related_words\n",
 "\n",
 "related_words = get_related_words('related_words.txt')  # put the file directly in the root of the project folder; do not change this path."
@@ -1134,7 +885,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 574,
+"metadata": {},
+"outputs": [],
+"source": [
+"from nltk.corpus import wordnet as wn\n",
+"def get_related_idx(query):\n",
+"\n",
+"    related_set = set()\n",
+"    query_words = qlist_preprocess([query])[0]\n",
+"    for query_word in query_words:\n",
+"        # 1. read from the file\n",
+"        # related_words_list = related_words[query_word]\n",
+"        # 2. nltk\n",
+"        related_words_list = [x.name().split(\".\")[0] for x in wn.synsets(query_word)][:10]\n",
+"        for related_word in related_words_list:\n",
+"            try:\n",
+"                related_set = related_set | inverted_idx[related_word]\n",
+"            except KeyError as e:\n",
+"                # print(e)\n",
+"                continue \n",
+"    \n",
+"    related_idx = list(related_set)\n",
+"    return related_idx"
+]
+},
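
Editor's note on `get_related_idx` above: `synset.name()` is of the form `'car.n.01'`, so `split(".")[0]` recovers only the head lemma of each synset and often just re-yields the query word. Iterating over the synsets' lemmas also surfaces true synonyms such as auto/automobile for car — a hedged alternative:

```python
# A hedged alternative (not part of the commit): collect lemma names
# rather than synset head names, which is where the synonyms live.
from nltk.corpus import wordnet as wn

def related_terms(word, k=10):
    seen = []
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().lower()
            if name != word and name not in seen:
                seen.append(name)
    return seen[:k]

print(related_terms("car"))
# e.g. ['auto', 'automobile', 'machine', 'motorcar', 'railcar', ...]
```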
+{
+"cell_type": "code",
+"execution_count": 575,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1146,18 +924,41 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    q_vector = vectorizer.transform([' '.join(qlist_preprocess([query])[0])])\n",
+"    \n",
+"    related_index = get_related_idx(query)\n",
+"    X = X_tfidf[related_index]\n",
+"    \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"    # compute cosine similarity; tf-idf defaults to the l2 norm; matrix multiplication\n",
+"    sim = (X * q_vector.T).toarray()\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    \n",
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
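
Editor's note: here `cur` indexes rows of the *filtered* matrix `X = X_tfidf[related_index]`, but the final `alist[i]` treats `top_idxs` as positions in the full question list, so the returned answers can come from the wrong questions (the same pattern appears to recur in `get_top_results_w2v` and `get_top_results_bert` below). Mapping back through `related_index` — e.g. `pq.put((sim[cur][0], related_index[cur]))` — fixes it. A toy demonstration of the mismatch:

```python
# A minimal demonstration (not part of the commit) of the index mismatch.
related_index = [40, 12, 99]                 # toy candidate set from the inverted index
alist = ["answer to question %d" % i for i in range(100)]
filtered_row = 2                             # best row within the filtered matrix
print(alist[filtered_row])                   # wrong: answer of question 2
print(alist[related_index[filtered_row]])    # right: answer of question 99
```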
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 606,
 "metadata": {},
 "outputs": [],
 "source": [
+"def vector_matrix(arr, brr):\n",
+"    return arr.dot(brr.T) / (np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)))\n",
+"\n",
 "def get_top_results_w2v(query):\n",
 "    \"\"\"\n",
 "    Given the user's question query, return the most likely TOP 5 questions. This requires the following steps:\n",
@@ -1166,15 +967,46 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    ques = qlist_preprocess([query])[0]\n",
+"    \n",
+"    vec = np.zeros(200)\n",
+"    length = len(ques)  # sentence length\n",
+"    for word in ques:\n",
+"        try:\n",
+"            vec += word_dict[word]\n",
+"        except KeyError as e:\n",
+"            vec += word_dict['unk']\n",
+"    vec = vec / length\n",
+"    \n",
+"    q_vector = vec\n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"    related_index = get_related_idx(query)\n",
+"    X = X_w2v[related_index]\n",
+"    \n",
+"    \n",
+"    # compute cosine similarity; matrix multiplication\n",
+"    sim = vector_matrix(q_vector, X).reshape(-1, 1)\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    \n",
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
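
Editor's note: `word_dict` (200-dimensional vectors with an `'unk'` entry) and `X_w2v` are used here but never defined in this diff; presumably they come from the 200d GloVe file loaded earlier in the notebook. A hedged sketch of how they might be built — the file path and format are assumptions:

```python
# A hedged sketch (not part of the commit): load GloVe vectors into
# word_dict and stack one mean vector per question into X_w2v.
import numpy as np

def load_glove(path="glove.6B.200d.txt"):     # path is an assumption
    word_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word_dict[parts[0]] = np.asarray(parts[1:], dtype=np.float64)
    word_dict.setdefault('unk', np.zeros(200))  # fallback for OOV words
    return word_dict

def sentence_vec(words, word_dict):
    # mean of the word vectors, falling back to 'unk' for OOV words
    vecs = [word_dict.get(w, word_dict['unk']) for w in words]
    return np.mean(vecs, axis=0)

# X_w2v = np.vstack([sentence_vec(q, word_dict) for q in qlist])
```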
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 612,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1186,22 +1018,61 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    sentence, arrs = bert_embedding([' '.join(qlist_preprocess([query])[0])])[0]\n",
+"    # print(sentence)\n",
+"\n",
+"    vecs = np.array(arrs)\n",
+"    vec = np.mean(vecs, axis=0)\n",
+"    q_vector = vec\n",
+"    \n",
+"    related_index = get_related_idx(query)\n",
+"    X = X_bert[related_index]\n",
+"    \n",
+"    # compute cosine similarity; matrix multiplication\n",
+"    sim = vector_matrix(q_vector, X).reshape(-1, 1)\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
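
Editor's note: `bert_embedding` and `X_bert` are likewise assumed from earlier in the notebook; the call pattern — a list of sentences in, `(tokens, per-token vectors)` pairs out — matches the mxnet `bert-embedding` package. A hedged sketch of precomputing `X_bert` with the same mean pooling used for the query:

```python
# A hedged sketch (not part of the commit): X_bert is not defined in this
# diff; this assumes the mxnet `bert-embedding` package, whose embedder
# returns (tokens, per-token vectors) pairs for a list of sentences.
import numpy as np
from bert_embedding import BertEmbedding

bert_embedding = BertEmbedding()

def bert_sentence_vec(sentence):
    tokens, arrs = bert_embedding([sentence])[0]
    return np.mean(np.array(arrs), axis=0)    # mean-pool the token vectors

# X_bert = np.vstack([bert_sentence_vec(' '.join(q)) for q in qlist])
```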
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 615,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[['in the late 1990s'], ['Fredericksburg'], ['02:28:01 PM China Standard Time'], ['132'], ['Link and Toon Link']]\n",
+"[['in the late 1990s'], ['late 1990s'], ['02:28:01 PM China Standard Time'], ['Kanye West'], ['Shakira']]\n",
+"[['in the late 1990s'], ['late 1990s'], ['Diana Ross.'], ['I Was Here'], ['Romantic era']]\n",
+"[['At Last'], ['J. S. Bach, Mozart and Schubert'], ['piano'], ['Polish'], ['Polish']]\n",
+"[['Beyoncé Cosmetology Center'], ['five.'], ['Madonna and Celine Dion'], ['April 15'], ['Baz Luhrmann']]\n",
+"[['Madonna and Celine Dion'], ['2013 Met Gala'], ['118 million'], ['Baz Luhrmann'], ['eight']]\n"
+]
+}
+],
 "source": [
 "# TODO: write a few test cases and print the results\n",
 "\n",
-"test_query1 = \"\"\n",
-"test_query2 = \"\"\n",
+"# query = \"when beyonce start become popular?\"\n",
+"# qlist_preprocess([query])[0]\n",
+"test_query1 = \"when beyonce start become popular?\"\n",
+"test_query2 = \"where jordge come from\"\n",
 "\n",
 "print (get_top_results_tfidf(test_query1))\n",
 "print (get_top_results_w2v(test_query1))\n",
@@ -1245,7 +1116,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 620,
+"metadata": {},
+"outputs": [],
+"source": [
+"# import nltk\n",
+"# nltk.download('reuters')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 621,
 "metadata": {},
 "outputs": [],
 "source": [
...