Commit fecd2fd3 by zbh

inverted idx

parent 4a0ee0c4
@@ -408,32 +408,20 @@
 },
 {
 "cell_type": "code",
-"execution_count": 211,
+"execution_count": 230,
 "metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 467600 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 211,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
 "# TODO \n",
 "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-"vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer\n",
-"vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
-"# X_tfidf = X  # the result is stored in matrix X"
+"vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)  # define a tf-idf vectorizer\n",
+"X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
+"X_tfidf = X  # the result is stored in matrix X"
 ]
},
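
Editor's note on the change above: with `norm=None` the rows of `X_tfidf` are no longer unit-length, so the plain dot product used later in `get_top_results_tfidf_noindex` (`sim = (X_tfidf * q_vector.T).toarray()`) ranks by raw inner product rather than cosine similarity, even though its comment still assumes the default l2 norm. A minimal sketch of restoring the cosine ranking; `docs` is a stand-in for `qlist`:

```python
# A minimal sketch (not part of the commit): with norm=None the rows are
# not unit vectors, so normalize explicitly before treating a dot product
# as a cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

docs = ["when did beyonce start becoming popular",
        "where did george come from"]                  # stand-in for qlist
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
X = vectorizer.fit_transform(docs)

X_norm = normalize(X)                                  # l2-normalize each row
q_norm = normalize(vectorizer.transform(["when did beyonce become popular"]))
cosine = (X_norm @ q_norm.T).toarray()                 # a true cosine similarity
print(cosine)
```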
 {
 "cell_type": "code",
-"execution_count": 212,
+"execution_count": 231,
 "metadata": {},
 "outputs": [
 {
@@ -442,7 +430,7 @@
 "(86821, 30608)"
 ]
 },
-"execution_count": 212,
+"execution_count": 231,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -452,26 +440,6 @@
 ]
 },
 {
-"cell_type": "code",
-"execution_count": 215,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"1.9943043238027367"
-]
-},
-"execution_count": 215,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"np.sum(X_tfidf[0].A)"
-]
-},
-{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
@@ -776,268 +744,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 203,
-"metadata": {},
-"outputs": [],
-"source": [
-"query = \"'beyonce', 'start', 'become, popular?\"\n",
-"q_list = qlist_preprocess([query])"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 204,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"[['eyonce', 'tart', 'ecome', 'popular']]"
-]
-},
-"execution_count": 204,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"q_list"
-]
-},
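
Editor's note on the two scratch cells above: `qlist_preprocess` (not shown in this diff) turns `'beyonce', 'start', 'become, popular?` into `[['eyonce', 'tart', 'ecome', 'popular']]`, losing the first letter of each quoted word — most likely its punctuation handling deletes a character adjacent to the apostrophe (a hypothesis; the function body is not part of this commit). Extracting word characters, instead of deleting around punctuation, avoids the problem:

```python
# Editor's sketch (not part of the commit): a tokenizer that treats quotes
# and commas purely as separators, so no letters are chewed off the words.
import re

def tokenize(text):
    # keep alphanumeric runs; all punctuation acts as a delimiter
    return re.findall(r"[a-z0-9]+", text.lower())

print(tokenize("'beyonce', 'start', 'become, popular?"))
# ['beyonce', 'start', 'become', 'popular']
```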
-{
-"cell_type": "code",
-"execution_count": 205,
-"metadata": {},
-"outputs": [],
-"source": [
-"ques_tfidf = vectorizer.transform([' '.join(i) for i in q_list])"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 206,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 206,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"ques_tfidf.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 207,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       ...,\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.],\n",
-"       [0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 207,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"X_tfidf.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 170,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 467600 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 170,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"X_tfidf"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 193,
-"metadata": {},
-"outputs": [],
-"source": [
-"arr = ques_tfidf.A\n",
-"brr = X_tfidf.A\n",
-"def vector_matrix(arr, brr):\n",
-"    return arr.dot(brr.T) / (np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 194,
-"metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/Users/zbh/anaconda3/envs/py37/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide\n",
-"  after removing the cwd from sys.path.\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"array([[nan, nan, nan, ..., nan, nan, nan]])"
-]
-},
-"execution_count": 194,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"vector_matrix(arr, brr)"
-]
-},
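
Editor's note: the `RuntimeWarning` and the all-`nan` row above follow directly from the zero query vector — its l2 norm is 0, so the cosine division is 0/0. A hedged variant of the same `vector_matrix` with a guarded denominator:

```python
# A minimal sketch (not part of the commit): guard the denominator so a
# zero query vector (or a zero document row) yields 0-ish similarity
# instead of nan.
import numpy as np

def vector_matrix(arr, brr, eps=1e-12):
    # cosine similarity between one row vector `arr` and each row of `brr`
    denom = np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))
    return arr.dot(brr.T) / np.maximum(denom, eps)
```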
-{
-"cell_type": "code",
-"execution_count": 192,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[0., 0., 0., ..., 0., 0., 0.]])"
-]
-},
-"execution_count": 192,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"arr.A"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 185,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<1x86821 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 0 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 185,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"ques_tfidf.dot(X_tfidf.T)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 183,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"<1x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
-"\twith 0 stored elements in Compressed Sparse Row format>"
-]
-},
-"execution_count": 183,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": []
-},
-{
-"cell_type": "code",
-"execution_count": 173,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[84, 84, 36, 54]])"
-]
-},
-"execution_count": 173,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"a = np.array([[1, 2, 1, 2, 3, 5, 6, 2]])\n",
-"a.dot(b.T)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 177,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array([[1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.],\n",
-"       [1., 2., 1., 2., 3., 5., 6., 2.]])"
-]
-},
-"execution_count": 177,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"np.ones((4,1)).dot(a)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer\n",
-"X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
-"X_tfidf = X  # the result is stored in matrix X"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
+"execution_count": 566,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1050,22 +757,51 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
+"    q_vector = vectorizer.transform([' '.join(qlist_preprocess([query])[0])])\n",
+"    # compute cosine similarity; tf-idf defaults to the l2 norm; matrix multiplication\n",
+"    sim = (X_tfidf * q_vector.T).toarray()\n",
+"\n",
+"    \n",
+"    # res = np.argsort(sim)\n",
+"    # query = 'when beyonce start become popular?'\n",
+"    # ans: array([[43410, 57532, 57531, ..., 39267,   145,     0]])\n",
+"    \n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
 "    # hint: use a priority queue to find the top results. Think about why this works. \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
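
Editor's note on the top-5 selection above: each score is pushed onto the queue and, once it holds more than five entries, the smallest is evicted, so only the five largest similarities can survive — O(n log 5) work instead of sorting all n scores. `queue.PriorityQueue` adds thread-safety locking the notebook does not need; a plain `heapq` sketch of the same idea:

```python
# A minimal sketch (not part of the commit): bounded top-k selection with
# heapq, which is what queue.PriorityQueue wraps minus the locking.
import heapq

def top_k(scores, k=5):
    heap = []                          # min-heap of (score, index)
    for idx, score in enumerate(scores):
        heapq.heappush(heap, (score, idx))
        if len(heap) > k:
            heapq.heappop(heap)        # evict the current smallest
    return sorted(heap, reverse=True)  # best first

print(top_k([0.1, 0.9, 0.3, 0.7, 0.2, 0.8], k=3))
# [(0.9, 1), (0.8, 5), (0.7, 3)]
```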
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 567,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[['in the late 1990s'], ['Dangerously in Love Tour'], ['Particularly since the 1950s, pro wrestling events have frequently been responsible for sellout crowds at large arenas'], ['her mother'], ['Germany, the Netherlands, Switzerland, Latvia, Estonia and Hungary']]\n",
+"[['in the later 19th century'], ['Tags'], ['Ashoka'], ['economic, social, and cultural'], ['water buffalo']]\n"
+]
+}
+],
 "source": [
 "# TODO: write a few test cases and print the results\n",
-"print (get_top_results_tfidf_noindex(\"\"))\n",
-"print (get_top_results_tfidf_noindex(\"\"))"
+"print(get_top_results_tfidf_noindex(\"when beyonce start become popular?\"))\n",
+"print(get_top_results_tfidf_noindex(\"where jordge come from\"))"
 ]
},
{
@@ -1085,19 +821,26 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 531,
 "metadata": {},
 "outputs": [],
 "source": [
 "# TODO create the inverted index\n",
-"inverted_idx = {}  # define a simple inverted index as a map structure; one pass over qlist is enough"
+"inverted_idx = {}  # define a simple inverted index as a map structure; one pass over qlist is enough\n",
+"\n",
+"for i, ques in enumerate(qlist):\n",
+"    for word in ques:\n",
+"        if word in inverted_idx.keys():\n",
+"            inverted_idx[word].add(i)\n",
+"        else:\n",
+"            inverted_idx[word] = set([i])"
 ]
},
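
Editor's note: the membership test `word in inverted_idx.keys()` works (dict views have O(1) lookup in Python 3), but `collections.defaultdict` expresses the same index without the branch. A sketch with a toy `qlist`, followed by the typical candidate lookup the index exists for:

```python
# A minimal sketch (not part of the commit): the same inverted index with
# defaultdict, plus a candidate lookup as the union of posting sets.
from collections import defaultdict

qlist = [["when", "beyonce", "popular"], ["where", "george", "come"]]  # toy stand-in

inverted_idx = defaultdict(set)
for i, ques in enumerate(qlist):
    for word in ques:
        inverted_idx[word].add(i)

candidates = set().union(*(inverted_idx[w] for w in ["beyonce", "come"]))
print(sorted(candidates))  # [0, 1]
```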
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"#### 3.3 Semantic similarity\n",
+"#### 3.3 related_words.txt\n",
 "There is still one problem to solve here: semantic similarity. Think of it this way: two words such as car and auto look different but are still similar in meaning. With only an inverted index we cannot account for this kind of similarity between words, so if the search sentence contains ``car`` we have no way to retrieve all the documents that contain auto. We therefore want to store this information as well. How do we solve this? It is actually not hard: build the similarity relations in advance. For a word like ``car``, find the words closest to it in meaning up front, say the top 10, and mark them as ``related words``. In the end we can create a ``map`` that stores the ``related words``, so that calling ``related_words['car']`` retrieves the TOP 10 words closest in meaning to ``car``. \n",
 "\n",
 "How is this ``related_words`` built? Here we again use the ``Glove`` vectors and compute the pairwise (cosine) similarities. Then, for each word, we store the top 10 words closest to it, and the final result is saved in ``related_words``. This computation needs to happen offline, because it is expensive: the complexity is ``O(V*V)``, where V is the total number of words. \n",
@@ -1113,7 +856,15 @@
 "source": [
 "# TODO load the semantically related words\n",
 "def get_related_words(file):\n",
+"    f = open('related_words.txt')\n",
+"    related_words = {}\n",
 "    \n",
+"    for line in f:\n",
+"        items = line.strip().split()\n",
+"        word = items[0]\n",
+"        sim_words_10 = items[1].split(',')\n",
+"        related_words[word] = sim_words_10\n",
+"    f.close()\n",
 "    return related_words\n",
 "\n",
 "related_words = get_related_words('related_words.txt')  # put the file directly in the root of the project folder; do not change this path."
@@ -1134,7 +885,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 574,
+"metadata": {},
+"outputs": [],
+"source": [
+"from nltk.corpus import wordnet as wn\n",
+"def get_related_idx(query):\n",
+"\n",
+"    related_set = set()\n",
+"    query_words = qlist_preprocess([query])[0]\n",
+"    for query_word in query_words:\n",
+"        # 1. read from the file\n",
+"        # related_words_list = related_words[query_word]\n",
+"        # 2. nltk\n",
+"        related_words_list = [x.name().split(\".\")[0] for x in wn.synsets(query_word)][:10]\n",
+"        for related_word in related_words_list:\n",
+"            try:\n",
+"                related_set = related_set | inverted_idx[related_word]\n",
+"            except KeyError as e:\n",
+"                # print(e)\n",
+"                continue \n",
+"    \n",
+"    related_idx = list(related_set)\n",
+"    return related_idx"
+]
+},
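
Editor's note on `get_related_idx` above: `synset.name()` is of the form `'car.n.01'`, so `split(".")[0]` recovers only the head lemma of each synset and often just re-yields the query word. Iterating over the synsets' lemmas also surfaces true synonyms such as auto/automobile for car — a hedged alternative:

```python
# A hedged alternative (not part of the commit): collect lemma names
# rather than synset head names, which is where the synonyms live.
from nltk.corpus import wordnet as wn

def related_terms(word, k=10):
    seen = []
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().lower()
            if name != word and name not in seen:
                seen.append(name)
    return seen[:k]

print(related_terms("car"))
# e.g. ['auto', 'automobile', 'machine', 'motorcar', 'railcar', ...]
```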
+{
+"cell_type": "code",
+"execution_count": 575,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1146,18 +924,41 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    q_vector = vectorizer.transform([' '.join(qlist_preprocess([query])[0])])\n",
+"    \n",
+"    related_index = get_related_idx(query)\n",
+"    X = X_tfidf[related_index]\n",
+"    \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"    # compute cosine similarity; tf-idf defaults to the l2 norm; matrix multiplication\n",
+"    sim = (X * q_vector.T).toarray()\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    \n",
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
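
Editor's note: here `cur` indexes rows of the *filtered* matrix `X = X_tfidf[related_index]`, but the final `alist[i]` treats `top_idxs` as positions in the full question list, so the returned answers can come from the wrong questions (the same pattern appears to recur in `get_top_results_w2v` and `get_top_results_bert` below). Mapping back through `related_index` — e.g. `pq.put((sim[cur][0], related_index[cur]))` — fixes it. A toy demonstration of the mismatch:

```python
# A minimal demonstration (not part of the commit) of the index mismatch.
related_index = [40, 12, 99]                 # toy candidate set from the inverted index
alist = ["answer to question %d" % i for i in range(100)]
filtered_row = 2                             # best row within the filtered matrix
print(alist[filtered_row])                   # wrong: answer of question 2
print(alist[related_index[filtered_row]])    # right: answer of question 99
```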
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 606,
 "metadata": {},
 "outputs": [],
 "source": [
+"def vector_matrix(arr, brr):\n",
+"    return arr.dot(brr.T) / (np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)))\n",
+"\n",
 "def get_top_results_w2v(query):\n",
 "    \"\"\"\n",
 "    Given the user's question query, return the most likely TOP 5 questions. This requires the following steps:\n",
@@ -1166,15 +967,46 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    ques = qlist_preprocess([query])[0]\n",
+"    \n",
+"    vec = np.zeros(200)\n",
+"    length = len(ques)  # sentence length\n",
+"    for word in ques:\n",
+"        try:\n",
+"            vec += word_dict[word]\n",
+"        except KeyError as e:\n",
+"            vec += word_dict['unk']\n",
+"    vec = vec / length\n",
+"    \n",
+"    q_vector = vec\n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"    related_index = get_related_idx(query)\n",
+"    X = X_w2v[related_index]\n",
+"    \n",
+"    \n",
+"    # compute cosine similarity; matrix multiplication\n",
+"    sim = vector_matrix(q_vector, X).reshape(-1, 1)\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    \n",
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
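
Editor's note: `word_dict` (200-dimensional vectors with an `'unk'` entry) and `X_w2v` are used here but never defined in this diff; presumably they come from the 200d GloVe file loaded earlier in the notebook. A hedged sketch of how they might be built — the file path and format are assumptions:

```python
# A hedged sketch (not part of the commit): load GloVe vectors into
# word_dict and stack one mean vector per question into X_w2v.
import numpy as np

def load_glove(path="glove.6B.200d.txt"):     # path is an assumption
    word_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word_dict[parts[0]] = np.asarray(parts[1:], dtype=np.float64)
    word_dict.setdefault('unk', np.zeros(200))  # fallback for OOV words
    return word_dict

def sentence_vec(words, word_dict):
    # mean of the word vectors, falling back to 'unk' for OOV words
    vecs = [word_dict.get(w, word_dict['unk']) for w in words]
    return np.mean(vecs, axis=0)

# X_w2v = np.vstack([sentence_vec(q, word_dict) for q in qlist])
```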
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 612,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1186,22 +1018,61 @@
 "    3. return the answers to the top-5 most similar questions\n",
 "    \"\"\"\n",
 "    \n",
-"    top_idxs = []  # top_idxs holds the indices (into qlist) of the most similar questions \n",
-"    # hint: use a priority queue to find the top results. Think about why this works. \n",
+"    sentence, arrs = bert_embedding([' '.join(qlist_preprocess([query])[0])])[0]\n",
+"    # print(sentence)\n",
+"\n",
+"    vecs = np.array(arrs)\n",
+"    vec = np.mean(vecs, axis=0)\n",
+"    q_vector = vec\n",
+"    \n",
+"    related_index = get_related_idx(query)\n",
+"    X = X_bert[related_index]\n",
+"    \n",
+"    # compute cosine similarity; matrix multiplication\n",
+"    sim = vector_matrix(q_vector, X).reshape(-1, 1)\n",
+"    \n",
+"    # use a priority queue to find the top 5\n",
+"    pq = PriorityQueue()\n",
+"    for cur in range(sim.shape[0]):\n",
+"        pq.put((sim[cur][0], cur))\n",
+"        if len(pq.queue) > 5:\n",
+"            pq.get()\n",
+"\n",
+"    pq_rank = sorted(pq.queue, reverse=True, key=lambda x:x[0])\n",
+"    # print(pq_rank)\n",
+"\n",
+"    top_idxs = [x[1] for x in pq_rank]  # top_idxs holds the indices (into qlist) of the most similar questions\n",
+"    # hint: use a priority queue to find the top results. Think about why this works. \n",
 "    \n",
-"    return alist[top_idxs]  # return the answers to the most similar questions as the TOP 5 answers"
+"\n",
+"    return [alist[i] for i in top_idxs]  # return the answers to the most similar questions as the TOP 5 answers "
 ]
},
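
Editor's note: `bert_embedding` and `X_bert` are likewise assumed from earlier in the notebook; the call pattern — a list of sentences in, `(tokens, per-token vectors)` pairs out — matches the mxnet `bert-embedding` package. A hedged sketch of precomputing `X_bert` with the same mean pooling used for the query:

```python
# A hedged sketch (not part of the commit): X_bert is not defined in this
# diff; this assumes the mxnet `bert-embedding` package, whose embedder
# returns (tokens, per-token vectors) pairs for a list of sentences.
import numpy as np
from bert_embedding import BertEmbedding

bert_embedding = BertEmbedding()

def bert_sentence_vec(sentence):
    tokens, arrs = bert_embedding([sentence])[0]
    return np.mean(np.array(arrs), axis=0)    # mean-pool the token vectors

# X_bert = np.vstack([bert_sentence_vec(' '.join(q)) for q in qlist])
```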
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 615,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[['in the late 1990s'], ['Fredericksburg'], ['02:28:01 PM China Standard Time'], ['132'], ['Link and Toon Link']]\n",
+"[['in the late 1990s'], ['late 1990s'], ['02:28:01 PM China Standard Time'], ['Kanye West'], ['Shakira']]\n",
+"[['in the late 1990s'], ['late 1990s'], ['Diana Ross.'], ['I Was Here'], ['Romantic era']]\n",
+"[['At Last'], ['J. S. Bach, Mozart and Schubert'], ['piano'], ['Polish'], ['Polish']]\n",
+"[['Beyoncé Cosmetology Center'], ['five.'], ['Madonna and Celine Dion'], ['April 15'], ['Baz Luhrmann']]\n",
+"[['Madonna and Celine Dion'], ['2013 Met Gala'], ['118 million'], ['Baz Luhrmann'], ['eight']]\n"
+]
+}
+],
 "source": [
 "# TODO: write a few test cases and print the results\n",
 "\n",
-"test_query1 = \"\"\n",
-"test_query2 = \"\"\n",
+"# query = \"when beyonce start become popular?\"\n",
+"# qlist_preprocess([query])[0]\n",
+"test_query1 = \"when beyonce start become popular?\"\n",
+"test_query2 = \"where jordge come from\"\n",
 "\n",
 "print (get_top_results_tfidf(test_query1))\n",
 "print (get_top_results_w2v(test_query1))\n",
@@ -1245,7 +1116,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 620,
+"metadata": {},
+"outputs": [],
+"source": [
+"# import nltk\n",
+"# nltk.download('reuters')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 621,
 "metadata": {},
 "outputs": [],
 "source": [
...