Commit 84cd30f8 by 20210509042

第二次项目

parent 11501eca
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 智能问答系统(主文件)\n",
"\n",
"在这里我们来搭建一个轻量级智能问答系统,所需要的模块,包括:\n",
"- 文本预处理:这部分已经帮大家写好,只需要看看代码就可以了。\n",
"- 搭建意图识别分类器:这部分也给大家写好了,使用fastText来做的意图识别器\n",
"- 倒排表:这部分大家需要自己去创建,同时也需要考虑相似的单词(课程视频中讲过)\n",
"- 排序:基于倒排表返回的结果,我们再根据余弦相似度来计算query跟候选问题之间的相似度,最后返回相似度最高的问题的答案。这里,我们将使用BERT来表示句子的向量。 "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"## %env KMP_DUPLICATE_LIB_OK=TRUE "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import pickle\n",
"import emoji\n",
"import re\n",
"import jieba\n",
"import torch\n",
"import fasttext\n",
"from sys import platform\n",
"from torch.utils.data import DataLoader\n",
"from transformers import BertTokenizer\n",
"# from bert_code.model import BertModelTest\n",
"# from bert_code.utils import test\n",
"# from bert_code.data import DataPrecessForSentence"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 读取已经处理好的数据: 导入在preprocessor.ipynb中生成的data/question_answer_pares.pkl文件,并将其保存在变量QApares中\n",
"with open('data/question_answer_pares.pkl','rb') as f:\n",
" QApares = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 导入在Retrieve.ipynb中生成的data/retrieve/invertedList.pkl倒排表文件,并将其保存在变量invertedList中\n",
"with open('data/retrieve/invertedList.pkl','rb') as f:\n",
" invertedList = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 这一格的内容是从preprocessor.ipynb中粘贴而来,包含了数据预处理的几个关键函数,这部分用来处理input query string\n",
"import emoji\n",
"import re\n",
"import jieba\n",
"def clean(content):\n",
"    '''Normalise a raw query string.\n",
"\n",
"    Emoji are replaced by their textual names via emoji.demojize, then every span\n",
"    matched by the greedy pattern <.*> (from the first < to the last >) is removed.\n",
"    '''\n",
"    without_emoji = emoji.demojize(content)\n",
"    return re.sub('<.*>', '', without_emoji)\n",
"\n",
"\n",
"def question_cut(content):\n",
"    '''Segment a sentence into a list of words with jieba.'''\n",
"    return list(jieba.cut(content))\n",
"\n",
"\n",
"def strip(wordList):\n",
"    '''Trim surrounding whitespace from each token and drop tokens that end up empty.'''\n",
"    trimmed = (word.strip() for word in wordList)\n",
"    return [word for word in trimmed if word != '']\n",
"\n",
"\n",
"# The stop-word file is read as one entry per line. The entries are kept in a set so\n",
"# that each membership test is O(1); the list used before was scanned linearly for\n",
"# every token of every query.\n",
"with open(\"data/stopWord.json\",\"r\", encoding=\"utf-8\") as f:\n",
"    stopWords = set(f.read().split(\"\\n\"))\n",
"\n",
"\n",
"def rm_stop_word(wordList):\n",
"    '''Return the tokens of wordList that are not stop words, preserving their order.'''\n",
"    return [word for word in wordList if word not in stopWords]\n",
"\n",
"\n",
"def text_processing(sentence):\n",
"    '''Run the whole preprocessing pipeline on a raw query and return its token list.\n",
"\n",
"    Steps: clean -> jieba segmentation -> whitespace stripping -> stop-word removal.\n",
"    '''\n",
"    sentence = clean(sentence)\n",
"    sentence = question_cut(sentence)\n",
"    sentence = strip(sentence)\n",
"    return rm_stop_word(sentence)\n",
"\n",
"# Copied from Retrieve.ipynb: produces the indices of candidate questions for a query.\n",
"def get_retrieve_result(sentence):\n",
"    \"\"\"\n",
"    Return the set of candidate question ids for a raw query sentence.\n",
"\n",
"    The sentence is preprocessed with text_processing, and the entries of\n",
"    invertedList for every token found in it are unioned together.\n",
"    \"\"\"\n",
"    tokens = text_processing(sentence)\n",
"    candidate = set()\n",
"    for token in tokens:\n",
"        if token not in invertedList:\n",
"            continue\n",
"        candidate |= invertedList[token]\n",
"    return candidate"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# 加载训练好的fasttext模型用于意图识别\n",
"intention = fasttext.load_model('model/fasttext.ftz')\n",
"\n",
"def get_intention_result(sentence):\n",
"    '''\n",
"    Classify the intent of a raw query with the loaded fastText model.\n",
"\n",
"    Args:\n",
"        sentence: the input sentence.\n",
"    Returns:\n",
"        The top fastText label: __label__0 (chit-chat) or __label__1 (task-oriented).\n",
"    '''\n",
"    tokens = text_processing(sentence)\n",
"    print(tokens)\n",
"    # fastText expects a single space-separated string of tokens.\n",
"    top_labels = intention.predict(' '.join(tokens))[0]\n",
"    return top_labels[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#导入与bert embedding相关的包,关于mxnet包下载的注意事项参考实验手册\n",
"from transformers import BertModel, BertTokenizer, BertConfig\n",
"model = BertModel.from_pretrained('../bert')\n",
"tokenizer = BertTokenizer.from_pretrained('../bert')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_sentence_vector(sentence):\n",
"    '''\n",
"    Encode a sentence as the mean of the last-layer BERT token embeddings.\n",
"\n",
"    Args:\n",
"        sentence: raw text; the tokenizer adds the special tokens.\n",
"    Returns:\n",
"        A torch tensor holding the last hidden states averaged over all tokens\n",
"        (special tokens included). It carries no autograd history.\n",
"    '''\n",
"    encoded = tokenizer.encode_plus(sentence, add_special_tokens=True, return_attention_mask=True)\n",
"    # unsqueeze(0) adds the batch dimension the model expects.\n",
"    input_ids = torch.tensor(encoded['input_ids']).unsqueeze(0)\n",
"    token_type_ids = torch.tensor(encoded['token_type_ids']).unsqueeze(0)\n",
"    attention_mask = torch.tensor(encoded['attention_mask']).unsqueeze(0)\n",
"    # Inference only: do not record an autograd graph. The previous version built one\n",
"    # for every sentence and then discarded it with detach().\n",
"    with torch.no_grad():\n",
"        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
"    last_hidden_states = outputs[0].squeeze(0)\n",
"    return last_hidden_states.mean(dim=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100000it [2:12:26, 12.58it/s]\n"
]
}
],
"source": [
"## Precompute the BERT embedding of every candidate question.\n",
"## QApares is already loaded by an earlier cell of this notebook, so the pickle is not\n",
"## read a second time here.\n",
"## sentence_query[i] is [embedding, question text] for the i-th row of QApares.\n",
"## NOTE: this loop is slow (the recorded run took over two hours for 100000 questions).\n",
"sentence_query = []\n",
"# Passing the series directly (plus total) lets tqdm show percentage and ETA; wrapping\n",
"# enumerate() hid the length from it.\n",
"for question in tqdm(QApares.question, total=len(QApares)):\n",
"    sentence_query.append([get_sentence_vector(question), question])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def get_best_answer(sentence, candidate):\n",
"    \"\"\"\n",
"    Pick the answer whose question is most similar to the query.\n",
"\n",
"    Args:\n",
"        sentence: the user query string.\n",
"        candidate: iterable of candidate question indices returned by the inverted\n",
"            index; each index addresses both sentence_query and QApares.\n",
"    Returns:\n",
"        The answer (string) of the candidate whose precomputed BERT embedding has the\n",
"        highest cosine similarity with the query embedding, or the fallback reply\n",
"        '我不明白你在说什么' when candidate is empty.\n",
"    \"\"\"\n",
"    candidate_ids = list(candidate)\n",
"    if not candidate_ids:\n",
"        # The previous version silently returned the answer of question 0 here.\n",
"        return '我不明白你在说什么'\n",
"\n",
"    query_vec = get_sentence_vector(sentence)\n",
"    candidate_vecs = torch.stack([sentence_query[idx][0] for idx in candidate_ids])\n",
"    # One vectorised cosine-similarity call instead of recomputing both norms inside a\n",
"    # Python loop for every candidate.\n",
"    similarities = torch.nn.functional.cosine_similarity(candidate_vecs, query_vec.unsqueeze(0), dim=1)\n",
"    # No upper cap on the score: the previous version skipped candidates scoring\n",
"    # >= 0.998, which threw away exact or near-exact matches of the query.\n",
"    best_idx = candidate_ids[int(torch.argmax(similarities))]\n",
"    return QApares.answer.iloc[best_idx]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def QA(sentence):\n",
"    '''\n",
"    Answer a customer-service query: take a sentence, return a reply string.\n",
"\n",
"    Chit-chat queries (intent __label__0) get a fixed placeholder reply, queries with\n",
"    no retrieval candidates get a fixed fallback reply, and everything else gets the\n",
"    answer chosen by get_best_answer.\n",
"    '''\n",
"    is_chitchat = get_intention_result(sentence) == '__label__0'\n",
"    if is_chitchat:\n",
"        return '闲聊机器人'\n",
"\n",
"    # Candidate questions come from the inverted index.\n",
"    candidate = get_retrieve_result(sentence)\n",
"    if not len(candidate):\n",
"        return '我不明白你在说什么'\n",
"    return get_best_answer(sentence, candidate)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['发', '快递']\n"
]
},
{
"data": {
"text/plain": [
"'快递随机哦具体按仓库发货为准的'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('发什么快递')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['退款']\n"
]
},
{
"data": {
"text/plain": [
"'直接申请下退款哦'"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('怎么退款')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['商品', '优惠券']\n"
]
},
{
"data": {
"text/plain": [
"'没有的哦满足条件系统自动减的'"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('这个商品有优惠券吗')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['一二三四五', '六七']\n"
]
},
{
"data": {
"text/plain": [
"'我不明白你在说什么'"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('一二三四五六七')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 搭建倒排表\n",
"倒排表的作用是让搜索更加快速,是搜索引擎中常用的技术。根据课程中所讲的方法,你需要完成这部分的代码。 "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import pickle\n",
"from gensim.models import KeyedVectors # 词向量用来比较俩俩之间相似度"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# 读取数据: 导入在preprocessor.ipynb中生成的data/question_answer_pares.pkl文件,并将其保存在变量QApares中\n",
"with open('data/question_answer_pares.pkl','rb') as f:\n",
" QApares = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>answer</th>\n",
" <th>question_after_preprocessing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>买二份有没有少点呀</td>\n",
" <td>亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解</td>\n",
" <td>[买, 二份, 有没有, 少点]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>那就等你们处理喽</td>\n",
" <td>好的亲退了</td>\n",
" <td>[处理]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>那我不喜欢</td>\n",
" <td>颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦</td>\n",
" <td>[喜欢]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>不是免运费</td>\n",
" <td>本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮</td>\n",
" <td>[免, 运费]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>好吃吗</td>\n",
" <td>好吃的</td>\n",
" <td>[好吃]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question answer question_after_preprocessing\n",
"0 买二份有没有少点呀 亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解 [买, 二份, 有没有, 少点]\n",
"1 那就等你们处理喽 好的亲退了 [处理]\n",
"2 那我不喜欢 颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦 [喜欢]\n",
"3 不是免运费 本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮 [免, 运费]\n",
"4 好吃吗 好吃的 [好吃]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"QApares.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```TODO1``` 构造一个倒排表,不需要考虑单词的相似度"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Build the inverted index: word -> list of indices of the questions that contain it\n",
"# (see the lab manual for background on inverted indexes).\n",
"# A dict is used because it is hash-based, which keeps lookups fast.\n",
"# Word similarity is deliberately not considered at this stage.\n",
"from collections import defaultdict\n",
"inverted_list = defaultdict(list)\n",
"for index, sentence in enumerate(QApares.question_after_preprocessing):\n",
"    for word in sentence:\n",
"        inverted_list[word].append(index)\n",
"# print(inverted_list)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#d ata/retrieve/sgns.zhihu.word是从https://github.com/Embedding/Chinese-Word-Vectors下载到的预训练好的中文词向量文件\n",
"#使 用KeyedVectors.load_word2vec_format()函数加载预训练好的词向量文件\n",
"model = KeyedVectors.load_word2vec_format('data/retrieve/sgns.zhihu.word')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_similar_by_word(word,topk):\n",
"    '''\n",
"    Return the topk words most similar to word according to the global model.\n",
"\n",
"    Returns:\n",
"        A list containing only the words (similarity scores are dropped),\n",
"        e.g. [word1, word2, word3, word4, word5].\n",
"    '''\n",
"    neighbours = model.similar_by_word(word,topk)\n",
"    return [neighbour[0] for neighbour in neighbours]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```TODO2``` 构造一个新的倒排表,考虑单词之间的语义相似度"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache C:\\Users\\webberg\\AppData\\Local\\Temp\\jieba.cache\n",
"Loading model cost 0.900 seconds.\n",
"Prefix dict has been built successfully.\n"
]
}
],
"source": [
"from gensim.models import word2vec\n",
"import jieba\n",
"\n",
"# Train word2vec on the jieba-segmented raw questions.\n",
"sent = [jieba.lcut(s) for s in QApares.question]\n",
"\n",
"# The trained model gets its own name so that the pretrained vectors bound to the\n",
"# global model (which get_similar_by_word relies on) are no longer overwritten.\n",
"w2v_model = word2vec.Word2Vec(sent, min_count=10, window=2)\n",
"word_vec = w2v_model.wv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('购买', 0.7788702845573425),\n",
" ('买过', 0.7300090789794922),\n",
" ('拍', 0.7121500968933105),\n",
" ('卖', 0.6998841762542725),\n",
" ('买点', 0.6799497008323669),\n",
" ('试试', 0.6017423272132874),\n",
" ('订', 0.5960362553596497),\n",
" ('尝尝', 0.5920028686523438),\n",
" ('先买', 0.5833995342254639),\n",
" ('少', 0.5675268769264221)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vec.most_similar('买', topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████| 3832/3832 [00:01<00:00, 3756.96it/s]\n"
]
}
],
"source": [
"# Build the similarity-aware inverted index inverted_list_new.\n",
"# For every word, the new entry is the union of the old entry of the word itself and the\n",
"# old entries of its TOPK most similar words (according to word_vec), i.e. the indices of\n",
"# all questions containing the word or one of those similar words.\n",
"import copy\n",
"\n",
"TOPK = 5  # number of similar words merged into each entry\n",
"\n",
"# Values are sets, so the default factory is set as well (it used to be list).\n",
"inverted_list_new = defaultdict(set)\n",
"for word in tqdm(inverted_list):\n",
"    # Always keep the word's own postings. Previously a word missing from the word2vec\n",
"    # vocabulary (e.g. rarer than min_count) was dropped from the new index altogether,\n",
"    # so questions containing it could no longer be retrieved.\n",
"    indexes = set(inverted_list[word])\n",
"    # Membership is tested on the vectors object itself rather than on its .vocab\n",
"    # attribute, which only exists in some gensim versions.\n",
"    if word in word_vec:\n",
"        # Merge every neighbour that was requested. The previous loop asked for 5 but\n",
"        # used only the first 4, and raised IndexError when fewer than 4 came back.\n",
"        for similar_word, _score in word_vec.most_similar(word, topn=TOPK):\n",
"            if similar_word in inverted_list:\n",
"                indexes.update(inverted_list[similar_word])\n",
"    inverted_list_new[word] = indexes\n",
"# print(inverted_list_new) "
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# 将新的倒排表保存在文件data/retrieve/invertedList.pkl中\n",
"with open('data/retrieve/invertedList.pkl','wb') as f:\n",
" pickle.dump(inverted_list_new,f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"以下为测试,完成上述过程之后,可以运行以下的代码来测试准确性。"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"#这一格的内容是从preprocessor.ipynb中粘贴而来,包含了数据预处理的几个关键函数\n",
"import emoji\n",
"import re\n",
"import jieba\n",
"def clean(content):\n",
"    '''Normalise a raw query string.\n",
"\n",
"    Emoji are replaced by their textual names via emoji.demojize, then every span\n",
"    matched by the greedy pattern <.*> (from the first < to the last >) is removed.\n",
"    '''\n",
"    without_emoji = emoji.demojize(content)\n",
"    return re.sub('<.*>', '', without_emoji)\n",
"\n",
"\n",
"# Word segmentation. preprocessor.ipynb skipped this step because its data was already\n",
"# segmented, but it is required for a new query sentence.\n",
"def question_cut(content):\n",
"    '''Segment a sentence into a list of words with jieba.'''\n",
"    return list(jieba.cut(content))\n",
"\n",
"\n",
"def strip(wordList):\n",
"    '''Trim surrounding whitespace from each token and drop tokens that end up empty.'''\n",
"    trimmed = (word.strip() for word in wordList)\n",
"    return [word for word in trimmed if word != '']\n",
"\n",
"\n",
"# The stop-word file is read as one entry per line. The entries are kept in a set so\n",
"# that each membership test is O(1); the list used before was scanned linearly for\n",
"# every token of every query.\n",
"with open(\"data/stopWord.json\",\"r\", encoding=\"utf-8\") as f:\n",
"    stopWords = set(f.read().split(\"\\n\"))\n",
"\n",
"\n",
"def rm_stop_word(wordList):\n",
"    '''Return the tokens of wordList that are not stop words, preserving their order.'''\n",
"    return [word for word in wordList if word not in stopWords]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# 从data/retrieve/invertedList.pkl加载倒排表并将其保存在变量invertedList中\n",
"with open('data/retrieve/invertedList.pkl','rb') as f:\n",
" invertedList = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def get_retrieve_result(sentence):\n",
"    '''\n",
"    Return candidate question indices for a raw query using the inverted index.\n",
"\n",
"    The query is cleaned, segmented, stripped and filtered for stop words; the entries\n",
"    of invertedList for every remaining token are then unioned. A question is therefore\n",
"    a candidate when it contains one of the query words or a word with a similar meaning.\n",
"    '''\n",
"    tokens = rm_stop_word(strip(question_cut(clean(sentence))))\n",
"    candidate = set()\n",
"    for token in tokens:\n",
"        if token in invertedList:\n",
"            candidate |= invertedList[token]\n",
"    return candidate"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['发货'] 发货\n"
]
},
{
"data": {
"text/plain": [
"{81920,\n",
" 16386,\n",
" 49154,\n",
" 65541,\n",
" 5,\n",
" 32775,\n",
" 32776,\n",
" 81927,\n",
" 81930,\n",
" 16398,\n",
" 81935,\n",
" 17,\n",
" 18,\n",
" 65554,\n",
" 16401,\n",
" 81947,\n",
" 29,\n",
" 65566,\n",
" 49182,\n",
" 32800,\n",
" 81953,\n",
" 32803,\n",
" 98339,\n",
" 81959,\n",
" 32810,\n",
" 98346,\n",
" 49194,\n",
" 32818,\n",
" 55,\n",
" 49209,\n",
" 98366,\n",
" 64,\n",
" 49219,\n",
" 65604,\n",
" 81988,\n",
" 16458,\n",
" 65611,\n",
" 81995,\n",
" 32845,\n",
" 81998,\n",
" 16463,\n",
" 16464,\n",
" 49233,\n",
" 32850,\n",
" 98387,\n",
" 49234,\n",
" 16475,\n",
" 49245,\n",
" 98398,\n",
" 65631,\n",
" 82015,\n",
" 49247,\n",
" 102,\n",
" 65639,\n",
" 65640,\n",
" 49258,\n",
" 49259,\n",
" 65646,\n",
" 98415,\n",
" 98416,\n",
" 49263,\n",
" 49267,\n",
" 32884,\n",
" 16500,\n",
" 118,\n",
" 82035,\n",
" 16503,\n",
" 122,\n",
" 65659,\n",
" 49275,\n",
" 125,\n",
" 32894,\n",
" 133,\n",
" 65669,\n",
" 65670,\n",
" 65671,\n",
" 49293,\n",
" 142,\n",
" 65679,\n",
" 32912,\n",
" 98451,\n",
" 150,\n",
" 151,\n",
" 32929,\n",
" 49318,\n",
" 49320,\n",
" 65708,\n",
" 82092,\n",
" 82093,\n",
" 16556,\n",
" 16560,\n",
" 98484,\n",
" 98489,\n",
" 187,\n",
" 49340,\n",
" 32957,\n",
" 65727,\n",
" 200,\n",
" 16588,\n",
" 32973,\n",
" 65742,\n",
" 98518,\n",
" 16598,\n",
" 49366,\n",
" 65755,\n",
" 220,\n",
" 223,\n",
" 65764,\n",
" 82149,\n",
" 82153,\n",
" 82155,\n",
" 16621,\n",
" 65775,\n",
" 49396,\n",
" 65783,\n",
" 49400,\n",
" 33017,\n",
" 65786,\n",
" 65790,\n",
" 254,\n",
" 82176,\n",
" 261,\n",
" 65798,\n",
" 65810,\n",
" 275,\n",
" 98586,\n",
" 49442,\n",
" 49445,\n",
" 65833,\n",
" 33068,\n",
" 82220,\n",
" 65838,\n",
" 82221,\n",
" 82224,\n",
" 33073,\n",
" 65843,\n",
" 65844,\n",
" 16691,\n",
" 310,\n",
" 16696,\n",
" 82234,\n",
" 65852,\n",
" 49468,\n",
" 318,\n",
" 49471,\n",
" 49472,\n",
" 16706,\n",
" 49475,\n",
" 65862,\n",
" 65863,\n",
" 16712,\n",
" 82248,\n",
" 49483,\n",
" 49493,\n",
" 16726,\n",
" 344,\n",
" 16730,\n",
" 65883,\n",
" 82268,\n",
" 65885,\n",
" 350,\n",
" 33120,\n",
" 16745,\n",
" 49514,\n",
" 364,\n",
" 98668,\n",
" 65903,\n",
" 33140,\n",
" 98678,\n",
" 65913,\n",
" 33148,\n",
" 33149,\n",
" 49535,\n",
" 33158,\n",
" 49543,\n",
" 49544,\n",
" 82313,\n",
" 33163,\n",
" 16783,\n",
" 33168,\n",
" 401,\n",
" 98709,\n",
" 49558,\n",
" 98715,\n",
" 16796,\n",
" 49565,\n",
" 82333,\n",
" 82334,\n",
" 33189,\n",
" 33191,\n",
" 65960,\n",
" 33193,\n",
" 49579,\n",
" 16812,\n",
" 98740,\n",
" 16822,\n",
" 82359,\n",
" 33210,\n",
" 16827,\n",
" 82365,\n",
" 98751,\n",
" 16832,\n",
" 98754,\n",
" 33220,\n",
" 453,\n",
" 49604,\n",
" 98763,\n",
" 461,\n",
" 33233,\n",
" 469,\n",
" 49623,\n",
" 16856,\n",
" 33244,\n",
" 49629,\n",
" 16867,\n",
" 16869,\n",
" 98794,\n",
" 82410,\n",
" 82412,\n",
" 495,\n",
" 98803,\n",
" 49651,\n",
" 49656,\n",
" 33273,\n",
" 16889,\n",
" 49658,\n",
" 33276,\n",
" 82426,\n",
" 66046,\n",
" 66053,\n",
" 49669,\n",
" 82437,\n",
" 33290,\n",
" 49674,\n",
" 16911,\n",
" 33296,\n",
" 66065,\n",
" 530,\n",
" 49682,\n",
" 33300,\n",
" 66069,\n",
" 16918,\n",
" 66077,\n",
" 542,\n",
" 543,\n",
" 82463,\n",
" 66081,\n",
" 98850,\n",
" 16932,\n",
" 66087,\n",
" 66091,\n",
" 556,\n",
" 66093,\n",
" 66094,\n",
" 98860,\n",
" 82475,\n",
" 66097,\n",
" 49709,\n",
" 16942,\n",
" 49715,\n",
" 49716,\n",
" 98877,\n",
" 66111,\n",
" 33346,\n",
" 579,\n",
" 33347,\n",
" 82499,\n",
" 49735,\n",
" 49739,\n",
" 588,\n",
" 16983,\n",
" 98907,\n",
" 16991,\n",
" 82528,\n",
" 49761,\n",
" 49762,\n",
" 66152,\n",
" 66153,\n",
" 49769,\n",
" 33387,\n",
" 17003,\n",
" 49777,\n",
" 98932,\n",
" 17012,\n",
" 66166,\n",
" 17020,\n",
" 17021,\n",
" 639,\n",
" 640,\n",
" 49793,\n",
" 642,\n",
" 17026,\n",
" 98948,\n",
" 17027,\n",
" 49796,\n",
" 82563,\n",
" 17030,\n",
" 66185,\n",
" 82566,\n",
" 651,\n",
" 82565,\n",
" 33422,\n",
" 82576,\n",
" 98962,\n",
" 33427,\n",
" 17043,\n",
" 49812,\n",
" 82584,\n",
" 98970,\n",
" 17050,\n",
" 17052,\n",
" 66207,\n",
" 82592,\n",
" 673,\n",
" 33441,\n",
" 17057,\n",
" 17061,\n",
" 33446,\n",
" 49831,\n",
" 66221,\n",
" 82605,\n",
" 687,\n",
" 66224,\n",
" 49841,\n",
" 33459,\n",
" 17076,\n",
" 694,\n",
" 33464,\n",
" 17080,\n",
" 33466,\n",
" 99002,\n",
" 33468,\n",
" 99005,\n",
" 66238,\n",
" 99007,\n",
" 82628,\n",
" 710,\n",
" 82630,\n",
" 82632,\n",
" 99017,\n",
" 718,\n",
" 82639,\n",
" 49872,\n",
" 82645,\n",
" 82652,\n",
" 49889,\n",
" 82658,\n",
" 99044,\n",
" 743,\n",
" 17127,\n",
" 33513,\n",
" 49897,\n",
" 17132,\n",
" 749,\n",
" 49901,\n",
" 49903,\n",
" 82674,\n",
" 33523,\n",
" 17142,\n",
" 759,\n",
" 17144,\n",
" 66299,\n",
" 99068,\n",
" 33535,\n",
" 17152,\n",
" 769,\n",
" 82689,\n",
" 33539,\n",
" 66307,\n",
" 66309,\n",
" 82694,\n",
" 17162,\n",
" 99084,\n",
" 49932,\n",
" 17167,\n",
" 49935,\n",
" 66321,\n",
" 99090,\n",
" 66323,\n",
" 82713,\n",
" 99098,\n",
" 66334,\n",
" 82719,\n",
" 800,\n",
" 66337,\n",
" 49951,\n",
" 17185,\n",
" 49954,\n",
" 17190,\n",
" 33576,\n",
" 49962,\n",
" 17196,\n",
" 99120,\n",
" 99122,\n",
" 49972,\n",
" 49974,\n",
" 17207,\n",
" 33592,\n",
" 33593,\n",
" 33600,\n",
" 33604,\n",
" 99140,\n",
" 82757,\n",
" 841,\n",
" 49994,\n",
" 845,\n",
" 99149,\n",
" 17235,\n",
" 852,\n",
" 66389,\n",
" 854,\n",
" 82771,\n",
" 33624,\n",
" 33625,\n",
" 858,\n",
" 66395,\n",
" 99163,\n",
" 33629,\n",
" 82775,\n",
" 17245,\n",
" 82782,\n",
" 17249,\n",
" 17251,\n",
" 82787,\n",
" 99173,\n",
" 33638,\n",
" 17255,\n",
" 874,\n",
" 66411,\n",
" 82794,\n",
" 17259,\n",
" 878,\n",
" 33647,\n",
" 66415,\n",
" 66417,\n",
" 33650,\n",
" 66418,\n",
" 99182,\n",
" 99183,\n",
" 33654,\n",
" 82798,\n",
" 891,\n",
" 17275,\n",
" 82812,\n",
" 33662,\n",
" 17283,\n",
" 66436,\n",
" 901,\n",
" 99208,\n",
" 66441,\n",
" 33674,\n",
" 33675,\n",
" 17291,\n",
" 82829,\n",
" 33682,\n",
" 916,\n",
" 66452,\n",
" 17304,\n",
" 33691,\n",
" 33692,\n",
" 66461,\n",
" 66463,\n",
" 82847,\n",
" 33697,\n",
" 99234,\n",
" 931,\n",
" 82848,\n",
" 17317,\n",
" 938,\n",
" 17324,\n",
" 82863,\n",
" 944,\n",
" 66480,\n",
" 33716,\n",
" 99254,\n",
" 82878,\n",
" 959,\n",
" 33728,\n",
" 99266,\n",
" 99267,\n",
" 33736,\n",
" 33741,\n",
" 17359,\n",
" 978,\n",
" 17364,\n",
" 33750,\n",
" 82905,\n",
" 992,\n",
" 17378,\n",
" 33765,\n",
" 99301,\n",
" 1000,\n",
" 50152,\n",
" 1005,\n",
" 99309,\n",
" 50158,\n",
" 82928,\n",
" 33780,\n",
" 99318,\n",
" 1017,\n",
" 33787,\n",
" 66557,\n",
" 1024,\n",
" 17408,\n",
" 17409,\n",
" 1027,\n",
" 50179,\n",
" 82949,\n",
" 82950,\n",
" 1034,\n",
" 50187,\n",
" 1036,\n",
" 50191,\n",
" 66577,\n",
" 99345,\n",
" 66580,\n",
" 99351,\n",
" 82969,\n",
" 82970,\n",
" 17437,\n",
" 99360,\n",
" 1057,\n",
" 33825,\n",
" 99363,\n",
" 82977,\n",
" 82979,\n",
" 33831,\n",
" 66600,\n",
" 17447,\n",
" 33837,\n",
" 99373,\n",
" 66607,\n",
" 99376,\n",
" 33841,\n",
" 50221,\n",
" 82993,\n",
" 99380,\n",
" 66613,\n",
" 99381,\n",
" 82996,\n",
" 82998,\n",
" 33849,\n",
" 83003,\n",
" 66622,\n",
" 1087,\n",
" 1088,\n",
" 99391,\n",
" 83007,\n",
" 17473,\n",
" 83011,\n",
" 50245,\n",
" 33862,\n",
" 1095,\n",
" 1098,\n",
" 66637,\n",
" 1102,\n",
" 99405,\n",
" 50256,\n",
" 99410,\n",
" 99413,\n",
" 66646,\n",
" 99419,\n",
" 66652,\n",
" 83035,\n",
" 1118,\n",
" 1119,\n",
" 83037,\n",
" 83040,\n",
" 99431,\n",
" 1129,\n",
" 17516,\n",
" 83052,\n",
" 66670,\n",
" 66673,\n",
" 50290,\n",
" 17524,\n",
" 33912,\n",
" 83065,\n",
" 33914,\n",
" 66685,\n",
" 50301,\n",
" 83071,\n",
" 17537,\n",
" 50309,\n",
" 33927,\n",
" 99467,\n",
" 99470,\n",
" 50318,\n",
" 99472,\n",
" 66705,\n",
" 83088,\n",
" 66708,\n",
" 66709,\n",
" 50327,\n",
" 99484,\n",
" 99485,\n",
" 83101,\n",
" 99487,\n",
" 66721,\n",
" 33954,\n",
" 1187,\n",
" 83105,\n",
" 50337,\n",
" 50341,\n",
" 99495,\n",
" 66728,\n",
" 50345,\n",
" 50346,\n",
" 66732,\n",
" 83116,\n",
" 66734,\n",
" 50351,\n",
" 17585,\n",
" 50356,\n",
" 99510,\n",
" 17591,\n",
" 50363,\n",
" 99517,\n",
" 66751,\n",
" 17599,\n",
" 1217,\n",
" 1223,\n",
" 33992,\n",
" 99528,\n",
" 17607,\n",
" 50376,\n",
" 17616,\n",
" 99537,\n",
" 66770,\n",
" 99539,\n",
" 1237,\n",
" 66774,\n",
" 1242,\n",
" 66779,\n",
" 66780,\n",
" 17628,\n",
" 1247,\n",
" 17631,\n",
" 83168,\n",
" 50400,\n",
" 99555,\n",
" 50402,\n",
" 83172,\n",
" 50405,\n",
" 34024,\n",
" 50409,\n",
" 50411,\n",
" 1265,\n",
" 34034,\n",
" 83186,\n",
" 17651,\n",
" 50419,\n",
" 1270,\n",
" 34042,\n",
" 1277,\n",
" 66814,\n",
" 99583,\n",
" 66816,\n",
" 17671,\n",
" 17677,\n",
" 50445,\n",
" 34064,\n",
" 1297,\n",
" 66834,\n",
" 50448,\n",
" 50449,\n",
" 50456,\n",
" 66844,\n",
" 83229,\n",
" 83231,\n",
" 1312,\n",
" 66849,\n",
" 50464,\n",
" 17696,\n",
" 17698,\n",
" 17699,\n",
" 66854,\n",
" 83236,\n",
" 17702,\n",
" 83242,\n",
" 1326,\n",
" 17713,\n",
" 66866,\n",
" 34099,\n",
" 99637,\n",
" 1334,\n",
" 66871,\n",
" 83258,\n",
" 66877,\n",
" 1342,\n",
" 50493,\n",
" 1345,\n",
" 99651,\n",
" 66887,\n",
" 1352,\n",
" 50503,\n",
" 1354,\n",
" 17737,\n",
" 50508,\n",
" 66895,\n",
" 99671,\n",
" 83290,\n",
" 66909,\n",
" 99681,\n",
" 83300,\n",
" 1382,\n",
" 17770,\n",
" 1392,\n",
" 66929,\n",
" 34162,\n",
" 34163,\n",
" 17777,\n",
" 1401,\n",
" 1402,\n",
" 34169,\n",
" 50554,\n",
" 99709,\n",
" 50557,\n",
" 1407,\n",
" 66944,\n",
" 66945,\n",
" 1412,\n",
" 1415,\n",
" 34184,\n",
" 1420,\n",
" 83342,\n",
" 99729,\n",
" 1426,\n",
" 34196,\n",
" 83351,\n",
" 66971,\n",
" 1436,\n",
" 83357,\n",
" 66974,\n",
" 99745,\n",
" 1443,\n",
" 83363,\n",
" 50598,\n",
" 34217,\n",
" 66986,\n",
" 83369,\n",
" 1452,\n",
" 66993,\n",
" 50609,\n",
" 1459,\n",
" 34227,\n",
" 66996,\n",
" 83385,\n",
" 50618,\n",
" 50626,\n",
" 50627,\n",
" 1480,\n",
" 67022,\n",
" 83406,\n",
" 83407,\n",
" 34258,\n",
" 50643,\n",
" 83412,\n",
" 99797,\n",
" 50647,\n",
" 99803,\n",
" 83419,\n",
" 50652,\n",
" 50654,\n",
" 1508,\n",
" 50661,\n",
" 99814,\n",
" 1512,\n",
" 1515,\n",
" 67053,\n",
" 99821,\n",
" 17901,\n",
" 99824,\n",
" 99825,\n",
" 83438,\n",
" 50672,\n",
" 83441,\n",
" 50677,\n",
" 17910,\n",
" 83446,\n",
" 83447,\n",
" 83448,\n",
" 17916,\n",
" 17918,\n",
" 34303,\n",
" 67071,\n",
" 34307,\n",
" 83460,\n",
" 67077,\n",
" 67079,\n",
" 99854,\n",
" 83473,\n",
" 67092,\n",
" 67094,\n",
" 99862,\n",
" 1565,\n",
" 34333,\n",
" 1567,\n",
" 50717,\n",
" 34338,\n",
" 1579,\n",
" 83502,\n",
" 99888,\n",
" 1585,\n",
" 83504,\n",
" 67127,\n",
" 99897,\n",
" 50753,\n",
" 83521,\n",
" 1603,\n",
" 1604,\n",
" 17991,\n",
" 17992,\n",
" 50760,\n",
" 83527,\n",
" 67147,\n",
" 17996,\n",
" 67152,\n",
" 50768,\n",
" 99922,\n",
" 34388,\n",
" 67156,\n",
" 99924,\n",
" 1623,\n",
" 34395,\n",
" 99931,\n",
" 18016,\n",
" 1635,\n",
" 34408,\n",
" 99946,\n",
" 67180,\n",
" 99949,\n",
" 1647,\n",
" 99954,\n",
" 34419,\n",
" 83571,\n",
" 99957,\n",
" 83573,\n",
" 18043,\n",
" 50811,\n",
" 67197,\n",
" 34434,\n",
" 99970,\n",
" 18053,\n",
" 83590,\n",
" 34440,\n",
" 1673,\n",
" 50826,\n",
" 1675,\n",
" 1676,\n",
" 34443,\n",
" 67212,\n",
" 67216,\n",
" 18065,\n",
" 18066,\n",
" 18068,\n",
" 18069,\n",
" 50838,\n",
" 34455,\n",
" 50839,\n",
" 34457,\n",
" 50841,\n",
" 1691,\n",
" 1692,\n",
" 83615,\n",
" 1697,\n",
" 50855,\n",
" 34473,\n",
" 67241,\n",
" 50861,\n",
" 34480,\n",
" 83633,\n",
" 67250,\n",
" 18104,\n",
" 18108,\n",
" 83645,\n",
" 18112,\n",
" 18116,\n",
" 67269,\n",
" 83656,\n",
" 1743,\n",
" 83667,\n",
" 67284,\n",
" 50904,\n",
" 83674,\n",
" 50910,\n",
" 34528,\n",
" 18146,\n",
" 67299,\n",
" 50914,\n",
" 50917,\n",
" 1767,\n",
" 83688,\n",
" 50923,\n",
" 1775,\n",
" 18160,\n",
" 18168,\n",
" 34553,\n",
" 67323,\n",
" 18171,\n",
" 50939,\n",
" 50944,\n",
" 1798,\n",
" 83718,\n",
" 34572,\n",
" 1805,\n",
" 83724,\n",
" 34576,\n",
" 67344,\n",
" 83730,\n",
" 67350,\n",
" 1820,\n",
" 1821,\n",
" 34590,\n",
" 50972,\n",
" 1825,\n",
" 50986,\n",
" 50988,\n",
" 1837,\n",
" 34606,\n",
" 1839,\n",
" 67375,\n",
" 50990,\n",
" 50993,\n",
" 67379,\n",
" 83767,\n",
" 51000,\n",
" 1852,\n",
" 34620,\n",
" 83774,\n",
" 1861,\n",
" 18245,\n",
" 83784,\n",
" 1866,\n",
" 67402,\n",
" 1874,\n",
" 34643,\n",
" 83795,\n",
" 83799,\n",
" 67416,\n",
" 18270,\n",
" 51039,\n",
" 83807,\n",
" 83810,\n",
" 51043,\n",
" 51046,\n",
" 67431,\n",
" 51047,\n",
" 1904,\n",
" 67441,\n",
" 18289,\n",
" 34678,\n",
" 34687,\n",
" 1920,\n",
" 34689,\n",
" 51071,\n",
" 51074,\n",
" 83846,\n",
" 1929,\n",
" 18314,\n",
" 51084,\n",
" 34701,\n",
" 18319,\n",
" 83857,\n",
" 83858,\n",
" 34708,\n",
" 34710,\n",
" 18326,\n",
" 51095,\n",
" 51098,\n",
" 34718,\n",
" 67492,\n",
" 67493,\n",
" 51108,\n",
" 18348,\n",
" 83887,\n",
" 1970,\n",
" 51122,\n",
" 83895,\n",
" 34744,\n",
" 51128,\n",
" 83898,\n",
" 83902,\n",
" 34753,\n",
" 83906,\n",
" 51140,\n",
" 18373,\n",
" 67531,\n",
" 51151,\n",
" 18384,\n",
" 2004,\n",
" 51159,\n",
" 2008,\n",
" 2009,\n",
" 83932,\n",
" 18397,\n",
" 18401,\n",
" 83940,\n",
" 2023,\n",
" 67560,\n",
" ...}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_retrieve_result('什么时候发货') # 通过倒排表返回文档IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment