Commit ec8bac25 by 20210516036

Upload New File

parent e6933c37
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 智能问答系统(主文件)\n",
"\n",
"在这里我们来搭建一个轻量级智能问答系统,所需要的模块,包括:\n",
"- 文本预处理:这部分已经帮大家写好,只需要看看代码就可以了。\n",
"- 搭建意图识别分类器:这部分也给大家写好了,使用fastText来做的意图识别器\n",
"- 倒排表:这部分大家需要自己去创建,同时也需要考虑相似的单词(课程视频中讲过)\n",
"- 排序:基于倒排表返回的结果,我们再根据余弦相似度来计算query跟候选问题之间的相似度,最后返回相似度最高的问题的答案。这里,我们将使用BERT来表示句子的向量。 "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## %env KMP_DUPLICATE_LIB_OK=TRUE "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import pickle\n",
"import emoji\n",
"import re\n",
"import jieba\n",
"import torch\n",
"import fasttext\n",
"from sys import platform\n",
"from torch.utils.data import DataLoader\n",
"from transformers import BertTokenizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"# from bert_code.model import BertModelTest\n",
"# from bert_code.utils import test\n",
"# from bert_code.data import DataPrecessForSentence"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 读取已经处理好的数据: 导入在preprocessor.ipynb中生成的data/question_answer_pares.pkl文件,并将其保存在变量QApares中\n",
"with open('data/question_answer_pares.pkl','rb') as f:\n",
" QApares = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 导入在Retrieve.ipynb中生成的data/retrieve/invertedList.pkl倒排表文件,并将其保存在变量invertedList中\n",
"with open('data/retrieve/invertedList.pkl','rb') as f:\n",
" invertedList = pickle.load(f)"
]
},
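{
"cell_type": "markdown",
"metadata": {},
"source": [
"The inverted index above is loaded from a pickle produced in Retrieve.ipynb. For reference, the next cell is a minimal sketch (not the original implementation) of how such a word-to-question-index table might be built from `QApares.question_after_preprocessing`; the real file may additionally merge the postings of similar words, which is not shown here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (for reference only) of building an inverted index: word -> set of question indices\n",
"from collections import defaultdict\n",
"\n",
"def build_inverted_index(questions):\n",
"    # questions: an iterable of tokenized questions (lists of words), e.g. QApares.question_after_preprocessing\n",
"    inverted = defaultdict(set)\n",
"    for idx, words in enumerate(questions):\n",
"        for word in words:\n",
"            inverted[word].add(idx)\n",
"    return dict(inverted)\n",
"\n",
"# Example usage (assumes QApares.question_after_preprocessing holds token lists):\n",
"# invertedList_sketch = build_inverted_index(QApares.question_after_preprocessing)"
]
},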
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# 这一格的内容是从preprocessor.ipynb中粘贴而来,包含了数据预处理的几个关键函数,这部分用来处理input query string\n",
"import emoji\n",
"import re\n",
"import jieba\n",
"def clean(content):\n",
" content = emoji.demojize(content)\n",
" content = re.sub('<.*>','',content)\n",
" return content\n",
"def question_cut(content):\n",
" return list(jieba.cut(content))\n",
"def strip(wordList):\n",
" return [word.strip() for word in wordList if word.strip()!='']\n",
"with open(\"data/stopWord.json\",\"r\") as f:\n",
" stopWords = f.read().split(\"\\n\")\n",
"def rm_stop_word(wordList):\n",
" return [word for word in wordList if word not in stopWords]\n",
"\n",
"def text_processing(sentence):\n",
" sentence = clean(sentence)\n",
" sentence = question_cut(sentence)\n",
" sentence = strip(sentence)\n",
" sentence = rm_stop_word(sentence)\n",
" return sentence\n",
"\n",
"# 这一格是从Retrieve中粘贴而来,用于生成与输入数据较相近的一些候选问题的index\n",
"def get_retrieve_result(sentence):\n",
" \"\"\"\n",
" 基于输入句子,并利用倒排表返回candidate sentence ids\n",
" \"\"\"\n",
" sentence = text_processing(sentence)\n",
" candidate = set()\n",
" for word in sentence:\n",
" if word in invertedList:\n",
" candidate = candidate | invertedList[word]\n",
" return candidate"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# 加载训练好的fasttext模型用于意图识别\n",
"intention = fasttext.load_model('model/fasttext.ftz')\n",
"\n",
"def get_intention_result(sentence):\n",
" '''\n",
" 输入句子,返回意图识别结果\n",
" 入参:\n",
" sentence:输入的句子\n",
" 出参:\n",
" fasttext_label:fasttext模型的输出,共有两种结果:__label__0和__label__1。__label__0表示闲聊型,__label__1表示任务型\n",
" '''\n",
" sentence = text_processing(sentence)\n",
" sentence = ' '.join(sentence)\n",
" fasttext_label = intention.predict(sentence)[0][0]\n",
" return fasttext_label"
]
},
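{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above only loads an already-trained fastText intent model. For reference, the next cell is a hypothetical sketch of how such a model could be trained and compressed; the training-file path and the hyperparameters are assumptions, not the settings actually used to produce model/fasttext.ftz."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of training the intent classifier (not the actual training script).\n",
"# Each line of the assumed training file looks like: __label__0 word1 word2 ...\n",
"# where __label__0 = chit-chat and __label__1 = task-oriented, tokens separated by spaces.\n",
"import os\n",
"import fasttext\n",
"\n",
"TRAIN_FILE = 'data/intention_train.txt'  # assumed path, for illustration only\n",
"\n",
"if os.path.exists(TRAIN_FILE):\n",
"    clf = fasttext.train_supervised(input=TRAIN_FILE, lr=0.5, epoch=25, wordNgrams=2)\n",
"    clf.quantize(input=TRAIN_FILE, retrain=True)  # compress the model (.ftz is a quantized model)\n",
"    clf.save_model('model/fasttext_sketch.ftz')   # saved under a new name to avoid overwriting the provided model"
]
},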
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/chi/anaconda3/envs/greedyaiqa/lib/python3.6/site-packages/gluonnlp/model/bert.py:693: UserWarning: wiki_cn/wiki_multilingual will be deprecated. Please use wiki_cn_cased/wiki_multilingual_uncased instead.\n",
" warnings.warn('wiki_cn/wiki_multilingual will be deprecated.'\n"
]
}
],
"source": [
"#导入与bert embedding相关的包,关于mxnet包下载的注意事项参考实验手册\n",
"from bert_embedding import BertEmbedding\n",
"import mxnet\n",
"\n",
"ctx = mxnet.gpu()\n",
"embedding = BertEmbedding(ctx=ctx,dataset_name='wiki_cn')\n",
"# a = embedding(\"我今天要吃饭\")\n",
"def get_best_answer(sentence, candidate):\n",
" \"\"\"\n",
" sentence: 用户输入query, 已经处理好的\n",
" candidate: 通过倒排表返回的候选问题的下标列表\n",
" \n",
" 返回:最佳回复,string形式\n",
" \"\"\"\n",
" \n",
" ## TODO: 你需要完成这部分\n",
" ## 计算query跟每一个候选问题之间的余弦相似度,最后排序\n",
" ## 每个query跟候选问题首先要转化成向量形式,这里使用BERT embedding (可参考第一次项目作业)。 如果你想尝试,其他的embedding方法,也可以自行尝试如tf-idf\n",
" sentence = text_processing(sentence)\n",
" sentence = ' '.join(sentence)\n",
" candi_score = {}\n",
" candi_sents = []\n",
" candi_ans = []\n",
" for candi in candidate:\n",
" candi_sent = \" \".join(QApares.question_after_preprocessing[candi])\n",
" candi_sents.append(candi_sent)\n",
" candi_an = \" \".join(QApares.answer[candi])\n",
" candi_ans.append(candi_an)\n",
" candi_sents.append(sentence)\n",
" res_bert_vec = np.empty([len(candi_sents),768], dtype = np.float64) \n",
" candi_embs = embedding(candi_sents)\n",
" for i,res_item in enumerate(candi_embs):\n",
" res_bert_vec[i] = np.mean(res_item[1],axis=0)\n",
" ori_sent_vec = res_bert_vec[-1]\n",
" candi_sent_vec = res_bert_vec[:-1]\n",
" for i in range(candi_sent_vec.shape[0]):\n",
" candi_score[i] = cosine_similarity(ori_sent_vec[np.newaxis,:],candi_sent_vec[i][np.newaxis,:])[0][0]\n",
" candi_score = sorted(candi_score.items(),key=lambda x:x[1],reverse=True)\n",
" answer = candi_ans[candi_score[0][0]]\n",
" print(answer)\n",
" return answer\n",
" \n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def QA(sentence):\n",
" '''\n",
" 实现一个智能客服系统,输入一个句子sentence,返回一个回答\n",
" '''\n",
" # 若意图识别结果为闲聊型,则默认返回'闲聊机器人'\n",
" if get_intention_result(sentence)=='__label__0':\n",
" return '闲聊机器人'\n",
" # 根据倒排表进行检索获得候选问题集\n",
" candidate = get_retrieve_result(sentence)\n",
" # 若候选问题集大小为0,默认返回'我不明白你在说什么'\n",
" if len(candidate)==0:\n",
" return '我不明白你在说什么'\n",
" \n",
" return get_best_answer(sentence, candidate)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"这 边 您 看 不 影 响 您 使 用 的 话 给 您 返 两 元 红 包 可 以 吗\n"
]
},
{
"data": {
"text/plain": [
"'这 边 您 看 不 影 响 您 使 用 的 话 给 您 返 两 元 红 包 可 以 吗'"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('处理')"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'闲聊机器人'"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('怎么退款')"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'闲聊机器人'"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('这个商品有优惠券吗')"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'闲聊机器人'"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('一二三四五六七')"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"是 的 亲 W X 客 服\n"
]
},
{
"data": {
"text/plain": [
"'是 的 亲 W X 客 服'"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('我要找客服')"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"亲 亲 明 天 关 注 下 这 个 物 流 哈\n"
]
},
{
"data": {
"text/plain": [
"'亲 亲 明 天 关 注 下 这 个 物 流 哈'"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试\n",
"QA('搞 这么久 浙江 过来 一两天 时间')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:greedyaiqa] *",
"language": "python",
"name": "conda-env-greedyaiqa-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}