{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" ] } ], "source": [ "import pandas as pd\n", "import fool\n", "import time\n", "import re\n", "import random\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "stopwords = {}\n", "with open(r'stopword.txt', 'r', encoding='utf-8') as fr:\n", " for word in fr:\n", " stopwords[word.strip()] = 0" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "collapsed": true }, "outputs": [], "source": [ "class CLF_MODEL:\n", " # 类目标:该类将所有模型训练、预测、数据预处理、意图识别的函数包括其中\n", "\n", " # 初始化模块\n", " def __init__(self):\n", " self.model = \"\" # 成员变量,用于存储模型\n", " self.vectorizer = \"\" # 成员变量,用于存储tfidf统计值\n", "\n", " # 训练模块\n", " def train(self):\n", " # 函数目标:读取训练数据,训练意图分类模型,并将训练好的分类模型赋值给成员变量self.model\n", " # input:无\n", " # output:无\n", "\n", " # 从excel文件读取训练样本\n", " d_train = pd.read_excel(\"data_train.xlsx\")\n", " # 对训练数据进行预处理\n", " d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)\n", " print(\"训练样本 = %d\" % len(d_train))\n", "\n", " \"\"\"\n", " TODO:利用sklearn中的函数进行训练,将句子转化为特征features\n", " \"\"\"\n", " self.vectorizer = TfidfVectorizer()\n", " X = self.vectorizer.fit_transform(d_train.sentence_train)\n", " self.model = LogisticRegression(penalty='l1', solver='liblinear')\n", "\n", " self.model.fit(X, d_train.label)\n", "\n", " # 预测模块(使用模型预测)\n", " def predict_model(self, sentence):\n", " # 函数目标:使用意图分类模型预测意图\n", " # input:sentence(用户输入)\n", " # output:clf_result(意图类别),score(意图分数)\n", "\n", " # --------------\n", " # 对样本中没有的特殊情况做特别判断\n", " if sentence in [\"好的\", \"需要\", \"是的\", \"要的\", \"好\", \"要\", \"是\"]:\n", " return 1, 0.8\n", " # --------------\n", "\n", " \"\"\"\n", " TODO:利用已训练好的意图分类模型进行意图识别\n", " \"\"\"\n", " sentence = self.fun_clean(sentence)\n", " x = self.vectorizer.transform([sentence])\n", " clf_result = self.model.predict(x)[0]\n", " score = self.model.predict_proba(x)[0][clf_result]\n", "\n", "\n", " return clf_result, score\n", "\n", " # 预测模块(使用规则)\n", " def predict_rule(self, sentence):\n", " # 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合\"模型\"及\"规则\"的预测方式\n", " # input:sentence(用户输入)\n", " # output:clf_result(意图类别),score(意图分数)\n", "\n", " sentence = sentence.replace(' ', '')\n", " if re.findall(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):\n", " return 2, 0.8\n", " elif re.findall(r'订|定|预定|买|购', sentence) or sentence in [\"好的\",\"需要\",\"是的\",\"要的\",\"好\",\"要\",\"是\"]:\n", " return 1, 0.8\n", " else:\n", " return 0, 0.8\n", "\n", " # 预处理函数\n", " def fun_clean(self, sentence):\n", " # 函数目标:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等\n", " # input:sentence(用户输入语句)\n", " # output:sentence(预处理结果)\n", "\n", " \"\"\"\n", " TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等\n", " \"\"\"\n", " words, ners = fool.analysis(sentence)\n", "\n", " res = []\n", " ner_idx = 0\n", " cur_idx = 0\n", " for word in words[0]:\n", " if ner_idx < len(ners[0]) and cur_idx == ners[0][ner_idx][0]:\n", " if ners[0][ner_idx][0] == 'time':\n", " res.append('DATE')\n", " elif ners[0][ner_idx][0] == 'location':\n", " res.append('CITY')\n", " elif word[0] not in stopwords:\n", " res.append(word[0])\n", " ner_idx += 1\n", " elif word[0] not in stopwords:\n", " res.append(word[0])\n", " cur_idx += len(word)\n", " \n", " return ' '.join(res)\n", "\n", " # 分类主函数\n", " def fun_clf(self, sentence):\n", " # 函数目标:意图识别主函数\n", " # input:sentence( 用户输入语句)\n", " # output:clf_result(意图类别),score(意图分数)\n", "\n", " # 对用户输入进行预处理\n", " sentence = self.fun_clean(sentence)\n", " # 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)\n", " clf_result, score = self.predict_model(sentence) # 使用训练的模型进行意图预测\n", " # clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)\n", " return clf_result, score" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sentence = '帮 我 查 一下 DATE 去 CITY 的 票'" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "训练样本 = 99\n" ] } ], "source": [ "clf_obj = CLF_MODEL()\n", "# 完成意图识别模型的训练\n", "clf_obj.train()" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clf_result, score = clf_obj.predict_model(sentence)" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def fun_replace_num(sentence):\n", " # 函数目标:替换时间中的数字(目的是便于实体识别包fool对实体的识别)\n", " # input:sentence\n", " # output:sentence\n", "\n", " # 定义要替换的数字\n", " time_num = {\"一\":\"1\",\"二\":\"2\",\"三\":\"3\",\"四\":\"4\",\"五\":\"5\",\"六\":\"6\",\"七\":\"7\",\"八\":\"8\",\"九\":\"9\",\"十\":\"10\",\"十一\":\"11\",\"十二\":\"12\"}\n", " for k, v in time_num.items():\n", " sentence = sentence.replace(k, v)\n", " return sentence" ] }, { "cell_type": "code", "execution_count": 126, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def slot_fill(sentence, key=None):\n", " # 函数目标:填槽函数(该函数从sentence中寻找需要的内容,完成填槽工作)\n", " # input:sentence(用户输入), key(指定槽位,只对该句话提取指定槽位的信息)\n", " # output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)\n", "\n", " slot = {}\n", " # 进行实体识别\n", " slot_lst = {\"time\":\"time\", \"date\":\"time\", \"from_city\":'location', \"to_city\":'location'}\n", " words, ners = fool.analysis(sentence)\n", " \"\"\"\n", " TODO:从sentence中寻找需要的内容,完成填槽工作\n", " \"\"\"\n", " if not key:\n", " keys = [\"time\", \"date\", \"from_city\", \"to_city\"]\n", " else:\n", " keys = [key]\n", " for key in keys:\n", " print(key)\n", " if ners != [[]]:\n", " for i in range(len(ners[0])):\n", " if ners[0][i][2] == slot_lst[key]:\n", " if key == \"time\" and not isTimeFormat(ners[0][i][3]):\n", " continue\n", " slot[key] = ners[0][i][3]\n", " del ners[0][i]\n", " break\n", " return slot\n", "\n", "\n", "def fun_wait(clf_obj):\n", " # 函数目标:等待,获取用户输入问句\n", " # input:CLF_MODEL类实例化对象\n", " # output:clf_result(用户输入意图类别), score(意图识别分数), sentence(用户输入)\n", "\n", " # 等待用户输入\n", " print(\"\\n\\n\\n\")\n", " print(\"-------------------------------------------------------------\")\n", " print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n", " print(\"Starting ...\")\n", " sentence = input(\"客服:请问需要什么服务?(时间请用12小时制表示)\\n\")\n", " # 对用户输入进行意图识别\n", " clf_result, score = clf_obj.fun_clf(sentence)\n", " return clf_result, score, sentence\n", "\n", "\n", "def fun_search(clf_result, sentence):\n", " # 函数目标:为用户查询余票\n", " # input:clf_result(意图分类结果), sentence(用户输入问句)\n", " # output:是否有票\n", "\n", " # 定义槽存储空间\n", " name = {\"time\":\"出发时间\", \"date\":\"出发日期\", \"from_city\":\"出发城市\", \"to_city\":\"到达城市\"}\n", " slot = {\"time\":\"\", \"date\":\"\", \"from_city\":\"\", \"to_city\":\"\"}\n", " # 使用用户第一句话进行填槽\n", " sentence = fun_replace_num(sentence)\n", " slot_init = slot_fill(sentence)\n", " for key in slot_init.keys():\n", " slot[key] = slot_init[key]\n", " # 对未填充对槽位,向用户提问,进行针对性填槽\n", " while \"\" in slot.values():\n", " for key in slot.keys():\n", " if slot[key]==\"\":\n", " sentence = input(\"客服:请问%s是?\\n\"%(name[key]))\n", " sentence = fun_replace_num(sentence)\n", " slot_cur = slot_fill(sentence, key)\n", " for key in slot_cur.keys():\n", " if slot[key]==\"\":\n", " slot[key] = slot_cur[key]\n", "\n", " # 查询是否有票,并答复用户(本次查询是否有票使用随机数完成,实际情况可查询数据库返回)\n", " if random.random()>0.5:\n", " print(\"客服:%s%s从%s到%s的票充足\"%(slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n", " # 返回1表示有票\n", " return 1\n", " else:\n", " print(\"客服:%s%s从%s到%s无票\" % (slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n", " print(\"End !!!\")\n", " print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n", " print(\"-------------------------------------------------------------\")\n", " # 返回0表示无票\n", " return 0\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 3), match='4pm'>\n" ] } ], "source": [ "time='4pm' \n", "print(re.match('^\\d{1,2}([:.]?\\d{1,2})?([ ]?[a|p]m)?$',time))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[('4', 'm'), ('pm', 'nx')]]\n", "[[]]\n" ] } ], "source": [ "s = '4pm'\n", "words, ners = fool.analysis(s)\n", "print(words)\n", "print(ners)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }