From 1e8350963dc4dd790056f5053778215b8f4e20fe Mon Sep 17 00:00:00 2001 From: zeyu <zeyu@unifisoftware.com> Date: Mon, 20 Apr 2020 20:09:42 -0700 Subject: [PATCH] initial commit --- .DS_Store | Bin 0 -> 6148 bytes .ipynb_checkpoints/Untitled-checkpoint.ipynb | 391 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Readme.txt | 3 +++ __MACOSX/._project2_greedy_upload | Bin 0 -> 220 bytes __MACOSX/project2_greedy_upload/._.DS_Store | Bin 0 -> 120 bytes __MACOSX/project2_greedy_upload/._Readme.txt | Bin 0 -> 484 bytes __MACOSX/project2_greedy_upload/._data_train.xlsx | Bin 0 -> 334 bytes __MACOSX/project2_greedy_upload/._project2_main.py | Bin 0 -> 176 bytes __MACOSX/project2_greedy_upload/._票务对话机器人项目说明.pdf | Bin 0 -> 343 bytes data_train.xlsx | Bin 0 -> 17277 bytes project2_main.py | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stopword.txt | 13 +++++++++++++ 12 files changed, 679 insertions(+) create mode 100644 .DS_Store create mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 Readme.txt create mode 100755 __MACOSX/._project2_greedy_upload create mode 100644 __MACOSX/project2_greedy_upload/._.DS_Store create mode 100644 __MACOSX/project2_greedy_upload/._Readme.txt create mode 100644 __MACOSX/project2_greedy_upload/._data_train.xlsx create mode 100644 __MACOSX/project2_greedy_upload/._project2_main.py create mode 100644 __MACOSX/project2_greedy_upload/._票务对话机器人项目说明.pdf create 
mode 100644 data_train.xlsx create mode 100644 project2_main.py create mode 100644 stopword.txt diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..0003bba Binary files /dev/null and b/.DS_Store differ diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..a749903 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", + "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", + "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", + "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", + 
"/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", + "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import fool\n", + "import time\n", + "import re\n", + "import random\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "stopwords = {}\n", + "with open(r'stopword.txt', 'r', encoding='utf-8') as fr:\n", + " for word in fr:\n", + " stopwords[word.strip()] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class CLF_MODEL:\n", + " # 类目标:该类将所有模型训练、预测、数据预处理、意图识别的函数包括其中\n", + "\n", + " # 初始化模块\n", + " def __init__(self):\n", + " self.model = \"\" # 成员变量,用于存储模型\n", + " self.vectorizer = \"\" # 成员变量,用于存储tfidf统计值\n", + "\n", + " # 训练模块\n", + " def train(self):\n", + " # 函数目标:读取训练数据,训练意图分类模型,并将训练好的分类模型赋值给成员变量self.model\n", + " # input:无\n", + " # output:无\n", + "\n", + " # 从excel文件读取训练样本\n", + " d_train = pd.read_excel(\"data_train.xlsx\")\n", + " # 对训练数据进行预处理\n", + " d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)\n", + " print(\"训练样本 = %d\" % len(d_train))\n", + "\n", + " \"\"\"\n", + " TODO:利用sklearn中的函数进行训练,将句子转化为特征features\n", + " \"\"\"\n", + " self.vectorizer 
= TfidfVectorizer()\n", + " X = self.vectorizer.fit_transform(d_train.sentence_train)\n", + " self.model = LogisticRegression(penalty='l1', solver='liblinear')\n", + "\n", + " self.model.fit(X, d_train.label)\n", + "\n", + " # 预测模块(使用模型预测)\n", + " def predict_model(self, sentence):\n", + " # 函数目标:使用意图分类模型预测意图\n", + " # input:sentence(用户输入)\n", + " # output:clf_result(意图类别),score(意图分数)\n", + "\n", + " # --------------\n", + " # 对样本中没有的特殊情况做特别判断\n", + " if sentence in [\"好的\", \"需要\", \"是的\", \"要的\", \"好\", \"要\", \"是\"]:\n", + " return 1, 0.8\n", + " # --------------\n", + "\n", + " \"\"\"\n", + " TODO:利用已训练好的意图分类模型进行意图识别\n", + " \"\"\"\n", + " sentence = self.fun_clean(sentence)\n", + " x = self.vectorizer.transform([sentence])\n", + " clf_result = self.model.predict(x)[0]\n", + " score = self.model.predict_proba(x)[0][clf_result]\n", + "\n", + "\n", + " return clf_result, score\n", + "\n", + " # 预测模块(使用规则)\n", + " def predict_rule(self, sentence):\n", + " # 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合\"模型\"及\"规则\"的预测方式\n", + " # input:sentence(用户输入)\n", + " # output:clf_result(意图类别),score(意图分数)\n", + "\n", + " sentence = sentence.replace(' ', '')\n", + " if re.findall(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):\n", + " return 2, 0.8\n", + " elif re.findall(r'订|定|预定|买|购', sentence) or sentence in [\"好的\",\"需要\",\"是的\",\"要的\",\"好\",\"要\",\"是\"]:\n", + " return 1, 0.8\n", + " else:\n", + " return 0, 0.8\n", + "\n", + " # 预处理函数\n", + " def fun_clean(self, sentence):\n", + " # 函数目标:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等\n", + " # input:sentence(用户输入语句)\n", + " # output:sentence(预处理结果)\n", + "\n", + " \"\"\"\n", + " TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等\n", + " \"\"\"\n", + " words, ners = fool.analysis(sentence)\n", + "\n", + " res = []\n", + " ner_idx = 0\n", + " cur_idx = 0\n", + " for word in words[0]:\n", + " if ner_idx < len(ners[0]) and cur_idx == ners[0][ner_idx][0]:\n", + " if ners[0][ner_idx][0] == 'time':\n", + " res.append('DATE')\n", + " elif ners[0][ner_idx][0] == 
'location':\n", + " res.append('CITY')\n", + " elif word[0] not in stopwords:\n", + " res.append(word[0])\n", + " ner_idx += 1\n", + " elif word[0] not in stopwords:\n", + " res.append(word[0])\n", + " cur_idx += len(word)\n", + " \n", + " return ' '.join(res)\n", + "\n", + " # 分类主函数\n", + " def fun_clf(self, sentence):\n", + " # 函数目标:意图识别主函数\n", + " # input:sentence( 用户输入语句)\n", + " # output:clf_result(意图类别),score(意图分数)\n", + "\n", + " # 对用户输入进行预处理\n", + " sentence = self.fun_clean(sentence)\n", + " # 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)\n", + " clf_result, score = self.predict_model(sentence) # 使用训练的模型进行意图预测\n", + " # clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)\n", + " return clf_result, score" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sentence = '帮 我 查 一下 DATE 去 CITY 的 票'" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "训练样本 = 99\n" + ] + } + ], + "source": [ + "clf_obj = CLF_MODEL()\n", + "# 完成意图识别模型的训练\n", + "clf_obj.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "clf_result, score = clf_obj.predict_model(sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def fun_replace_num(sentence):\n", + " # 函数目标:替换时间中的数字(目的是便于实体识别包fool对实体的识别)\n", + " # input:sentence\n", + " # output:sentence\n", + "\n", + " # 定义要替换的数字\n", + " time_num = {\"一\":\"1\",\"二\":\"2\",\"三\":\"3\",\"四\":\"4\",\"五\":\"5\",\"六\":\"6\",\"七\":\"7\",\"八\":\"8\",\"九\":\"9\",\"十\":\"10\",\"十一\":\"11\",\"十二\":\"12\"}\n", + " for k, v in time_num.items():\n", + " sentence = sentence.replace(k, v)\n", + " return sentence" + ] + }, + { + "cell_type": "code", + 
"execution_count": 126, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def slot_fill(sentence, key=None):\n", + " # 函数目标:填槽函数(该函数从sentence中寻找需要的内容,完成填槽工作)\n", + " # input:sentence(用户输入), key(指定槽位,只对该句话提取指定槽位的信息)\n", + " # output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)\n", + "\n", + " slot = {}\n", + " # 进行实体识别\n", + " slot_lst = {\"time\":\"time\", \"date\":\"time\", \"from_city\":'location', \"to_city\":'location'}\n", + " words, ners = fool.analysis(sentence)\n", + " \"\"\"\n", + " TODO:从sentence中寻找需要的内容,完成填槽工作\n", + " \"\"\"\n", + " if not key:\n", + " keys = [\"time\", \"date\", \"from_city\", \"to_city\"]\n", + " else:\n", + " keys = [key]\n", + " for key in keys:\n", + " print(key)\n", + " if ners != [[]]:\n", + " for i in range(len(ners[0])):\n", + " if ners[0][i][2] == slot_lst[key]:\n", + " if key == \"time\" and not isTimeFormat(ners[0][i][3]):\n", + " continue\n", + " slot[key] = ners[0][i][3]\n", + " del ners[0][i]\n", + " break\n", + " return slot\n", + "\n", + "\n", + "def fun_wait(clf_obj):\n", + " # 函数目标:等待,获取用户输入问句\n", + " # input:CLF_MODEL类实例化对象\n", + " # output:clf_result(用户输入意图类别), score(意图识别分数), sentence(用户输入)\n", + "\n", + " # 等待用户输入\n", + " print(\"\\n\\n\\n\")\n", + " print(\"-------------------------------------------------------------\")\n", + " print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n", + " print(\"Starting ...\")\n", + " sentence = input(\"客服:请问需要什么服务?(时间请用12小时制表示)\\n\")\n", + " # 对用户输入进行意图识别\n", + " clf_result, score = clf_obj.fun_clf(sentence)\n", + " return clf_result, score, sentence\n", + "\n", + "\n", + "def fun_search(clf_result, sentence):\n", + " # 函数目标:为用户查询余票\n", + " # input:clf_result(意图分类结果), sentence(用户输入问句)\n", + " # output:是否有票\n", + "\n", + " # 定义槽存储空间\n", + " name = {\"time\":\"出发时间\", \"date\":\"出发日期\", \"from_city\":\"出发城市\", \"to_city\":\"到达城市\"}\n", + " slot = {\"time\":\"\", \"date\":\"\", \"from_city\":\"\", \"to_city\":\"\"}\n", + " # 使用用户第一句话进行填槽\n", + 
" sentence = fun_replace_num(sentence)\n", + " slot_init = slot_fill(sentence)\n", + " for key in slot_init.keys():\n", + " slot[key] = slot_init[key]\n", + " # 对未填充对槽位,向用户提问,进行针对性填槽\n", + " while \"\" in slot.values():\n", + " for key in slot.keys():\n", + " if slot[key]==\"\":\n", + " sentence = input(\"客服:请问%s是?\\n\"%(name[key]))\n", + " sentence = fun_replace_num(sentence)\n", + " slot_cur = slot_fill(sentence, key)\n", + " for key in slot_cur.keys():\n", + " if slot[key]==\"\":\n", + " slot[key] = slot_cur[key]\n", + "\n", + " # 查询是否有票,并答复用户(本次查询是否有票使用随机数完成,实际情况可查询数据库返回)\n", + " if random.random()>0.5:\n", + " print(\"客服:%s%s从%s到%s的票充足\"%(slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n", + " # 返回1表示有票\n", + " return 1\n", + " else:\n", + " print(\"客服:%s%s从%s到%s无票\" % (slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n", + " print(\"End !!!\")\n", + " print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n", + " print(\"-------------------------------------------------------------\")\n", + " # 返回0表示无票\n", + " return 0\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<_sre.SRE_Match object; span=(0, 3), match='4pm'>\n" + ] + } + ], + "source": [ + "time='4pm' \n", + "print(re.match('^\\d{1,2}([:.]?\\d{1,2})?([ ]?[a|p]m)?$',time))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[('4', 'm'), ('pm', 'nx')]]\n", + "[[]]\n" + ] + } + ], + "source": [ + "s = '4pm'\n", + "words, ners = fool.analysis(s)\n", + "print(words)\n", + "print(ners)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + 
"name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Readme.txt b/Readme.txt new file mode 100644 index 0000000..49258d5 --- /dev/null +++ b/Readme.txt @@ -0,0 +1,3 @@ +project2_main.py python执行脚本(所有的代码位置) +data_train.xlsx 意图分类训练数据 +stopword.txt 停用词 diff --git a/__MACOSX/._project2_greedy_upload b/__MACOSX/._project2_greedy_upload new file mode 100755 index 0000000..b5ea4b8 Binary files /dev/null and b/__MACOSX/._project2_greedy_upload differ diff --git a/__MACOSX/project2_greedy_upload/._.DS_Store b/__MACOSX/project2_greedy_upload/._.DS_Store new file mode 100644 index 0000000..a5b28df Binary files /dev/null and b/__MACOSX/project2_greedy_upload/._.DS_Store differ diff --git a/__MACOSX/project2_greedy_upload/._Readme.txt b/__MACOSX/project2_greedy_upload/._Readme.txt new file mode 100644 index 0000000..bdaae4b Binary files /dev/null and b/__MACOSX/project2_greedy_upload/._Readme.txt differ diff --git a/__MACOSX/project2_greedy_upload/._data_train.xlsx b/__MACOSX/project2_greedy_upload/._data_train.xlsx new file mode 100644 index 0000000..395d34b Binary files /dev/null and b/__MACOSX/project2_greedy_upload/._data_train.xlsx differ diff --git a/__MACOSX/project2_greedy_upload/._project2_main.py b/__MACOSX/project2_greedy_upload/._project2_main.py new file mode 100644 index 0000000..ce1c222 Binary files /dev/null and b/__MACOSX/project2_greedy_upload/._project2_main.py differ diff --git "a/__MACOSX/project2_greedy_upload/._\347\245\250\345\212\241\345\257\271\350\257\235\346\234\272\345\231\250\344\272\272\351\241\271\347\233\256\350\257\264\346\230\216.pdf" 
# coding=utf-8
"""Ticket-booking dialogue bot: intent classification plus slot filling.

Run as a script, this module trains the intent classifier from
``data_train.xlsx`` and then serves an endless console dialogue loop
(wait for a request -> collect trip details -> optionally book).
"""

import random
import re

import fool
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# -----------------------------------------------------
# Load the stop-word dictionary (one word per line).  It stays a dict keyed
# by word; this file only ever tests membership in it.
with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
    stopwords = {line.strip(): 0 for line in fr}
# -----------------------------------------------------

# Short affirmative replies that are answered by rule instead of the model.
_AFFIRMATIVE_REPLIES = ("好的", "需要", "是的", "要的", "好", "要", "是")

# Entity type reported by fool -> placeholder token used in cleaned text.
_ENTITY_SYMBOLS = {"time": "DATE", "location": "CITY"}

# Clock times such as "4", "4pm", "4:30", "4.30 pm".
_TIME_RE = re.compile(r'^\d{1,2}([:.]?\d{1,2})?([ ]?[ap]m)?$')

# Chinese numerals handled by fun_replace_num.
_CN_DIGITS = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5,
              "六": 6, "七": 7, "八": 8, "九": 9}
_CN_DIGIT_TABLE = str.maketrans({k: str(v) for k, v in _CN_DIGITS.items()})
# "<digit>?十<digit>?" covers 10..99 (十, 十一, 二十, 二十三, ...).
_CN_TENS_RE = re.compile(r'([一二三四五六七八九]?)十([一二三四五六七八九]?)')


class CLF_MODEL:
    """Bundle training, preprocessing and prediction for intent recognition.

    Intent labels used throughout this file: 0 = query, 1 = book,
    2 = stop the service.
    """

    def __init__(self):
        self.model = None       # fitted LogisticRegression, set by train()
        self.vectorizer = None  # fitted TfidfVectorizer, set by train()

    def train(self):
        """Read ``data_train.xlsx`` and fit the TF-IDF + logistic-regression model.

        The sheet must provide the columns ``sentence_train`` and ``label``.
        The fitted objects are stored on ``self.vectorizer`` / ``self.model``.
        """
        d_train = pd.read_excel("data_train.xlsx")
        # Training sentences go through the same cleaning as user input.
        d_train["sentence_train"] = d_train["sentence_train"].apply(self.fun_clean)
        print("训练样本 = %d" % len(d_train))

        # fun_clean() returns space-separated tokens.  The default token
        # pattern only keeps tokens of two or more characters, which would
        # silently drop one-character Chinese words such as 订 / 买 / 票.
        self.vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        features = self.vectorizer.fit_transform(d_train["sentence_train"])
        self.model = LogisticRegression(penalty='l1', solver='liblinear')
        self.model.fit(features, d_train["label"])

    def predict_model(self, sentence):
        """Predict the intent of an already-cleaned sentence with the model.

        Args:
            sentence: user input after fun_clean().

        Returns:
            ``(clf_result, score)``: the predicted label and its probability.

        Raises:
            RuntimeError: if train() has not been called yet.
        """
        # Replies the training samples do not cover are answered by rule.
        if sentence in _AFFIRMATIVE_REPLIES:
            return 1, 0.8

        if self.model is None or self.vectorizer is None:
            raise RuntimeError("CLF_MODEL.train() must be called before predict_model()")

        x = self.vectorizer.transform([sentence])
        clf_result = self.model.predict(x)[0]
        proba = self.model.predict_proba(x)[0]
        # predict_proba columns follow model.classes_, not the label values,
        # so look the column up instead of indexing by the label itself.
        column = list(self.model.classes_).index(clf_result)
        return clf_result, proba[column]

    def predict_rule(self, sentence):
        """Predict the intent with keyword rules (fallback for the model).

        Args:
            sentence: user input; spaces are ignored.

        Returns:
            ``(clf_result, score)``; the score is always 0.8.
        """
        sentence = sentence.replace(' ', '')
        # Negative phrases are tested first because they contain the
        # booking keywords (e.g. 不订 contains 订).
        if re.search(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):
            return 2, 0.8
        if re.search(r'订|定|预定|买|购', sentence) or sentence in _AFFIRMATIVE_REPLIES:
            return 1, 0.8
        return 0, 0.8

    def fun_clean(self, sentence):
        """Normalise a sentence for classification.

        Time entities become ``DATE``, location entities become ``CITY`` and
        stop words are removed.  Each entity yields exactly one placeholder,
        even when it spans several tokens.

        Args:
            sentence: raw user input.

        Returns:
            The remaining tokens joined by single spaces.
        """
        words, ners = fool.analysis(sentence)
        # NOTE(review): assumes each entity is (start_offset, _, type, text)
        # and that entities are ordered by start offset - confirm against fool.
        entities = ners[0]

        res = []
        ner_idx = 0        # entity currently being matched
        cur_idx = 0        # character offset of the next token
        emitted_for = -1   # entity index that already got its placeholder
        for word in words[0]:
            text = word[0]
            start = cur_idx
            cur_idx += len(text)

            # Move past entities that end before this token, so one
            # misaligned entity cannot block all the following ones.
            while (ner_idx < len(entities)
                   and entities[ner_idx][0] + len(entities[ner_idx][3]) <= start):
                ner_idx += 1

            symbol = None
            if ner_idx < len(entities) and entities[ner_idx][0] < cur_idx:
                # Token overlaps the current entity; other entity types have
                # no placeholder and are treated as ordinary words.
                symbol = _ENTITY_SYMBOLS.get(entities[ner_idx][2])

            if symbol is not None:
                if emitted_for != ner_idx:
                    res.append(symbol)
                    emitted_for = ner_idx
                continue

            if text not in stopwords:
                res.append(text)
        return ' '.join(res)

    def fun_clf(self, sentence):
        """Clean the raw user sentence and classify its intent.

        Args:
            sentence: raw user input.

        Returns:
            ``(clf_result, score)`` from predict_model(); predict_rule() is a
            drop-in alternative with the same return shape.
        """
        sentence = self.fun_clean(sentence)
        return self.predict_model(sentence)


def fun_replace_num(sentence):
    """Replace Chinese numerals with Arabic digits to help entity recognition.

    Numerals built around 十 are converted as whole numbers (十一 -> 11,
    二十三 -> 23); any remaining 一..九 are replaced digit by digit.

    Args:
        sentence: user input.

    Returns:
        The sentence with the numerals replaced.
    """
    def _tens_to_digits(match):
        # A missing leading digit means "one ten" (十 = 10, 十二 = 12).
        tens = _CN_DIGITS.get(match.group(1), 1)
        ones = _CN_DIGITS.get(match.group(2), 0)
        return str(tens * 10 + ones)

    # Compound numbers must be handled before the single digits; replacing
    # 一 first would turn 十一 into "101".
    sentence = _CN_TENS_RE.sub(_tens_to_digits, sentence)
    return sentence.translate(_CN_DIGIT_TABLE)


def slot_fill(sentence, key=None):
    """Extract slot values from a sentence via entity recognition.

    Args:
        sentence: user input.
        key: when given, only this slot is extracted; otherwise all of
            ``time``, ``date``, ``from_city`` and ``to_city`` are tried
            in that order.

    Returns:
        Dict mapping each slot that was found to the entity text.

    Raises:
        KeyError: if ``key`` is not one of the four slot names.
    """
    slot = {}
    # Slot name -> entity type expected for it.
    slot_lst = {"time": "time", "date": "time", "from_city": 'location', "to_city": 'location'}
    _words, ners = fool.analysis(sentence)
    # NOTE(review): assumes entity tuples are (_, _, type, text) - confirm.
    entities = list(ners[0]) if ners else []

    keys = [key] if key else ["time", "date", "from_city", "to_city"]
    for slot_name in keys:
        wanted_type = slot_lst[slot_name]
        for idx, entity in enumerate(entities):
            if entity[2] != wanted_type:
                continue
            # "time" only accepts clock-like text; other time entities are
            # left available for the "date" slot.
            if slot_name == "time" and not _TIME_RE.match(entity[3]):
                continue
            slot[slot_name] = entity[3]
            # An entity may fill one slot only (safe: we stop iterating).
            del entities[idx]
            break
    return slot


def fun_wait(clf_obj):
    """Prompt for a request and classify its intent.

    Args:
        clf_obj: a trained CLF_MODEL instance.

    Returns:
        ``(clf_result, score, sentence)`` where sentence is the raw input.
    """
    print("\n\n\n")
    print("-------------------------------------------------------------")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("Starting ...")
    sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
    clf_result, score = clf_obj.fun_clf(sentence)
    return clf_result, score, sentence


def fun_search(clf_result, sentence):
    """Collect the trip details and report ticket availability.

    Slots are first filled from ``sentence``; the user is then asked for
    every slot that is still empty until all four are filled.

    Args:
        clf_result: intent label of the request (not used here).
        sentence: the user's initial request.

    Returns:
        1 if tickets are available, 0 otherwise.  Availability is decided
        by a random draw in place of a real inventory lookup.
    """
    name = {"time": "出发时间", "date": "出发日期", "from_city": "出发城市", "to_city": "到达城市"}
    slot = {"time": "", "date": "", "from_city": "", "to_city": ""}

    # Fill whatever the first sentence already provides.
    slot.update(slot_fill(fun_replace_num(sentence)))

    # Ask specifically for every slot that is still empty.
    while "" in slot.values():
        for key in slot:
            if slot[key] != "":
                continue
            answer = fun_replace_num(input("客服:请问%s是?\n" % (name[key])))
            for found_key, value in slot_fill(answer, key).items():
                if slot[found_key] == "":
                    slot[found_key] = value

    if random.random() > 0.5:
        print("客服:%s%s从%s到%s的票充足" % (slot["date"], slot["time"], slot["from_city"], slot["to_city"]))
        return 1

    print("客服:%s%s从%s到%s无票" % (slot["date"], slot["time"], slot["from_city"], slot["to_city"]))
    print("End !!!")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("-------------------------------------------------------------")
    return 0


def fun_book():
    """Announce that the booking has been completed."""
    print("客服:已为您完成订票。\n\n\n")
    print("End !!!")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("-------------------------------------------------------------")


def main():
    """Train the classifier, then serve the dialogue loop until interrupted."""
    clf_obj = CLF_MODEL()
    clf_obj.train()
    # An intent is accepted only when its score reaches this threshold.
    threshold = 0.55

    while True:
        clf_result, score, sentence = fun_wait(clf_obj)
        # wait -> wait: low-confidence intent, or the user wants to stop.
        if score < threshold or clf_result == 2:
            continue

        # wait -> search: intent is "query" or "book".
        if fun_search(clf_result, sentence) == 0:
            # search -> wait: no tickets available.
            continue

        # search -> book: tickets available and the user confirms.
        sentence = input("客服:需要为您订票吗?\n")
        clf_result, score = clf_obj.fun_clf(sentence)
        if clf_result == 1:
            fun_book()


if __name__ == "__main__":
    main()