diff --git b/.idea/encodings.xml a/.idea/encodings.xml new file mode 100644 index 0000000..f13fa33 --- /dev/null +++ a/.idea/encodings.xml @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="Encoding" addBOMForNewFiles="with NO BOM" /> +</project> \ No newline at end of file diff --git b/.idea/greedy.iml a/.idea/greedy.iml new file mode 100644 index 0000000..f6a3f5c --- /dev/null +++ a/.idea/greedy.iml @@ -0,0 +1,12 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module type="PYTHON_MODULE" version="4"> + <component name="NewModuleRootManager"> + <content url="file://$MODULE_DIR$" /> + <orderEntry type="jdk" jdkName="Python 3.6 (Workspace)" jdkType="Python SDK" /> + <orderEntry type="sourceFolder" forTests="false" /> + </component> + <component name="TestRunnerService"> + <option name="projectConfiguration" value="pytest" /> + <option name="PROJECT_TEST_RUNNER" value="pytest" /> + </component> +</module> \ No newline at end of file diff --git b/.idea/misc.xml a/.idea/misc.xml new file mode 100644 index 0000000..5417b68 --- /dev/null +++ a/.idea/misc.xml @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" /> +</project> \ No newline at end of file diff --git b/.idea/modules.xml a/.idea/modules.xml new file mode 100644 index 0000000..a2538dd --- /dev/null +++ a/.idea/modules.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectModuleManager"> + <modules> + <module fileurl="file://$PROJECT_DIR$/.idea/project2_greedy.iml" filepath="$PROJECT_DIR$/.idea/project2_greedy.iml" /> + </modules> + </component> +</project> \ No newline at end of file diff --git b/.idea/workspace.xml a/.idea/workspace.xml new file mode 100644 index 0000000..068b5e6 --- /dev/null +++ a/.idea/workspace.xml @@ -0,0 +1,249 @@ +<?xml version="1.0" encoding="UTF-8"?> 
+<project version="4"> + <component name="ChangeListManager"> + <list default="true" id="f0c9c573-b9ae-4bf1-8e72-d6720fc99052" name="Default Changelist" comment="" /> + <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> + <option name="TRACKING_ENABLED" value="true" /> + <option name="SHOW_DIALOG" value="false" /> + <option name="HIGHLIGHT_CONFLICTS" value="true" /> + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> + <option name="LAST_RESOLUTION" value="IGNORE" /> + </component> + <component name="FileEditorManager"> + <leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> + <file leaf-file-name="project2_main.py" pinned="false" current-in-tab="true"> + <entry file="file://$PROJECT_DIR$/project2_main.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="447"> + <caret line="143" column="33" selection-start-line="143" selection-start-column="33" selection-end-line="143" selection-end-column="33" /> + <folding> + <element signature="e#16#35#0" expanded="true" /> + </folding> + </state> + </provider> + </entry> + </file> + <file leaf-file-name="project2_main_original.py" pinned="false" current-in-tab="false"> + <entry file="file://$PROJECT_DIR$/project2_main_original.py"> + <provider selected="true" editor-type-id="text-editor" /> + </entry> + </file> + </leaf> + </component> + <component name="FindInProjectRecents"> + <findStrings> + <find>book</find> + <find>fun_replace_num</find> + <find>replace</find> + <find>clf_model</find> + </findStrings> + <replaceStrings> + <replace>CLF_MODEL</replace> + </replaceStrings> + </component> + <component name="IdeDocumentHistory"> + <option name="CHANGED_PATHS"> + <list> + <option value="$PROJECT_DIR$/project2_20190526.py" /> + <option value="$PROJECT_DIR$/project2_main.py" /> + </list> + </option> + </component> + <component name="ProjectFrameBounds" fullScreen="true"> + <option name="y" value="23" /> + <option name="width" value="1440" /> + <option name="height" value="798" 
/> + </component> + <component name="ProjectView"> + <navigator proportions="" version="1"> + <foldersAlwaysOnTop value="true" /> + </navigator> + <panes> + <pane id="ProjectPane"> + <subPane> + <expand> + <path> + <item name="project2_greedy_upload的副本" type="b2602c69:ProjectViewProjectNode" /> + <item name="project2_greedy_upload的副本" type="462c0819:PsiDirectoryNode" /> + </path> + </expand> + <select /> + </subPane> + </pane> + <pane id="Scope" /> + </panes> + </component> + <component name="PropertiesComponent"> + <property name="last_opened_file_path" value="$PROJECT_DIR$" /> + <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" /> + </component> + <component name="RecentsManager"> + <key name="CopyFile.RECENT_KEYS"> + <recent name="$PROJECT_DIR$" /> + </key> + </component> + <component name="RunDashboard"> + <option name="ruleStates"> + <list> + <RuleState> + <option name="name" value="ConfigurationTypeDashboardGroupingRule" /> + </RuleState> + <RuleState> + <option name="name" value="StatusDashboardGroupingRule" /> + </RuleState> + </list> + </option> + </component> + <component name="RunManager" selected="Python.project2_main"> + <configuration name="project2_20190526" type="PythonConfigurationType" factoryName="Python" temporary="true"> + <module name="project2_greedy" /> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> + <option name="IS_MODULE_SDK" value="true" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_20190526.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <option name="EMULATE_TERMINAL" value="false" /> + 
<option name="MODULE_MODE" value="false" /> + </configuration> + <configuration name="project2_main" type="PythonConfigurationType" factoryName="Python" temporary="true"> + <module name="project2_greedy" /> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> + <option name="IS_MODULE_SDK" value="true" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_main.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <option name="EMULATE_TERMINAL" value="false" /> + <option name="MODULE_MODE" value="false" /> + </configuration> + <configuration name="project2_main_original" type="PythonConfigurationType" factoryName="Python" temporary="true"> + <module name="project2_greedy" /> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> + <option name="IS_MODULE_SDK" value="true" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_main_original.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <option name="EMULATE_TERMINAL" value="false" /> + <option name="MODULE_MODE" value="false" /> + </configuration> + <list> + <item itemvalue="Python.project2_20190526" /> + <item itemvalue="Python.project2_main_original" /> + <item itemvalue="Python.project2_main" /> + </list> + <recent_temporary> + <list> + <item itemvalue="Python.project2_main" /> + <item 
itemvalue="Python.project2_main_original" /> + <item itemvalue="Python.project2_20190526" /> + </list> + </recent_temporary> + </component> + <component name="SvnConfiguration"> + <configuration /> + </component> + <component name="TaskManager"> + <task active="true" id="Default" summary="Default task"> + <changelist id="f0c9c573-b9ae-4bf1-8e72-d6720fc99052" name="Default Changelist" comment="" /> + <created>1558967327224</created> + <option name="number" value="Default" /> + <option name="presentableId" value="Default" /> + <updated>1558967327224</updated> + </task> + <servers /> + </component> + <component name="ToolWindowManager"> + <frame x="0" y="0" width="1440" height="900" extended-state="0" /> + <editor active="true" /> + <layout> + <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.15694444" /> + <window_info anchor="bottom" id="TODO" order="6" /> + <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" /> + <window_info anchor="bottom" id="Run" order="2" /> + <window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" /> + <window_info anchor="bottom" id="Python Console" order="7" /> + <window_info id="Structure" order="1" side_tool="true" weight="0.25" /> + <window_info anchor="bottom" id="Terminal" order="7" /> + <window_info anchor="bottom" id="Debug" order="3" weight="0.3985849" /> + <window_info id="Favorites" order="2" side_tool="true" /> + <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" /> + <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> + <window_info anchor="right" id="Ant Build" order="1" weight="0.25" /> + <window_info anchor="bottom" id="Message" order="0" /> + <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> + <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" /> + <window_info anchor="bottom" id="Find" order="1" /> + </layout> + </component> 
+ <component name="VcsContentAnnotationSettings"> + <option name="myLimit" value="2678400000" /> + </component> + <component name="XDebuggerManager"> + <breakpoint-manager> + <breakpoints> + <line-breakpoint enabled="true" suspend="THREAD" type="python-line"> + <url>file://$PROJECT_DIR$/project2_20190526.py</url> + <line>184</line> + <option name="timeStamp" value="6" /> + </line-breakpoint> + <line-breakpoint enabled="true" suspend="THREAD" type="python-line"> + <url>file://$PROJECT_DIR$/project2_20190526.py</url> + <line>106</line> + <option name="timeStamp" value="13" /> + </line-breakpoint> + </breakpoints> + <default-breakpoints> + <breakpoint type="python-exception"> + <properties notifyOnTerminate="true" exception="BaseException"> + <option name="notifyOnTerminate" value="true" /> + </properties> + </breakpoint> + </default-breakpoints> + </breakpoint-manager> + </component> + <component name="editorHistoryManager"> + <entry file="file://E:/Program Files/Anaconda3/Lib/urllib/parse.py" /> + <entry file="file://E:/Program Files/Anaconda3/Lib/codecs.py" /> + <entry file="file://E:/Program Files/Anaconda3/Lib/site-packages/sklearn/feature_extraction/text.py" /> + <entry file="file://$PROJECT_DIR$/project2_20190526.py" /> + <entry file="file://$PROJECT_DIR$/对话示例.png"> + <provider selected="true" editor-type-id="images" /> + </entry> + <entry file="file://$PROJECT_DIR$/project2_main_original.py"> + <provider selected="true" editor-type-id="text-editor" /> + </entry> + <entry file="file://$PROJECT_DIR$/project2_main.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="447"> + <caret line="143" column="33" selection-start-line="143" selection-start-column="33" selection-end-line="143" selection-end-column="33" /> + <folding> + <element signature="e#16#35#0" expanded="true" /> + </folding> + </state> + </provider> + </entry> + </component> +</project> \ No newline at end of file diff --git b/Readme.txt a/Readme.txt new file 
class CLF_MODEL:
    """Intent classifier for the ticket-booking dialogue.

    Bundles training, model-based prediction, rule-based prediction and
    sentence pre-processing. Intent labels used by the callers in this file:
    0 = query, 1 = book a ticket, 2 = terminate the service.
    """

    # Short affirmative replies that the training samples do not cover; both
    # the model path and the rule path map them to the "book" intent.
    _AFFIRMATIVE_REPLIES = ("好的", "需要", "是的", "要的", "好", "要", "是")
    # Refusal / termination keywords; checked first so that "不订" is not
    # mistaken for "订".
    _STOP_PATTERN = re.compile(r'不需要|不要|停止|终止|退出|不买|不定|不订')
    # Booking keywords.
    _BOOK_PATTERN = re.compile(r'订|定|预定|买|购')

    def __init__(self):
        # Trained classifier; None until train() has completed.
        self.model = None
        # Fitted TF-IDF vectorizer matching self.model; None until train().
        self.vectorizer = None

    def train(self):
        """Read the training spreadsheet and fit the intent classifier.

        Reads "data_train.xlsx" (columns ``sentence_train`` and ``label``),
        cleans every sentence with fun_clean, turns the sentences into TF-IDF
        features and fits a logistic-regression model. On success the fitted
        vectorizer and model are stored on ``self``.
        """
        # Load the training samples from the Excel file.
        d_train = pd.read_excel("data_train.xlsx")
        # Pre-process the training sentences.
        d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)
        print("训练样本 = %d" % len(d_train))

        # Character n-grams are used because the sentences are unsegmented
        # Chinese text; the default word analyzer would treat a whole clause
        # as a single token.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 2))
        features = vectorizer.fit_transform(d_train.sentence_train.astype(str))
        model = LogisticRegression()
        model.fit(features, d_train.label)

        # Publish both objects only after fitting succeeded so the instance
        # never holds a vectorizer without its matching model.
        self.vectorizer = vectorizer
        self.model = model

    def predict_model(self, sentence):
        """Predict the intent of ``sentence`` with the trained model.

        Args:
            sentence: user input (already pre-processed by the caller).

        Returns:
            (clf_result, score): the predicted intent label and its
            probability.

        Raises:
            RuntimeError: if train() has not been called yet.
        """
        # Special-case replies that the training samples do not contain.
        if sentence in self._AFFIRMATIVE_REPLIES:
            return 1, 0.8

        if self.model is None or self.vectorizer is None:
            raise RuntimeError("CLF_MODEL.train() must be called before predict_model()")

        features = self.vectorizer.transform([sentence])
        probabilities = self.model.predict_proba(features)[0]
        best = probabilities.argmax()
        # NOTE(review): assumes the spreadsheet labels are the integers 0/1/2
        # that the callers compare against - confirm against data_train.xlsx.
        clf_result = self.model.classes_[best]
        score = probabilities[best]
        return clf_result, score

    def predict_rule(self, sentence):
        """Predict the intent of ``sentence`` with keyword rules.

        Fallback for when the model is unavailable; can also be combined with
        the model-based prediction.

        Args:
            sentence: user input.

        Returns:
            (clf_result, score): 2 for refusal/termination keywords, 1 for
            booking keywords or a short affirmative reply, 0 otherwise; the
            score is always 0.8.
        """
        sentence = sentence.replace(' ', '')
        if self._STOP_PATTERN.search(sentence):
            return 2, 0.8
        if self._BOOK_PATTERN.search(sentence) or sentence in self._AFFIRMATIVE_REPLIES:
            return 1, 0.8
        return 0, 0.8

    def fun_clean(self, sentence):
        """Pre-process a sentence before classification.

        Args:
            sentence: user input sentence.

        Returns:
            The pre-processed sentence. Currently the input is returned
            unchanged.
        """
        # TODO: replace entities with unified symbols (helps classification
        # accuracy), remove stop words, etc.
        return sentence

    def fun_clf(self, sentence):
        """Main intent-recognition entry point.

        Args:
            sentence: user input sentence.

        Returns:
            (clf_result, score): intent label (0 = query, 1 = book,
            2 = terminate) and its score.
        """
        # Pre-process the user input.
        sentence = self.fun_clean(sentence)
        # Model-based prediction; predict_rule(sentence) is the rule-based
        # alternative with the same return shape.
        clf_result, score = self.predict_model(sentence)
        return clf_result, score


# Chinese digit characters and their values ("十" is handled separately).
_CN_DIGITS = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
# A maximal run of Chinese numeral characters.
_CN_NUMERAL_RUN = re.compile("[一二三四五六七八九十]+")
# A well-formed numeral from 10 to 99: optional tens digit, "十", optional ones digit.
_CN_TENS_FORM = re.compile("([一二三四五六七八九]?)十([一二三四五六七八九]?)")


def _cn_numeral_to_digits(match):
    """Convert one run of Chinese numeral characters to Arabic digits."""
    token = match.group(0)
    tens_form = _CN_TENS_FORM.fullmatch(token)
    if tens_form:
        tens, ones = tens_form.groups()
        # A missing tens digit means 1 ("十一" = 11); a missing ones digit means 0.
        return str(_CN_DIGITS.get(tens, 1) * 10 + _CN_DIGITS.get(ones, 0))
    # Anything else (single digits, digit sequences) is mapped per character.
    return "".join("10" if ch == "十" else str(_CN_DIGITS[ch]) for ch in token)


def fun_replace_num(sentence):
    """Replace Chinese numerals in ``sentence`` with Arabic digits.

    Makes times and dates easier for the entity recogniser to detect. Whole
    numeral runs are converted at once, so "十一" becomes "11" and "二十"
    becomes "20"; replacing character by character would yield "101" and
    "210".

    Args:
        sentence: user input.

    Returns:
        The sentence with numerals converted.
    """
    return _CN_NUMERAL_RUN.sub(_cn_numeral_to_digits, sentence)


def slot_fill(sentence, key=None):
    """Extract slot values from ``sentence``.

    Args:
        sentence: user input.
        key: optional slot name; when given, only that slot is of interest.

    Returns:
        dict mapping slot name to extracted value. Extraction is not
        implemented yet, so the dict is currently always empty.
    """
    slot = {}
    # Run entity recognition.
    words, ners = fool.analysis(sentence)

    # TODO: find the required content in `ners` / `sentence` and fill `slot`.
    # NOTE(review): until this is implemented fun_search keeps re-asking for
    # every slot.

    return slot


def fun_wait(clf_obj):
    """Wait for the user's request and classify its intent.

    Args:
        clf_obj: a CLF_MODEL instance.

    Returns:
        (clf_result, score, sentence): intent label, intent score and the raw
        user input.
    """
    # Prompt the user.
    print("\n\n\n")
    print("-------------------------------------------------------------")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("Starting ...")
    sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
    # Classify the intent of the input.
    clf_result, score = clf_obj.fun_clf(sentence)
    return clf_result, score, sentence


def _merge_slots(slot, found):
    """Copy non-empty values from ``found`` into still-empty known slots."""
    for slot_name, value in found.items():
        # Unknown slot names and empty values are ignored; filled slots are kept.
        if slot_name in slot and slot[slot_name] == "" and value:
            slot[slot_name] = value


def fun_search(clf_result, sentence):
    """Collect the trip details and check ticket availability.

    Args:
        clf_result: intent classification result (not used by the search).
        sentence: the user's first utterance.

    Returns:
        1 if tickets are available, 0 otherwise.
    """
    # Slot storage and the display name used when asking for each slot.
    name = {"time":"出发时间", "date":"出发日期", "from_city":"出发城市", "to_city":"到达城市"}
    slot = {"time":"", "date":"", "from_city":"", "to_city":""}
    # Fill as many slots as possible from the first utterance.
    sentence = fun_replace_num(sentence)
    _merge_slots(slot, slot_fill(sentence))
    # Ask the user about every slot that is still empty.
    while "" in slot.values():
        for key in slot:
            if slot[key] == "":
                answer = input("客服:请问%s是?\n"%(name[key]))
                answer = fun_replace_num(answer)
                _merge_slots(slot, slot_fill(answer, key))

    # Availability is simulated with a random number; a real system would
    # query a database here.
    if random.random()>0.5:
        print("客服:%s%s从%s到%s的票充足"%(slot["date"], slot["time"], slot["from_city"], slot["to_city"]))
        # 1 means tickets are available.
        return 1
    else:
        print("客服:%s%s从%s到%s无票" % (slot["date"], slot["time"], slot["from_city"], slot["to_city"]))
        print("End !!!")
        print("----*------*-----*-----*----*-----*-----*-----*-----*------")
        print("-------------------------------------------------------------")
        # 0 means no tickets.
        return 0


def fun_book():
    """Perform the booking action and report it to the user."""
    print("客服:已为您完成订票。\n\n\n")
    print("End !!!")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("-------------------------------------------------------------")


def main():
    """Train the classifier and serve users in an endless dialogue loop."""
    clf_obj = CLF_MODEL()
    # Train the intent-recognition model.
    clf_obj.train()
    # An intent is only accepted when its score reaches this threshold.
    threshold = 0.55
    while True:
        clf_result, score, sentence = fun_wait(clf_obj)
        # wait -> wait: score below the threshold, or intent is "terminate".
        if score < threshold or clf_result == 2:
            continue

        # wait -> search: intent is "query" or "book"; collect the details.
        search_result = fun_search(clf_result, sentence)
        # search -> wait: no tickets available.
        if search_result == 0:
            continue

        # Tickets are available: ask whether to book.
        sentence = input("客服:需要为您订票吗?\n")
        clf_result, score = clf_obj.fun_clf(sentence)
        # search -> book: tickets available and the user wants to book.
        if clf_result == 1:
            fun_book()


if __name__=="__main__":
    try:
        main()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C or end of input ends the service without a traceback.
        print()