Commit 23d8d868 by 20200203048

first commit

parents
File added
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/project2_greedy.iml" filepath="$PROJECT_DIR$/.idea/project2_greedy.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (Workspace)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="pytest" />
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="f0c9c573-b9ae-4bf1-8e72-d6720fc99052" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file leaf-file-name="project2_main.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/project2_main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="447">
<caret line="143" column="33" selection-start-line="143" selection-start-column="33" selection-end-line="143" selection-end-column="33" />
<folding>
<element signature="e#16#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="project2_main_original.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/project2_main_original.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
</file>
</leaf>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>book</find>
<find>fun_replace_num</find>
<find>replace</find>
<find>clf_model</find>
</findStrings>
<replaceStrings>
<replace>CLF_MODEL</replace>
</replaceStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/project2_20190526.py" />
<option value="$PROJECT_DIR$/project2_main.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" fullScreen="true">
<option name="y" value="23" />
<option name="width" value="1440" />
<option name="height" value="798" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="project2_greedy_upload的副本" type="b2602c69:ProjectViewProjectNode" />
<item name="project2_greedy_upload的副本" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.project2_main">
<configuration name="project2_20190526" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="project2_greedy" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_20190526.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
</configuration>
<configuration name="project2_main" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="project2_greedy" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_main.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
</configuration>
<configuration name="project2_main_original" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="project2_greedy" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/project2_main_original.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
</configuration>
<list>
<item itemvalue="Python.project2_20190526" />
<item itemvalue="Python.project2_main_original" />
<item itemvalue="Python.project2_main" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.project2_main" />
<item itemvalue="Python.project2_main_original" />
<item itemvalue="Python.project2_20190526" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="f0c9c573-b9ae-4bf1-8e72-d6720fc99052" name="Default Changelist" comment="" />
<created>1558967327224</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1558967327224</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="0" y="0" width="1440" height="900" extended-state="0" />
<editor active="true" />
<layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.15694444" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Python Console" order="7" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info anchor="bottom" id="Terminal" order="7" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.3985849" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Find" order="1" />
</layout>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/project2_20190526.py</url>
<line>184</line>
<option name="timeStamp" value="6" />
</line-breakpoint>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/project2_20190526.py</url>
<line>106</line>
<option name="timeStamp" value="13" />
</line-breakpoint>
</breakpoints>
<default-breakpoints>
<breakpoint type="python-exception">
<properties notifyOnTerminate="true" exception="BaseException">
<option name="notifyOnTerminate" value="true" />
</properties>
</breakpoint>
</default-breakpoints>
</breakpoint-manager>
</component>
<component name="editorHistoryManager">
<entry file="file://E:/Program Files/Anaconda3/Lib/urllib/parse.py" />
<entry file="file://E:/Program Files/Anaconda3/Lib/codecs.py" />
<entry file="file://E:/Program Files/Anaconda3/Lib/site-packages/sklearn/feature_extraction/text.py" />
<entry file="file://$PROJECT_DIR$/project2_20190526.py" />
<entry file="file://$PROJECT_DIR$/对话示例.png">
<provider selected="true" editor-type-id="images" />
</entry>
<entry file="file://$PROJECT_DIR$/project2_main_original.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/project2_main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="447">
<caret line="143" column="33" selection-start-line="143" selection-start-column="33" selection-end-line="143" selection-end-column="33" />
<folding>
<element signature="e#16#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
\ No newline at end of file
project2_main.py python执行脚本(所有的代码位置)
data_train.xlsx 意图分类训练数据
stopword.txt 停用词
File added
# coding=utf-8
import pandas as pd
import fool
import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import jieba
# -----------------------------------------------------
# 加载停用词词典
# Stop-word lookup table: every stripped line of stopword.txt becomes a key.
# Only key membership is used by the visible code; the value 0 is a placeholder.
stopwords = {}
with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
    stopwords.update(dict.fromkeys((line.strip() for line in fr), 0))
# -----------------------------------------------------
# 定义类
class CLF_MODEL:
    """Intent classifier: preprocessing, training, and model/rule based prediction.

    Intent labels as used by the dialogue loop in this file:
    0 = query, 1 = book a ticket, 2 = stop the service.
    """

    # Short confirmations that the training samples do not cover; they are
    # always mapped to the "book" intent (label 1) with a fixed score.
    _AFFIRMATIVE = ("好的", "需要", "是的", "要的", "好", "要", "是")
    _RULE_SCORE = 0.8

    def __init__(self):
        """Create an untrained classifier; call train() before predicting."""
        self.model = None       # fitted LogisticRegression, set by train()
        self.vectorizer = None  # fitted TfidfVectorizer, set by train()

    @classmethod
    def _is_affirmative(cls, sentence):
        """Return True when `sentence`, ignoring all whitespace, is a short confirmation."""
        # Whitespace is dropped so that a segmented reply such as "要 的" still matches.
        return "".join(sentence.split()) in cls._AFFIRMATIVE

    def train(self):
        """Read data_train.xlsx, fit the TF-IDF vectorizer and the intent model.

        Stores the fitted objects in self.vectorizer and self.model.
        Takes no input and returns nothing; progress is printed to stdout.
        """
        # Load the training samples from the Excel file.
        # Assumes columns `sentence_train` (space-separated tokens) and `label` — TODO confirm.
        d_train = pd.read_excel("data_train.xlsx")
        # Preprocess every training sentence.
        d_train["sentence_train"] = d_train["sentence_train"].apply(self.fun_clean)
        print("训练样本 = %d" % len(d_train))

        # Turn the sentences into TF-IDF features.
        # NOTE(review): TfidfVectorizer's default token_pattern ignores
        # single-character tokens — confirm this is intended for Chinese text.
        self.vectorizer = TfidfVectorizer()
        features = self.vectorizer.fit_transform(d_train["sentence_train"])
        print(features.shape)

        self.model = LogisticRegression(penalty='l1', solver='saga', tol=0.1)
        self.model.fit(features, d_train["label"])
        # The score is measured on the data the model was fitted on, so it is
        # a training score, not a held-out test score.
        score = self.model.score(features, d_train["label"])
        print("Training score with L1 penalty: %.4f" % score)

    def predict_model(self, sentence):
        """Predict the intent of a preprocessed sentence with the trained model.

        input:  sentence (preprocessed user input)
        output: clf_result (intent label), score (probability of that label)
        """
        # Special handling for confirmations that are missing from the samples.
        if self._is_affirmative(sentence):
            return 1, self._RULE_SCORE

        X_pred = self.vectorizer.transform([sentence])
        clf_result = self.model.predict(X_pred)[0]
        probabilities = self.model.predict_proba(X_pred)[0]
        # predict_proba columns follow model.classes_, so look the column up by
        # position instead of assuming that label values are 0..k-1.
        class_index = list(self.model.classes_).index(clf_result)
        return clf_result, probabilities[class_index]

    def predict_rule(self, sentence):
        """Predict the intent with keyword rules (fallback or complement to the model).

        input:  sentence (user input)
        output: clf_result (intent label), score (always 0.8)
        """
        sentence = sentence.replace(' ', '')
        # Refusals are tested first because they contain the booking keywords
        # (e.g. "不订" contains "订").
        if re.search(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):
            return 2, 0.8
        if re.search(r'订|定|预定|买|购', sentence) or sentence in self._AFFIRMATIVE:
            return 1, 0.8
        return 0, 0.8

    def fun_clean(self, sentence):
        """Remove stop words from a space-separated sentence.

        input:  sentence (tokens joined by single spaces)
        output: the remaining tokens joined by single spaces
        """
        # TODO (from the original author): also replace entities with unified symbols.
        return " ".join(word for word in sentence.split(" ") if word not in stopwords)

    def fun_clf(self, sentence):
        """Main entry point of intent recognition.

        input:  sentence (raw user input)
        output: clf_result (0 = query, 1 = book, 2 = stop service), score
        """
        # Check confirmations on the raw input: segmentation and stop-word
        # removal below may split or drop them before predict_model sees them.
        if self._is_affirmative(sentence):
            return 1, self._RULE_SCORE

        s = " ".join(jieba.cut(sentence))
        # Preprocess the user input.
        s = self.fun_clean(s)
        # Predict with the trained model.
        # Alternative: self.predict_rule(sentence) uses the keyword rules instead.
        clf_result, score = self.predict_model(s)
        return clf_result, score
def fun_replace_num(sentence):
    """Replace Chinese numerals in `sentence` with Arabic digits.

    Purpose: make times and dates easier for the entity recogniser to detect.

    Numerals built from 一..九 and 十 are converted as whole numbers from 1 to 99
    ("十" -> "10", "十一" -> "11", "二十三" -> "23"); a lone digit character is
    converted on its own ("三" -> "3"). Replacing character by character would
    turn "十一" into "101", which is why a single regex pass is used.

    input:  sentence (str)
    output: sentence with the numerals replaced (str)
    """
    digits = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5,
              "六": 6, "七": 7, "八": 8, "九": 9}

    def _to_arabic(match):
        token = match.group()
        if "十" not in token:
            return str(digits[token])
        tens, _, ones = token.partition("十")
        # A missing tens digit means "ten" (十一 = 11); a missing ones digit means 0.
        return str(digits.get(tens, 1) * 10 + digits.get(ones, 0))

    # The alternative containing 十 comes first so that it wins over the
    # single-digit alternative at the same position.
    numeral = r"[一二三四五六七八九]?十[一二三四五六七八九]?|[一二三四五六七八九]"
    return re.sub(numeral, _to_arabic, sentence)
def slot_fill(sentence, key=None):
    """Extract ticket slots (time, date, from_city, to_city) from `sentence`.

    input:
        sentence: user input.
        key: optional slot name; when given, only that slot is extracted.
    output:
        key is None -> dict holding only the slots that were recognised.
        key given   -> dict with all four slots, every one "" except `key`,
                       which holds the recognised value if there is one.

    A location directly preceded by one of 到/达/回/去/飞/往 is the destination.
    Any other location is the departure city, unless the caller asked for one
    specific city slot: then a location without such a marker answers that
    question (a bare "上海" replying to the arrival-city prompt fills to_city).
    """
    to_markers = ("到", "达", "回", "去", "飞", "往")
    city_slots = ("from_city", "to_city")
    date_pattern = r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))"
    time_pattern = r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?"

    # Named-entity recognition. Each entity is unpacked as
    # (start, end, type, text); assumes `start` is the character offset of the
    # entity inside `sentence` — TODO confirm against the fool package.
    _, ners = fool.analysis(sentence)

    found = {}
    for ner in ners:
        for start, _end, genre, content in ner:
            if genre == "location":
                if start > 0 and sentence[start - 1] in to_markers:
                    found["to_city"] = content
                elif key in city_slots:
                    found[key] = content
                else:
                    found["from_city"] = content
            elif genre == "time":
                date = re.search(date_pattern, content)
                if date is not None:
                    found["date"] = date.group()
                time = re.search(time_pattern, content)
                if time is not None:
                    found["time"] = time.group()

    if key is None:
        return found

    slot = {"time": "", "date": "", "from_city": "", "to_city": ""}
    if key in found:
        slot[key] = found[key]
    return slot
def fun_wait(clf_obj):
    """Waiting state: prompt the user for a request and classify its intent.

    input:  clf_obj (CLF_MODEL instance, or any object with a fun_clf(sentence) method)
    output: clf_result (intent label), score (intent score), sentence (raw user input)
    """
    # Wait for the user input.
    print("\n\n\n")
    print("-------------------------------------------------------------")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("Starting ...")
    sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
    # fun_clf performs word segmentation itself, so the raw sentence is passed
    # on; segmenting here as well would tokenise the text twice.
    clf_result, score = clf_obj.fun_clf(sentence)
    return clf_result, score, sentence
def fun_search(clf_result, sentence):
    """Collect the trip details from the user and report ticket availability.

    input:  clf_result (intent label; not used in this function),
            sentence (the user's first utterance)
    output: 1 when tickets are available, 0 otherwise
    """
    # Human-readable slot names used in the follow-up questions.
    name = {"time":"出发时间", "date":"出发日期", "from_city":"出发城市", "to_city":"到达城市"}
    # Slot storage; "" marks a slot that still has to be filled.
    slot = {"time":"", "date":"", "from_city":"", "to_city":""}

    # Fill as many slots as possible from the first utterance.
    slot.update(slot_fill(fun_replace_num(sentence)))

    # Ask a targeted question for every slot that is still empty, and repeat
    # until no empty slot is left.
    while "" in slot.values():
        for pending in slot.keys():
            if slot[pending] != "":
                continue
            answer = input("客服:请问%s是?\n"%(name[pending]))
            answer = fun_replace_num(answer)
            for filled, value in slot_fill(answer, pending).items():
                if slot[filled] == "":
                    slot[filled] = value

    # Availability is simulated with a random number; a real system would
    # query a database here.
    available = random.random() > 0.5
    details = (slot["date"], slot["time"], slot["from_city"], slot["to_city"])
    if available:
        print("客服:%s%s从%s到%s的票充足" % details)
        # 1 means tickets are available.
        return 1

    print("客服:%s%s从%s到%s无票" % details)
    print("End !!!")
    print("----*------*-----*-----*----*-----*-----*-----*-----*------")
    print("-------------------------------------------------------------")
    # 0 means no tickets.
    return 0
def fun_book():
    """Booking state: tell the user the order was placed and print the closing banner.

    input:  none
    output: none
    """
    closing_lines = (
        "客服:已为您完成订票。\n\n\n",
        "End !!!",
        "----*------*-----*-----*----*-----*-----*-----*-----*------",
        "-------------------------------------------------------------",
    )
    for line in closing_lines:
        print(line)
if __name__=="__main__":
    # Dialogue state machine: waiting -> searching -> booking -> waiting.
    # Instantiate the classifier and train the intent model.
    clf_obj = CLF_MODEL()
    clf_obj.train()
    # A classification is accepted only when its score reaches this threshold;
    # this filters out low-confidence intent predictions.
    threshold = 0.55
    # Serve requests forever.
    while True:
        clf_result, score, sentence = fun_wait(clf_obj)
        print(clf_result, score, sentence)
        # waiting -> waiting: the score is below the threshold, or the intent
        # is "stop the service" (label 2).
        if score < threshold or clf_result == 2:
            continue
        # waiting -> searching: the intent is "query" or "book";
        # collect the trip details and look for tickets.
        search_result = fun_search(clf_result, sentence)
        # searching -> waiting: no tickets available.
        if search_result == 0:
            continue
        # Tickets are available: ask whether to book them.
        sentence = input("客服:需要为您订票吗?\n")
        clf_result, score = clf_obj.fun_clf(sentence)
        # searching -> booking: the reply is classified as "book" (label 1).
        # The same threshold as above is applied so that a low-confidence
        # prediction never places an order.
        if clf_result == 1 and score >= threshold:
            fun_book()
一下
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment