Commit bdaf169e by 20200203063

Replace project2_main.py

parent 7d14c2cd
...@@ -6,7 +6,8 @@ import re ...@@ -6,7 +6,8 @@ import re
import random import random
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
import jieba import numpy as np
# ----------------------------------------------------- # -----------------------------------------------------
# 加载停用词词典 # 加载停用词词典
...@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr: ...@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
for word in fr: for word in fr:
stopwords[word.strip()] = 0 stopwords[word.strip()] = 0
# ----------------------------------------------------- # -----------------------------------------------------
# Load the synonym dictionary.
# Each usable line of simi.txt holds at least two whitespace-separated
# columns: the word to be replaced and the replacement used in its place.
# Lines with fewer than two columns are skipped.
simi = {}
with open(r'simi.txt', 'r', encoding='utf-8') as sr:
    for line in sr:
        items = line.strip().split()
        if len(items) >= 2:
            # Store the pair in `simi`, not in `stopwords`: the preprocessing
            # step looks words up with simi.get(x, x), and a key placed in
            # `stopwords` would cause that word to be filtered out entirely
            # instead of being replaced by its synonym.
            simi[items[0]] = items[1]
# -----------------------------------------------------
# 定义类 # 定义类
...@@ -41,13 +50,10 @@ class CLF_MODEL: ...@@ -41,13 +50,10 @@ class CLF_MODEL:
""" """
TODO:利用sklearn中的函数进行训练,将句子转化为特征features TODO:利用sklearn中的函数进行训练,将句子转化为特征features
""" """
self.vectorizer = TfidfVectorizer()
features = self.vectorizer.fit_transform(d_train.sentence_train) features = self.vectorizer.fit_transform(d_train.sentence_fenci.to_list())
print(features.shape)
self.model = LogisticRegression(penalty='l1', solver='saga', tol=0.1)
self.model.fit(features, d_train.label) self.model.fit(features, d_train.label)
score = self.model.score(features, d_train.label)
print("Test score with L1 penalty: %.4f" % score)
# 预测模块(使用模型预测) # 预测模块(使用模型预测)
def predict_model(self, sentence): def predict_model(self, sentence):
...@@ -64,13 +70,15 @@ class CLF_MODEL: ...@@ -64,13 +70,15 @@ class CLF_MODEL:
""" """
TODO:利用已训练好的意图分类模型进行意图识别 TODO:利用已训练好的意图分类模型进行意图识别
""" """
X_pred = self.vectorizer.transform([sentence]) sent = self.fun_clean(' '.join(fool.cut(sentence)[0]))
y_pred = self.model.predict(X_pred) inputs = self.vectorizer.transform([sent])
clf_result = y_pred[0] scores = self.model.predict_proba(inputs)[0]
y_score = self.model.predict_proba(X_pred) clf_result = np.argmax(scores, axis=0)
score = y_score[0][clf_result] score = scores[clf_result]
return clf_result, score return clf_result, score
# 预测模块(使用规则) # 预测模块(使用规则)
def predict_rule(self, sentence): def predict_rule(self, sentence):
# 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合"模型"及"规则"的预测方式 # 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合"模型"及"规则"的预测方式
...@@ -94,24 +102,24 @@ class CLF_MODEL: ...@@ -94,24 +102,24 @@ class CLF_MODEL:
""" """
TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等 TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等
""" """
tokens = map(lambda x:simi.get(x,x), sentence.split())
tokens = filter(lambda x:x not in stopwords, tokens)
sentence = ' '.join(tokens)
return sentence
result = []
for word in sentence.split(" "):
if word in stopwords:
continue
result.append(word)
return " ".join(result)
# 分类主函数 # 分类主函数
def fun_clf(self, sentence): def fun_clf(self, sentence):
# 函数目标:意图识别主函数 # 函数目标:意图识别主函数
# input:sentence( 用户输入语句) # input:sentence( 用户输入语句)
# output:clf_result(意图类别),score(意图分数) # output:clf_result(意图类别),score(意图分数)
s = " ".join(jieba.cut(sentence))
# 对用户输入进行预处理 # 对用户输入进行预处理
s = self.fun_clean(s) sentence = self.fun_clean(sentence)
# 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别) # 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)
clf_result, score = self.predict_model(s) # 使用训练的模型进行意图预测 clf_result, score = self.predict_model(sentence) # 使用训练的模型进行意图预测
# clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一) # clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)
return clf_result, score return clf_result, score
...@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None): ...@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None):
# output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值) # output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)
slot = {} slot = {}
slot_tmp = {}
# 进行实体识别 # 进行实体识别
words, ners = fool.analysis(sentence) words, ners = fool.analysis(sentence)
slot = {"time":"", "date":"", "from_city":"", "to_city":""}
""" """
TODO:从sentence中寻找需要的内容,完成填槽工作 TODO:从sentence中寻找需要的内容,完成填槽工作
""" """
for ner in ners:
for n in ner: for item in ners:
start, end, genre, content = n name, value = item[2], item[3]
if "location" == genre: if name=='location':
if start > 0 and sentence[start-1] not in ["到","达","回","去","飞","往"]: if 'from_city' in slot:
slot_tmp["to_city"] = content slot['to_city']=value
else: else:
slot_tmp["from_city"] = content slot['from_city']=value
elif "time" == genre: else:
date = re.search(r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))",content) slot[name]=value
if date is not None:
slot_tmp["date"] = date.group() return slot if not key else slot.get(key,{})
time = re.search(r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?",content)
if time is not None:
slot_tmp["time"] = time.group()
if key is None:
slot = slot_tmp
else:
for k,v in slot_tmp.items():
if k == key:
slot[key] = v
return slot
def fun_wait(clf_obj): def fun_wait(clf_obj):
...@@ -180,8 +174,7 @@ def fun_wait(clf_obj): ...@@ -180,8 +174,7 @@ def fun_wait(clf_obj):
print("Starting ...") print("Starting ...")
sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n") sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
# 对用户输入进行意图识别 # 对用户输入进行意图识别
s = " ".join(jieba.cut(sentence)) clf_result, score = clf_obj.fun_clf(sentence)
clf_result, score = clf_obj.fun_clf(s)
return clf_result, score, sentence return clf_result, score, sentence
...@@ -235,9 +228,6 @@ def fun_book(): ...@@ -235,9 +228,6 @@ def fun_book():
if __name__=="__main__": if __name__=="__main__":
# 实例化对象 # 实例化对象
clf_obj = CLF_MODEL() clf_obj = CLF_MODEL()
...@@ -248,7 +238,6 @@ if __name__=="__main__": ...@@ -248,7 +238,6 @@ if __name__=="__main__":
# 循环提供服务 # 循环提供服务
while 1: while 1:
clf_result, score, sentence = fun_wait(clf_obj) clf_result, score, sentence = fun_wait(clf_obj)
print(clf_result, score, sentence)
# ------------------------------------------------------------------------------- # -------------------------------------------------------------------------------
# 状态转移条件(等待-->等待):用户输入未达到“查询”、“订票”类别的阈值 OR 意图被分类为“终止服务” # 状态转移条件(等待-->等待):用户输入未达到“查询”、“订票”类别的阈值 OR 意图被分类为“终止服务”
# ------------------------------------------------------------------------------- # -------------------------------------------------------------------------------
...@@ -279,4 +268,3 @@ if __name__=="__main__": ...@@ -279,4 +268,3 @@ if __name__=="__main__":
if clf_result == 1: if clf_result == 1:
fun_book() fun_book()
continue continue
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment