Commit bdaf169e by 20200203063

Replace project2_main.py

parent 7d14c2cd
......@@ -6,7 +6,8 @@ import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import jieba
import numpy as np
# -----------------------------------------------------
# 加载停用词词典
......@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
for word in fr:
stopwords[word.strip()] = 0
# -----------------------------------------------------
# Load the synonym dictionary.
# Each usable line of simi.txt holds at least two whitespace-separated
# fields: a word followed by the form it should be replaced with.
simi = {}
with open(r'simi.txt', 'r', encoding='utf-8') as sr:
    for line in sr:
        items = line.strip().split()
        # Blank lines and lines with a single field carry no mapping; skip them.
        if len(items) >= 2:
            # Store the pair in `simi`. Writing it into `stopwords` (as the
            # code previously did) left `simi` permanently empty and turned
            # every synonym key into a stop word.
            simi[items[0]] = items[1]
# -----------------------------------------------------
# 定义类
......@@ -41,13 +50,10 @@ class CLF_MODEL:
"""
TODO:利用sklearn中的函数进行训练,将句子转化为特征features
"""
self.vectorizer = TfidfVectorizer()
features = self.vectorizer.fit_transform(d_train.sentence_train)
print(features.shape)
self.model = LogisticRegression(penalty='l1', solver='saga', tol=0.1)
features = self.vectorizer.fit_transform(d_train.sentence_fenci.to_list())
self.model.fit(features, d_train.label)
score = self.model.score(features, d_train.label)
print("Test score with L1 penalty: %.4f" % score)
# 预测模块(使用模型预测)
def predict_model(self, sentence):
......@@ -64,13 +70,15 @@ class CLF_MODEL:
"""
TODO:利用已训练好的意图分类模型进行意图识别
"""
X_pred = self.vectorizer.transform([sentence])
y_pred = self.model.predict(X_pred)
clf_result = y_pred[0]
y_score = self.model.predict_proba(X_pred)
score = y_score[0][clf_result]
sent = self.fun_clean(' '.join(fool.cut(sentence)[0]))
inputs = self.vectorizer.transform([sent])
scores = self.model.predict_proba(inputs)[0]
clf_result = np.argmax(scores, axis=0)
score = scores[clf_result]
return clf_result, score
# 预测模块(使用规则)
def predict_rule(self, sentence):
# 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合"模型"及"规则"的预测方式
......@@ -94,24 +102,24 @@ class CLF_MODEL:
"""
TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等
"""
tokens = map(lambda x:simi.get(x,x), sentence.split())
tokens = filter(lambda x:x not in stopwords, tokens)
sentence = ' '.join(tokens)
return sentence
result = []
for word in sentence.split(" "):
if word in stopwords:
continue
result.append(word)
return " ".join(result)
# 分类主函数
def fun_clf(self, sentence):
# 函数目标:意图识别主函数
# input:sentence( 用户输入语句)
# output:clf_result(意图类别),score(意图分数)
s = " ".join(jieba.cut(sentence))
# 对用户输入进行预处理
s = self.fun_clean(s)
sentence = self.fun_clean(sentence)
# 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)
clf_result, score = self.predict_model(s) # 使用训练的模型进行意图预测
clf_result, score = self.predict_model(sentence) # 使用训练的模型进行意图预测
# clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)
return clf_result, score
......@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None):
# output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)
slot = {}
slot_tmp = {}
# 进行实体识别
words, ners = fool.analysis(sentence)
slot = {"time":"", "date":"", "from_city":"", "to_city":""}
"""
TODO:从sentence中寻找需要的内容,完成填槽工作
"""
for ner in ners:
for n in ner:
start, end, genre, content = n
if "location" == genre:
if start > 0 and sentence[start-1] not in ["到","达","回","去","飞","往"]:
slot_tmp["to_city"] = content
else:
slot_tmp["from_city"] = content
elif "time" == genre:
date = re.search(r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))",content)
if date is not None:
slot_tmp["date"] = date.group()
time = re.search(r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?",content)
if time is not None:
slot_tmp["time"] = time.group()
if key is None:
slot = slot_tmp
else:
for k,v in slot_tmp.items():
if k == key:
slot[key] = v
return slot
for item in ners:
name, value = item[2], item[3]
if name=='location':
if 'from_city' in slot:
slot['to_city']=value
else:
slot['from_city']=value
else:
slot[name]=value
return slot if not key else slot.get(key,{})
def fun_wait(clf_obj):
......@@ -180,8 +174,7 @@ def fun_wait(clf_obj):
print("Starting ...")
sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
# 对用户输入进行意图识别
s = " ".join(jieba.cut(sentence))
clf_result, score = clf_obj.fun_clf(s)
clf_result, score = clf_obj.fun_clf(sentence)
return clf_result, score, sentence
......@@ -235,9 +228,6 @@ def fun_book():
if __name__=="__main__":
# 实例化对象
clf_obj = CLF_MODEL()
......@@ -248,7 +238,6 @@ if __name__=="__main__":
# 循环提供服务
while 1:
clf_result, score, sentence = fun_wait(clf_obj)
print(clf_result, score, sentence)
# -------------------------------------------------------------------------------
# 状态转移条件(等待-->等待):用户输入未达到“查询”、“订票”类别的阈值 OR 意图被分类为“终止服务”
# -------------------------------------------------------------------------------
......@@ -279,4 +268,3 @@ if __name__=="__main__":
if clf_result == 1:
fun_book()
continue
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment