20200203063 / project5 / Commits / bdaf169e

Commit bdaf169e, authored Sep 01, 2020 by 20200203063
Replace project2_main.py
parent 7d14c2cd

Showing 1 changed file with 42 additions and 54 deletions

project2_main.py  (+42 / -54)

@@ -6,7 +6,8 @@ import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import jieba
import numpy as np
# -----------------------------------------------------
# Load the stopword dictionary

@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
    for word in fr:
        stopwords[word.strip()] = 0
# -----------------------------------------------------
# Load the synonym dictionary
simi = {}
with open(r'simi.txt', 'r', encoding='utf-8') as sr:
    for line in sr:
        items = line.strip().split()
        if len(items) >= 2:
            simi[items[0]] = items[1]
# -----------------------------------------------------
# Class definition

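To make the two dictionary formats concrete, here is a minimal sketch assuming stopword.txt holds one stopword per line and simi.txt holds whitespace-separated "variant canonical" pairs; the example entries and the sample sentence are made up, not taken from the project data.

# Minimal sketch of the two dictionaries and how they are used downstream.
# The entries below are hypothetical examples of file contents.
stopwords = {}
for word in ["的", "了", "请问"]:          # e.g. lines read from stopword.txt
    stopwords[word.strip()] = 0

simi = {}
for line in ["飞机票 机票", "航班 机票"]:   # e.g. lines read from simi.txt
    items = line.strip().split()
    if len(items) >= 2:
        simi[items[0]] = items[1]          # map a variant to its canonical form

tokens = "请问 有 去 北京 的 飞机票 吗".split()
tokens = [simi.get(t, t) for t in tokens if t not in stopwords]
print(" ".join(tokens))                    # -> "有 去 北京 机票 吗"
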
@@ -41,13 +50,10 @@ class CLF_MODEL:
        """
        TODO: train with sklearn functions; convert the sentences into feature vectors (features)
        """
        self.vectorizer = TfidfVectorizer()
        features = self.vectorizer.fit_transform(d_train.sentence_train)
        print(features.shape)
        self.model = LogisticRegression(penalty='l1', solver='saga', tol=0.1)
        features = self.vectorizer.fit_transform(d_train.sentence_fenci.to_list())
        self.model.fit(features, d_train.label)
        score = self.model.score(features, d_train.label)
        print("Test score with L1 penalty: %.4f" % score)

    # Prediction module (model-based prediction)
    def predict_model(self, sentence):

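For readers unfamiliar with the sklearn pattern used in the training method above, the following self-contained sketch shows TF-IDF vectorization plus an L1-penalized logistic regression on a toy corpus; the sentences and labels are invented, and the key point is that the fitted vectorizer must be reused with transform() at prediction time.

# Sketch of the TF-IDF + LogisticRegression pattern; the toy data is hypothetical,
# with 0 = "query" and 1 = "book ticket" as in the comments above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

sentences = ["查询 明天 的 航班", "帮 我 订 一张 机票", "有 去 北京 的 航班 吗", "我 要 订票"]
labels = [0, 1, 0, 1]

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(sentences)       # fit the vocabulary on training data
model = LogisticRegression(penalty='l1', solver='saga', tol=0.1)
model.fit(features, labels)

# At prediction time the *same* fitted vectorizer is reused with transform().
x = vectorizer.transform(["订 一张 去 上海 的 机票"])
print(model.predict(x), model.predict_proba(x))
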
@@ -64,13 +70,15 @@ class CLF_MODEL:
        """
        TODO: use the trained intent classification model to recognize the intent
        """
        X_pred = self.vectorizer.transform([sentence])
        y_pred = self.model.predict(X_pred)
        clf_result = y_pred[0]
        y_score = self.model.predict_proba(X_pred)
        score = y_score[0][clf_result]
        sent = self.fun_clean(' '.join(fool.cut(sentence)[0]))
        inputs = self.vectorizer.transform([sent])
        scores = self.model.predict_proba(inputs)[0]
        clf_result = np.argmax(scores, axis=0)
        score = scores[clf_result]
        return clf_result, score

    # Prediction module (rule-based prediction)
    def predict_rule(self, sentence):
        # Purpose: if model training runs into problems, rules can be used for prediction instead;
        # it also lets students combine the "model" and "rule" prediction approaches

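The class/score pair returned by the model-based prediction comes from taking the argmax over predict_proba(); a minimal sketch with a made-up probability row, standing in for self.model.predict_proba(inputs)[0]:

# Sketch of the "argmax over predicted probabilities" pattern used above.
import numpy as np

probs = np.array([0.15, 0.70, 0.15])   # hypothetical P(class) for classes 0, 1, 2
clf_result = np.argmax(probs, axis=0)  # index of the most probable intent
score = probs[clf_result]              # its probability, used as a confidence score
print(clf_result, score)               # -> 1 0.7
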
@@ -94,24 +102,24 @@ class CLF_MODEL:
        """
        TODO: preprocessing function; map the necessary entities to unified symbols (helps classification accuracy), remove stopwords, etc.
        """
        tokens = map(lambda x: simi.get(x, x), sentence.split())
        tokens = filter(lambda x: x not in stopwords, tokens)
        sentence = ' '.join(tokens)
        return sentence
        result = []
        for word in sentence.split(" "):
            if word in stopwords:
                continue
            result.append(word)
        return " ".join(result)

    # Main classification function
    def fun_clf(self, sentence):
        # Purpose: main intent-recognition function
        # input: sentence (the user's input utterance)
        # output: clf_result (intent class), score (intent confidence)
        s = " ".join(jieba.cut(sentence))
        # Preprocess the user input
        s = self.fun_clean(s)
        sentence = self.fun_clean(sentence)
        # Get the intent classification result (0 = "query", 1 = "book ticket", 2 = "end service")
        clf_result, score = self.predict_model(s)  # Predict the intent with the trained model
        clf_result, score = self.predict_model(sentence)  # Predict the intent with the trained model
        # clf_result, score = self.predict_rule(sentence)  # Rule-based intent prediction (alternative to the model-based method)
        return clf_result, score

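A small sketch of the segmentation-plus-cleaning step that the main classification function performs before prediction, using jieba.cut as in the code; the sample sentence and stopword list are hypothetical, and the exact segmentation may differ between jieba versions.

# Sketch of the preprocessing step: segment with jieba, then drop stopwords.
import jieba

stopwords = {"我": 0, "想": 0, "的": 0}               # hypothetical stopword set
sentence = "我想订一张明天去北京的机票"
s = " ".join(jieba.cut(sentence))                     # e.g. "我 想 订 一张 明天 去 北京 的 机票"
s = " ".join(w for w in s.split() if w not in stopwords)
print(s)                                              # e.g. "订 一张 明天 去 北京 机票"
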
@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None):
    # output: slot (the slot-filling result, returned in JSON format; key = slot name, value = filled value)
    slot = {}
    slot_tmp = {}
    # Run named-entity recognition
    words, ners = fool.analysis(sentence)
    slot = {"time": "", "date": "", "from_city": "", "to_city": ""}
    """
    TODO: find the required information in sentence and fill the slots
    """
    for ner in ners:
        for n in ner:
            start, end, genre, content = n
            if "location" == genre:
                if start > 0 and sentence[start - 1] not in ["到", "达", "回", "去", "飞", "往"]:
                    slot_tmp["to_city"] = content
                else:
                    slot_tmp["from_city"] = content
            elif "time" == genre:
                date = re.search(r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))", content)
                if date is not None:
                    slot_tmp["date"] = date.group()
                time = re.search(r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?", content)
                if time is not None:
                    slot_tmp["time"] = time.group()
    if key is None:
        slot = slot_tmp
    else:
        for k, v in slot_tmp.items():
            if k == key:
                slot[key] = v
    return slot
    for item in ners:
        name, value = item[2], item[3]
        if name == 'location':
            if 'from_city' in slot:
                slot['to_city'] = value
            else:
                slot['from_city'] = value
        else:
            slot[name] = value
    return slot if not key else slot.get(key, {})


def fun_wait(clf_obj):

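The date and time regular expressions in slot_fill() can be tried in isolation; the two patterns below are the ones from the code, while the sample phrase is made up.

# Sketch of the date/time extraction used in slot_fill().
import re

content = "明天下午3点"
date = re.search(r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))", content)
time = re.search(r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?", content)
print(date.group() if date else "", time.group() if time else "")   # -> 明天 下午3点
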
@@ -180,8 +174,7 @@ def fun_wait(clf_obj):
    print("Starting ...")
    sentence = input("客服:请问需要什么服务?(时间请用12小时制表示)\n")
    # Recognize the intent of the user input
    s = " ".join(jieba.cut(sentence))
    clf_result, score = clf_obj.fun_clf(s)
    clf_result, score = clf_obj.fun_clf(sentence)
    return clf_result, score, sentence

@@ -235,9 +228,6 @@ def fun_book():
if __name__ == "__main__":
    # Instantiate the classifier object
    clf_obj = CLF_MODEL()

@@ -248,7 +238,6 @@ if __name__=="__main__":
    # Serve in a loop
    while 1:
        clf_result, score, sentence = fun_wait(clf_obj)
        print(clf_result, score, sentence)
        # -------------------------------------------------------------------------------
        # State transition (waiting --> waiting): the user input does not reach the score
        # threshold for the "query" or "book ticket" classes OR the intent is classified as "end service"
        # -------------------------------------------------------------------------------

@@ -279,4 +268,3 @@ if __name__=="__main__":
        if clf_result == 1:
            fun_book()
            continue