Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
project5
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
20200203063
project5
Commits
bdaf169e
Commit
bdaf169e
authored
4 years ago
by
20200203063
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Replace project2_main.py
parent
7d14c2cd
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
39 additions
and
51 deletions
+39
-51
project2_main.py
+39
-51
No files found.
project2_main.py
View file @
bdaf169e
...
@@ -6,7 +6,8 @@ import re
...
@@ -6,7 +6,8 @@ import re
import
random
import
random
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.linear_model
import
LogisticRegression
import
jieba
import
numpy
as
np
# -----------------------------------------------------
# -----------------------------------------------------
# 加载停用词词典
# 加载停用词词典
...
@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
...
@@ -15,6 +16,14 @@ with open(r'stopword.txt', 'r', encoding='utf-8') as fr:
for
word
in
fr
:
for
word
in
fr
:
stopwords
[
word
.
strip
()]
=
0
stopwords
[
word
.
strip
()]
=
0
# -----------------------------------------------------
# -----------------------------------------------------
# 加载同义词词典
simi
=
{}
with
open
(
r'simi.txt'
,
'r'
,
encoding
=
'utf-8'
)
as
sr
:
for
line
in
sr
:
items
=
line
.
strip
()
.
split
()
if
len
(
items
)
>=
2
:
stopwords
[
items
[
0
]]
=
items
[
1
]
# -----------------------------------------------------
# 定义类
# 定义类
...
@@ -41,13 +50,10 @@ class CLF_MODEL:
...
@@ -41,13 +50,10 @@ class CLF_MODEL:
"""
"""
TODO:利用sklearn中的函数进行训练,将句子转化为特征features
TODO:利用sklearn中的函数进行训练,将句子转化为特征features
"""
"""
self
.
vectorizer
=
TfidfVectorizer
()
features
=
self
.
vectorizer
.
fit_transform
(
d_train
.
sentence_train
)
features
=
self
.
vectorizer
.
fit_transform
(
d_train
.
sentence_fenci
.
to_list
())
print
(
features
.
shape
)
self
.
model
=
LogisticRegression
(
penalty
=
'l1'
,
solver
=
'saga'
,
tol
=
0.1
)
self
.
model
.
fit
(
features
,
d_train
.
label
)
self
.
model
.
fit
(
features
,
d_train
.
label
)
score
=
self
.
model
.
score
(
features
,
d_train
.
label
)
print
(
"Test score with L1 penalty:
%.4
f"
%
score
)
# 预测模块(使用模型预测)
# 预测模块(使用模型预测)
def
predict_model
(
self
,
sentence
):
def
predict_model
(
self
,
sentence
):
...
@@ -64,13 +70,15 @@ class CLF_MODEL:
...
@@ -64,13 +70,15 @@ class CLF_MODEL:
"""
"""
TODO:利用已训练好的意图分类模型进行意图识别
TODO:利用已训练好的意图分类模型进行意图识别
"""
"""
X_pred
=
self
.
vectorizer
.
transform
([
sentence
])
sent
=
self
.
fun_clean
(
' '
.
join
(
fool
.
cut
(
sentence
)[
0
]))
y_pred
=
self
.
model
.
predict
(
X_pred
)
inputs
=
self
.
vectorizer
.
transform
([
sent
])
clf_result
=
y_pred
[
0
]
scores
=
self
.
model
.
predict_proba
(
inputs
)[
0
]
y_score
=
self
.
model
.
predict_proba
(
X_pred
)
clf_result
=
np
.
argmax
(
scores
,
axis
=
0
)
score
=
y_score
[
0
][
clf_result
]
score
=
scores
[
clf_result
]
return
clf_result
,
score
return
clf_result
,
score
# 预测模块(使用规则)
# 预测模块(使用规则)
def
predict_rule
(
self
,
sentence
):
def
predict_rule
(
self
,
sentence
):
# 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合"模型"及"规则"的预测方式
# 函数目标:如果模型训练出现异常,可以使用规则进行预测,同时也可以让学员融合"模型"及"规则"的预测方式
...
@@ -94,24 +102,24 @@ class CLF_MODEL:
...
@@ -94,24 +102,24 @@ class CLF_MODEL:
"""
"""
TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等
TODO:预处理函数,将必要的实体转换成统一符号(利于分类准确),去除停用词等
"""
"""
tokens
=
map
(
lambda
x
:
simi
.
get
(
x
,
x
),
sentence
.
split
())
tokens
=
filter
(
lambda
x
:
x
not
in
stopwords
,
tokens
)
sentence
=
' '
.
join
(
tokens
)
return
sentence
result
=
[]
for
word
in
sentence
.
split
(
" "
):
if
word
in
stopwords
:
continue
result
.
append
(
word
)
return
" "
.
join
(
result
)
# 分类主函数
# 分类主函数
def
fun_clf
(
self
,
sentence
):
def
fun_clf
(
self
,
sentence
):
# 函数目标:意图识别主函数
# 函数目标:意图识别主函数
# input:sentence( 用户输入语句)
# input:sentence( 用户输入语句)
# output:clf_result(意图类别),score(意图分数)
# output:clf_result(意图类别),score(意图分数)
s
=
" "
.
join
(
jieba
.
cut
(
sentence
))
# 对用户输入进行预处理
# 对用户输入进行预处理
s
=
self
.
fun_clean
(
s
)
s
entence
=
self
.
fun_clean
(
sentence
)
# 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)
# 得到意图分类结果(0为“查询”类别,1为“订票”类别,2为“终止服务”类别)
clf_result
,
score
=
self
.
predict_model
(
s
)
# 使用训练的模型进行意图预测
clf_result
,
score
=
self
.
predict_model
(
s
entence
)
# 使用训练的模型进行意图预测
# clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)
# clf_result, score = self.predict_rule(sentence) # 使用规则进行意图预测(可与用模型进行意图识别的方法二选一)
return
clf_result
,
score
return
clf_result
,
score
...
@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None):
...
@@ -134,38 +142,24 @@ def slot_fill(sentence, key=None):
# output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)
# output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值)
slot
=
{}
slot
=
{}
slot_tmp
=
{}
# 进行实体识别
# 进行实体识别
words
,
ners
=
fool
.
analysis
(
sentence
)
words
,
ners
=
fool
.
analysis
(
sentence
)
slot
=
{
"time"
:
""
,
"date"
:
""
,
"from_city"
:
""
,
"to_city"
:
""
}
"""
"""
TODO:从sentence中寻找需要的内容,完成填槽工作
TODO:从sentence中寻找需要的内容,完成填槽工作
"""
"""
for
ner
in
ners
:
for
n
in
ner
:
for
item
in
ners
:
start
,
end
,
genre
,
content
=
n
name
,
value
=
item
[
2
],
item
[
3
]
if
"location"
==
genre
:
if
name
==
'location'
:
if
start
>
0
and
sentence
[
start
-
1
]
not
in
[
"到"
,
"达"
,
"回"
,
"去"
,
"飞"
,
"往"
]
:
if
'from_city'
in
slot
:
slot_tmp
[
"to_city"
]
=
content
slot
[
'to_city'
]
=
value
else
:
else
:
slot_tmp
[
"from_city"
]
=
content
slot
[
'from_city'
]
=
value
elif
"time"
==
genre
:
date
=
re
.
search
(
r"(((\d{2}|\d{4})年)?(\d{1,2}月)?\d{1,2}(号|日)|(今天|明天|后天|((周|礼拜|星期)[123456日])))"
,
content
)
if
date
is
not
None
:
slot_tmp
[
"date"
]
=
date
.
group
()
time
=
re
.
search
(
r"(上午|下午|晚上|凌晨|白天|早|晚)(\d{1,2}(时|点))?(\d{1,2}分?)?"
,
content
)
if
time
is
not
None
:
slot_tmp
[
"time"
]
=
time
.
group
()
if
key
is
None
:
slot
=
slot_tmp
else
:
else
:
for
k
,
v
in
slot_tmp
.
items
():
slot
[
name
]
=
value
if
k
==
key
:
slot
[
key
]
=
v
return
slot
return
slot
if
not
key
else
slot
.
get
(
key
,{})
def
fun_wait
(
clf_obj
):
def
fun_wait
(
clf_obj
):
...
@@ -180,8 +174,7 @@ def fun_wait(clf_obj):
...
@@ -180,8 +174,7 @@ def fun_wait(clf_obj):
print
(
"Starting ..."
)
print
(
"Starting ..."
)
sentence
=
input
(
"客服:请问需要什么服务?(时间请用12小时制表示)
\n
"
)
sentence
=
input
(
"客服:请问需要什么服务?(时间请用12小时制表示)
\n
"
)
# 对用户输入进行意图识别
# 对用户输入进行意图识别
s
=
" "
.
join
(
jieba
.
cut
(
sentence
))
clf_result
,
score
=
clf_obj
.
fun_clf
(
sentence
)
clf_result
,
score
=
clf_obj
.
fun_clf
(
s
)
return
clf_result
,
score
,
sentence
return
clf_result
,
score
,
sentence
...
@@ -235,9 +228,6 @@ def fun_book():
...
@@ -235,9 +228,6 @@ def fun_book():
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# 实例化对象
# 实例化对象
clf_obj
=
CLF_MODEL
()
clf_obj
=
CLF_MODEL
()
...
@@ -248,7 +238,6 @@ if __name__=="__main__":
...
@@ -248,7 +238,6 @@ if __name__=="__main__":
# 循环提供服务
# 循环提供服务
while
1
:
while
1
:
clf_result
,
score
,
sentence
=
fun_wait
(
clf_obj
)
clf_result
,
score
,
sentence
=
fun_wait
(
clf_obj
)
print
(
clf_result
,
score
,
sentence
)
# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# 状态转移条件(等待-->等待):用户输入未达到“查询”、“订票”类别的阈值 OR 意图被分类为“终止服务”
# 状态转移条件(等待-->等待):用户输入未达到“查询”、“订票”类别的阈值 OR 意图被分类为“终止服务”
# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
...
@@ -279,4 +268,3 @@ if __name__=="__main__":
...
@@ -279,4 +268,3 @@ if __name__=="__main__":
if
clf_result
==
1
:
if
clf_result
==
1
:
fun_book
()
fun_book
()
continue
continue
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment