20200203063 / Project3 / Commits / 6251e257

Commit 6251e257, authored 4 years ago by 20200203063
Replace model_training.py
parent 742efdac

Showing 1 changed file with 92 additions and 1 deletion

model_training.py (+92 -1)
# encoding: utf-8
# This file contains the model training: given a dataset, it trains a sentiment
# classification model and stores the model files in the model/ folder.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import json
import pandas as pd
from tqdm import tqdm


class SentimentModel(object):
    def __init__(self, df_review=None, pos_star=4, neg_star=2):
        if df_review is not None:
            # Split reviews into positive/negative examples by star rating
            # and tokenize each review text with NLTK.
            self.pos_review = []
            self.neg_review = []
            for idx in tqdm(range(len(df_review))):
                review = df_review.iloc[idx]
                if review.stars >= pos_star:
                    self.pos_review.append(" ".join([word for word in word_tokenize(review.text)]))
                elif review.stars <= neg_star:
                    self.neg_review.append(" ".join([word for word in word_tokenize(review.text)]))
            print("Sample counts:\nPositive: {}\nNegative: {}".format(len(self.pos_review), len(self.neg_review)))
            # print("Sample examples:\nPositive: {}\nNegative: {}".format("\n".join(self.pos_review[:2]), "\n".join(self.neg_review[:2])))
            # Build TF-IDF features; label positive reviews 1 and negative reviews 0.
            self.vectorizer = TfidfVectorizer()
            corpus = self.pos_review + self.neg_review
            self.X = self.vectorizer.fit_transform(corpus)
            self.y = [1] * len(self.pos_review) + [0] * len(self.neg_review)
        else:
            self.clf = None
            self.vectorizer = None

    def train(self, test_size=0.25, *args, **kwargs):
        # Use .get() so a missing keyword argument does not raise KeyError.
        if kwargs.get("save_model_path") is not None:
            save_model_path = kwargs["save_model_path"]
        if kwargs.get("save_vectorizer_path") is not None:
            save_vectorizer_path = kwargs["save_vectorizer_path"]
        self.clf = LogisticRegression(random_state=0, C=10)
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=0)
        self.clf.fit(X_train, y_train)
        score = self.clf.score(X_test, y_test)
        # Persist the classifier and the fitted vectorizer for later prediction.
        with open(save_model_path, "wb") as f:
            pickle.dump(self.clf, f)
        with open(save_vectorizer_path, "wb") as f:
            pickle.dump(self.vectorizer, f)
        print("Finish training, score is {}.".format(str(score)))

    def predict(self, *args, **kwargs):
        # Lazily load the pickled classifier and vectorizer if they are not in memory.
        if self.clf is None:
            if kwargs.get("model_path") is not None:
                model_path = kwargs["model_path"]
                with open(model_path, "rb") as f:
                    self.clf = pickle.load(f)
            else:
                raise Exception("model_path missing")
        if self.vectorizer is None:
            if kwargs.get("vectorizer_path") is not None:
                vectorizer_path = kwargs["vectorizer_path"]
                with open(vectorizer_path, "rb") as f:
                    self.vectorizer = pickle.load(f)
            else:
                raise Exception("vectorizer_path missing")
        # Apply the same tokenization used during training, then vectorize and predict.
        data = args[0]
        tmp = []
        for d in data:
            tmp.append(" ".join([word for word in word_tokenize(d)]))
        data = tmp
        features = self.vectorizer.transform(data)
        result = self.clf.predict(features)
        score = self.clf.predict_proba(features)
        return result, score


if __name__ == "__main__":
    review_path = "data/review.json"
    model_path = "model/model.pkl"
    vector_path = "model/vector.pkl"
    is_train = False
    if is_train:
        # Read line-delimited JSON reviews into a DataFrame and train the model.
        reviews = []
        with open(review_path, "r", encoding="utf-8") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                ele = json.loads(line.strip())
                reviews.append(ele)
        review_df = pd.DataFrame(reviews)
        print(review_df.head())
        model = SentimentModel(review_df)
        model.train(save_model_path=model_path, save_vectorizer_path=vector_path)
    else:
        # Load the saved model and vectorizer and predict on a couple of example texts.
        model = SentimentModel()
        data = [
            "Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.",
            "kkkkkk",
        ]
        result, score = model.predict(data, model_path=model_path, vectorizer_path=vector_path)
        print("\n".join(["{}\t{}\t{}".format(str(label), str(s[label]), d)
                         for label, s, d in zip(result, score, data)]))
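For reference, a minimal standalone sketch of how the pickled artifacts written by train() could be reloaded for inference without going through the SentimentModel class. It assumes the model/model.pkl and model/vector.pkl paths used in __main__ above, that those files were already produced by a training run, and that NLTK's punkt tokenizer data is installed; the input sentence is purely illustrative.

# Standalone inference sketch (assumptions: model/model.pkl and model/vector.pkl
# already exist from SentimentModel.train(), and NLTK punkt data has been
# downloaded, e.g. via nltk.download("punkt")).
import pickle

from nltk.tokenize import word_tokenize

with open("model/model.pkl", "rb") as f:
    clf = pickle.load(f)          # the pickled LogisticRegression classifier
with open("model/vector.pkl", "rb") as f:
    vectorizer = pickle.load(f)   # the fitted TfidfVectorizer

texts = ["The staff was friendly and the food arrived quickly."]  # hypothetical input
tokenized = [" ".join(word_tokenize(t)) for t in texts]  # same preprocessing as predict()
features = vectorizer.transform(tokenized)
print(clf.predict(features), clf.predict_proba(features))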