Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
2
20200519040-Project2
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
20200519040
20200519040-Project2
Commits
41de1f9c
Commit
41de1f9c
authored
Sep 09, 2020
by
20200519040
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
20200519040-Project2
parents
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
289 additions
and
0 deletions
+289
-0
submit/business.py
+183
-0
submit/main.py
+32
-0
submit/model.py
+74
-0
No files found.
submit/business.py
0 → 100644
View file @
41de1f9c
import
nltk
,
os
from
collections
import
Counter
,
defaultdict
import
yaml
import
json
# Absolute path to the raw Yelp business JSON dump.
# NOTE(review): hard-coded Windows user path — presumably should come from
# configuration or a CLI argument; confirm before running elsewhere.
businessfilepath = r'C:\Users\goooo\OneDrive\Coding\greed\project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1\data\business.json'
class Review():
    """One Yelp review record plus aspect-extraction helpers.

    Wraps a single decoded JSON review object, exposing its text, star
    rating, author id, and business id as attributes.
    """

    def __init__(self, data):
        """Build a Review from a decoded JSON dict.

        Args:
            data: mapping with at least 'text', 'stars', 'user_id' and
                'business_id' keys (the standard Yelp review schema).
        """
        self.text = data['text']
        self.stars = data['stars']
        # BUG FIX: the original read data['usesr_id'] into self.usesr_id,
        # which raises KeyError on real Yelp data and broke every caller
        # that reads review.user_id.  The misspelled attribute is kept as
        # an alias for backward compatibility.
        self.user_id = data['user_id']
        self.usesr_id = self.user_id
        self.business_id = data['business_id']

    def extract_aspects(self, sent):
        """Extract candidate aspects (singular nouns) from one sentence.

        (Translated from the original Chinese docstring: "extract aspects
        from one sentence of the review".)

        Args:
            sent: a single sentence string.
        Returns:
            set of words POS-tagged 'NN' by NLTK.
        """
        aspects = set()
        for word, tag in nltk.pos_tag(nltk.word_tokenize(sent)):
            if tag == 'NN':
                aspects.add(word)
        return aspects

    def compute_doc_aspects(self, doc, topk=5):
        """Return the top-k most frequent aspects across a document.

        Args:
            doc: iterable of review-text strings.
            topk: number of most common aspects to keep.
        Returns:
            tuple of up to `topk` aspect words (may be empty).
        """
        sents = []
        for line in doc:
            sents.extend(nltk.sent_tokenize(line))
        topic = Counter()
        for sent in sents:
            # BUG FIX: the original called bare extract_aspects(sent),
            # which is a NameError — it is a method on this class.
            topic.update(self.extract_aspects(sent))
        # BUG FIX: zip(*[]) raises ValueError when nothing was extracted;
        # return an empty tuple for an empty/no-noun document instead.
        most_common = topic.most_common(topk)
        if not most_common:
            return ()
        aspects, freq = zip(*most_common)
        return aspects
class BusinessManager(object):
    """Loads Yelp reviews grouped by business and builds aspect-based summaries.

    Attributes:
        data: business_id -> list[Review].
        aspects: business_id -> sequence of aspect words.
        user_stars: user_id -> that user's average star rating.
        sentiment_model: object with .predict(text_segment) -> score,
            injected via set_sentiment_model().
    """

    def __init__(self, init_dir=None):
        self.data = defaultdict(list)
        self.aspects = defaultdict(list)
        self.user_stars = {}
        self.sentiment_model = None
        if init_dir:
            self.load_data(init_dir)

    def load_data(self, review_dir):
        """Read every JSON-lines review file under `review_dir` into self.data.

        Also computes self.user_stars (per-user average rating).
        """
        user_totals = defaultdict(float)
        user_counts = defaultdict(int)
        for review_file in os.listdir(review_dir):
            review_path = os.path.join(review_dir, review_file)
            # BUG FIX: the original leaked the file handle and only kept the
            # FIRST line of each file (review_data_temp[0]); every line of a
            # JSON-lines file is one review.
            with open(review_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    review = Review(json.loads(line))
                    # BUG FIX: self.data.get(business_id) returns None for an
                    # unseen id, so .append crashed; defaultdict indexing is
                    # the intended idiom.
                    self.data[review.business_id].append(review)
                    user_totals[review.user_id] += review.stars
                    user_counts[review.user_id] += 1
        # BUG FIX: the original divided each user's total by the number of
        # DISTINCT USERS rather than by that user's own review count.
        self.user_stars = {
            user_id: total / user_counts[user_id]
            for user_id, total in user_totals.items()
        }

    def get_business_ids(self):
        """Return all known business ids."""
        # BUG FIX: the original signature omitted `self`, so calling it as a
        # method raised TypeError.
        return list(self.data.keys())

    def get_business_reviews(self, business_id):
        """Return the reviews for one business ([] if unknown)."""
        return self.data.get(business_id, [])

    def load_aspects(self, aspect_config):
        """Load a precomputed business_id -> aspects mapping from a YAML file."""
        assert os.path.exists(aspect_config)
        # BUG FIX: yaml.safe_load was handed the path string, which parsed
        # the path itself; parse the file's contents instead.
        with open(aspect_config, 'r', encoding='utf-8') as f:
            self.aspects = yaml.safe_load(f)

    def build_aspects(self, topk=5):
        """Compute and cache the top-k aspects for every business."""
        for business_id, reviews in self.data.items():
            if not reviews:
                continue
            doc = [review.text for review in reviews]
            # BUG FIX: compute_doc_aspects is a Review method, not a global;
            # invoke it through one of the reviews (it uses no per-instance
            # state).
            self.aspects[business_id] = reviews[0].compute_doc_aspects(doc, topk=topk)

    def get_business_aspects(self, business_id):
        """Return the cached aspect list for a business, or [] with a warning."""
        if business_id not in self.aspects:
            print('not find business_id')
            return []
        return self.aspects.get(business_id)

    def get_all_reviews(self):
        """Return every review across all businesses as a flat list."""
        # BUG FIX: the original comprehension had its `for` clauses in the
        # wrong order ([review for review in reviews for reviews in ...]),
        # which is a NameError at runtime.
        return [review for reviews in self.data.values() for review in reviews]

    def get_business_score(self, business_id):
        """Return the mean star rating of one business (0.0 if no reviews)."""
        scores = [review.stars for review in self.data[business_id]]
        if not scores:  # guard: avoid ZeroDivisionError for unseen ids
            return 0.0
        return sum(scores) / len(scores)

    def get_user_score(self, user_id):
        """Return the mean star rating a user has given (0.0 if none)."""
        scores = [
            review.stars
            for review in self.get_all_reviews()
            if review.user_id == user_id
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)

    @staticmethod
    def _get_segment(review, aspect):
        """Join the sentences of `review` that mention `aspect`.

        NOTE(review): the original called an undefined global get_segment();
        this sentence-level extraction is the presumed intent — confirm.
        """
        sents = nltk.sent_tokenize(review.text)
        return ' '.join(s for s in sents if aspect in s)

    def get_aspect_summary(self, business_id, aspect, threshold=0.5):
        """Summarize sentiment toward one aspect of one business.

        Args:
            business_id: the business to analyze.
            aspect: aspect word to look for in review text.
            threshold: sentiment score above which a segment counts as
                positive (the original referenced an undefined global
                `threshold`; it is now an explicit parameter).
        Returns:
            dict(rating=mean stars of matching reviews, pos=[segments],
                 neg=[segments]).
        """
        pos_sents, neg_sents = [], []
        stars = 0.0
        for review in self.data[business_id]:
            # BUG FIX: str has no .contains(); use the `in` operator.
            if aspect not in review.text:
                continue
            review_segment = self._get_segment(review, aspect)
            # BUG FIX: referenced bare `sentiment_model`; it lives on self.
            score = self.sentiment_model.predict(review_segment)
            stars += review.stars
            if score > threshold:
                pos_sents.append(review_segment)
            else:
                neg_sents.append(review_segment)
        matched = len(pos_sents) + len(neg_sents)
        # guard: no matching review would previously ZeroDivisionError
        stars = stars / matched if matched else 0.0
        return dict(rating=stars, pos=pos_sents, neg=neg_sents)

    def aspect_based_summary(self, business_id):
        """Return a full summary for one business.

        For each cached aspect, computes its positive/negative sentiment and
        top review segments.  (Translated from the original Chinese
        docstring.)
        """
        business_rating = self.get_business_score(business_id)
        aspect_summary = defaultdict(dict)
        for aspect in self.get_business_aspects(business_id):
            aspect_summary[aspect] = self.get_aspect_summary(business_id, aspect)
        return dict(
            business_id=business_id,
            # NOTE(review): the business name is never loaded anywhere in
            # this file — TODO populate from the business JSON.
            business_name='',
            business_rating=business_rating,
            aspect_summary=aspect_summary,
        )

    def generate_model_data(self):
        """Build (text, label) pairs for sentiment training.

        A review is positive (1) if its stars exceed the author's average by
        >= 0.5, negative (0) if below by >= 0.5; in-between reviews are
        dropped.  Returns a shuffled 90/10 (train, test) split.
        """
        assert self.user_stars, "please load review data at first"
        import random  # local import: `random` is not imported at module top
        data = []
        for review in self.get_all_reviews():
            ave_star = self.user_stars.get(review.user_id)
            if ave_star is None:  # guard: unknown author — skip
                continue
            if review.stars - ave_star >= 0.5:
                data.append((review.text, 1))
            elif review.stars - ave_star <= -0.5:
                data.append((review.text, 0))
            # reviews within +/-0.5 of the author's mean are dropped
        random.shuffle(data)
        # BUG FIX: the original sliced with float indices
        # (len(data) * 0.9), which is a TypeError.
        split = int(len(data) * 0.9)
        return data[:split], data[split:]

    def set_sentiment_model(self, sentiment_model):
        """Inject the trained sentiment model used by get_aspect_summary()."""
        self.sentiment_model = sentiment_model
submit/main.py
0 → 100644
View file @
41de1f9c
import
os
,
sys
import
json
from
collections
import
defaultdict
from
business
import
BusinessManager
import
model
def main():
    """End-to-end pipeline: load reviews, train a sentiment model, and
    print an aspect-based summary for every business."""
    mgr = BusinessManager('data/')
    train_data, test_data = mgr.generate_model_data()
    feature_builder = model.FeatureBuilder('tfidf')
    X_train, y_train, X_test, y_test = feature_builder.get_feature(train_data, test_data)
    lrmodel = model.LinearModel()
    lrmodel.train(X_train, y_train)
    # BUG FIX: `model_path` was referenced but never defined.
    model_path = 'sentiment_model.pkl'
    lrmodel.save(model_path)
    mgr.set_sentiment_model(lrmodel)
    # BUG FIX: the original called bare get_business_ids() (NameError — it
    # is a BusinessManager method) and then passed the whole id LIST to
    # aspect_based_summary instead of the current id.
    for bid in mgr.get_business_ids():
        summary = mgr.aspect_based_summary(bid)
        print(summary)
# Script entry point: run the full pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
submit/model.py
0 → 100644
View file @
41de1f9c
import
os
,
sys
import
sklearn
from
sklearn.linear_model
import
LogisticRegression
import
tensorflow
as
tf
import
tensorflow_datasets
from
transformers
import
*
import
torch
class FeatureBuilder():
    """Turns (text, label) pairs into model features via a selectable method.

    Supported methods: 'tfidf' (sparse TF-IDF matrices) and
    'sentence piece' (tokenizer-encoded sequences for BERT).
    """

    def __init__(self, method='tfidf'):
        # BUG FIX: the constructor was misspelled `__init_`, so it was never
        # invoked and `self.method` was never set.
        self.method = method

    def get_feature(self, train_data, test_data, tokenizer=None):
        """Dispatch to the extractor selected at construction time.

        Args:
            train_data, test_data: iterables of (text, label) pairs.
            tokenizer: required only for the 'sentence piece' method.
        Returns:
            (X_train, y_train, X_test, y_test) for 'tfidf';
            (encoded_train, encoded_test) for 'sentence piece'.
        Raises:
            ValueError: for an unrecognized method.
        """
        # BUG FIX: the original signature omitted `self`, so the method
        # could not be called on an instance.
        if self.method == 'tfidf':
            return get_tfidf_feature(train_data, test_data)
        elif self.method == 'sentence piece':
            # BUG FIX: the tokenizer was not forwarded, although
            # get_bert_feature requires it.
            return get_bert_feature(train_data, test_data, tokenizer)
        raise ValueError('unknown feature method: %r' % (self.method,))
def get_tfidf_feature(train_data, test_data):
    """Vectorize (text, label) pairs with TF-IDF fitted on the training texts.

    Args:
        train_data, test_data: iterables of (text, label) pairs.
    Returns:
        (X_train, y_train, X_test, y_test) — sparse matrices and label tuples.
    """
    X_train_data, y_train = zip(*train_data)
    # BUG FIX: the test split was unpacked from train_data, so the model was
    # "evaluated" on its own training texts.
    X_test_data, y_test = zip(*test_data)
    # local import: `import sklearn` alone does not expose the
    # feature_extraction submodule, so the original attribute chain raised
    # AttributeError.
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()  # the TF-IDF vectorizer (translated comment)
    X_train = vectorizer.fit_transform(X_train_data)  # fit vocabulary on train only
    X_test = vectorizer.transform(X_test_data)        # reuse the fitted vocabulary
    return X_train, y_train, X_test, y_test
def get_bert_feature(train_data, test_data, tokenizer):
    """Encode both data splits with the supplied tokenizer.

    Args:
        train_data, test_data: the raw splits to encode.
        tokenizer: object exposing .encode(data).
    Returns:
        (encoded_train, encoded_test) tuple.
    """
    encoded_train = tokenizer.encode(train_data)
    encoded_test = tokenizer.encode(test_data)
    return encoded_train, encoded_test
class LinearModel():
    """Logistic-regression sentiment classifier with grid-searched C."""

    def __init__(self):
        # local import: `import sklearn` alone does not expose the
        # model_selection submodule the original attribute chain relied on.
        from sklearn.model_selection import GridSearchCV
        self.algorithm = 'LR'
        # BUG FIX: the original used numpy.logspace(-3, 3, 7) but `numpy`
        # was never imported; the explicit value list is identical.
        grid = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}
        self.logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
        # BUG FIX: GridSearchCV was given the bare name `logreg`, which is
        # undefined here — the estimator lives on self.
        self.logreg_cv = GridSearchCV(self.logreg, grid, cv=10, scoring='f1')

    def train(self, X_train, y_train):
        """Fit the grid search on the training set and report the CV score."""
        self.logreg_cv.fit(X_train, y_train)
        # BUG FIX: the original printed a classification report on undefined
        # names y_test / y_pred; report the cross-validated score instead.
        print('best CV f1: %s (C=%s)'
              % (self.logreg_cv.best_score_, self.logreg_cv.best_params_))

    def predict(self, X_test):
        """Return class predictions for X_test."""
        # BUG FIX: referenced bare `logreg_cv` (NameError) and discarded the
        # result instead of returning it.
        return self.logreg_cv.predict(X_test)

    def save(self, model_path):
        """Persist the fitted estimator to `model_path`.

        main.py calls lrmodel.save(...) but this method did not exist.
        NOTE(review): pickle is fine for a locally-produced model but must
        never be loaded from untrusted sources.
        """
        import pickle
        with open(model_path, 'wb') as f:
            pickle.dump(self.logreg_cv, f)
class NNModel():
    """BERT-based sequence classifier using the TensorFlow backend."""

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
        self.init_model()

    def init_model(self):
        """Compile the Keras model (Adam, sparse CE from logits, accuracy)."""
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        # BUG FIX: the original compiled the bare name `model` (NameError);
        # the model lives on self.
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    def get_tokenizer(self):
        """Return the tokenizer callers should use to encode inputs."""
        return self.tokenizer

    def train(self, X_train, y_train, valid_dataset=None,
              epochs=2, steps_per_epoch=115, validation_steps=7):
        """Fine-tune on encoded inputs and save the weights to ./save/.

        Args:
            X_train: encoded input ids.
            y_train: integer class labels (the original never passed these
                to fit() — NOTE(review): confirm the intended input pipeline).
            valid_dataset: optional validation data; previously this was an
                undefined global.
        Returns:
            the Keras History object.
        """
        # BUG FIX: self.model is a TF/Keras model, so inputs are converted
        # with tf.constant — the original built torch tensors.
        input_ids = tf.constant(X_train)
        labels = tf.constant(y_train)
        history = self.model.fit(
            input_ids,
            labels,
            epochs=epochs,
            steps_per_epoch=steps_per_epoch,
            validation_data=valid_dataset,
            validation_steps=validation_steps if valid_dataset is not None else None,
        )
        self.model.save_pretrained('./save/')
        return history

    def predict(self, X_test):
        """Return the argmax class index for one encoded example."""
        # BUG FIX: torch-style .argmax().item() does not apply to the TF
        # output object; read the logits and argmax with TensorFlow.
        logits = self.model(tf.constant(X_test)).logits
        return int(tf.argmax(logits, axis=-1)[0])
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment