Commit cea76185 by 20200203063

Replace business.py

parent 6251e257
# encoding: utf-8
from model_training import SentimentModel
from gen_id2business import id2business
from tqdm import tqdm
from sentence import Sentence
import json
import pandas as pd
# Paths passed to SENTIMENT_MODEL.predict() as `model_path` / `vectorizer_path`.
# NOTE(review): presumably pickled artifacts of the trained classifier and its
# text vectorizer (going by the .pkl extension) — confirm against model_training.
model_path = "model/model.pkl"
vector_path = "model/vector.pkl"
# Module-level sentiment model used by Business.aspect_based_summary().
SENTIMENT_MODEL = SentimentModel() # the trained model is stored in a file and loaded from there
class Business(object):
    """
    Variables and functions related to a single business.

    Attributes:
        review_df: DataFrame holding the reviews of one business. The columns
            read by this class are ``business_id``, ``text`` and ``stars``.
        business_id: ``business_id`` of the first row of ``review_df``.
        business_name: ``id2business[business_id]["name"]``.
        review_nps: One list of noun phrases per review, in row order.
        aspects: The most frequently mentioned noun phrases (at most 5).
    """

    # Trained sentiment model stored on the class.
    # NOTE(review): the methods below call the module-level SENTIMENT_MODEL,
    # not this attribute — confirm whether both instances are needed.
    SENTIMENT_MODEL = SentimentModel()

    def __init__(self, review_df):
        """
        Initialise the business from its reviews and extract its aspects.

        Args:
            review_df: DataFrame with the reviews of a single business.

        Raises:
            IndexError: If ``review_df`` has no rows (row 0 is read to obtain
                the business id).
        """
        self.review_df = review_df
        self.business_id = self.review_df.iloc[0].business_id
        self.business_name = id2business[self.business_id]["name"]
        self.review_nps, self.aspects = self.extract_aspects()

    def aspect_based_summary(self, threshold=0.5):
        """
        Return a summary of the business, broken down by aspect.

        Every review that mentions an aspect contributes its star rating to
        that aspect and is filed as positive or negative depending on the
        sentiment model's score for the review text.

        Args:
            threshold: A review counts as positive when ``score[0][1]``
                returned by ``SENTIMENT_MODEL.predict`` is >= this value,
                otherwise as negative.

        Returns:
            dict with the keys ``business_id``, ``business_name``,
            ``business_rating`` (mean star rating over all reviews) and
            ``aspect_summary``: a list with one dict per aspect holding
            ``aspect``, ``rating`` (mean stars of the reviews mentioning the
            aspect, ``None`` if there are none), ``pos`` and ``neg`` (up to
            five review texts each).
        """
        aspect_info = [
            {"aspect": aspect, "stars": [], "pos": [], "neg": []}
            for aspect in self.aspects
        ]

        for idx in tqdm(range(len(self.review_df))):
            review = self.review_df.iloc[idx]
            mentioned = set(self.review_nps[idx])
            # The sentiment score depends only on the review text, so it is
            # computed at most once per review, and only when at least one
            # aspect is actually mentioned.
            positive_score = None
            for info in aspect_info:
                if info["aspect"] not in mentioned:
                    continue
                info["stars"].append(review.stars)
                if positive_score is None:
                    _, score = SENTIMENT_MODEL.predict(
                        [review.text],
                        model_path=model_path,
                        vectorizer_path=vector_path,
                    )
                    positive_score = score[0][1]
                bucket = "pos" if positive_score >= threshold else "neg"
                # Keep only the scalar score so the entries can be ranked.
                info[bucket].append((idx, positive_score))

        detail = []
        for info in aspect_info:
            stars = info["stars"]
            detail.append({
                "aspect": info["aspect"],
                "rating": sum(stars) / len(stars) if stars else None,
                "pos": self._top_review_texts(info["pos"], True),
                "neg": self._top_review_texts(info["neg"], False),
            })

        # Each review counts exactly once, however many aspects it mentions,
        # so the result stays on the same scale as the star ratings.
        business_rating = float(self.review_df["stars"].mean())

        return {
            'business_id': self.business_id,
            'business_name': self.business_name,
            'business_rating': business_rating,
            'aspect_summary': detail,
        }

    def _top_review_texts(self, scored_reviews, most_positive_first):
        """Return up to five review texts picked from ``(row, score)`` pairs.

        The pairs are ranked by score (descending when ``most_positive_first``
        is true, ascending otherwise). With more than 100 pairs only every
        ``len // 100``-th entry of the ranking is considered, which spreads
        the picks over the ranking instead of taking five neighbours.
        """
        count = len(scored_reviews)
        step = count // 100 if count > 100 else 1
        ranked = sorted(
            scored_reviews,
            key=lambda item: item[1],
            reverse=most_positive_first,
        )
        return [self.review_df.iloc[row].text for row, _ in ranked[::step][:5]]

    def extract_aspects(self):
        """
        Extract the aspects of a business from its reviews.

        Noun phrases are extracted from every review text with
        ``Sentence.extract_noun_phrase`` and counted over all reviews (every
        occurrence counts). The five most frequent phrases become the
        aspects.

        Returns:
            tuple ``(review_nps, aspects)``: ``review_nps`` is a list with one
            list of noun phrases per review, in row order; ``aspects`` is the
            list of the (at most five) most frequent noun phrases, most
            frequent first.
        """
        phrase_counts = dict()
        review_nps = []
        for idx in tqdm(range(len(self.review_df))):
            review = self.review_df.iloc[idx]
            phrases = list(Sentence(review.text).extract_noun_phrase())
            for phrase in phrases:
                phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
            review_nps.append(phrases)

        # Highest counts first; an ascending sort would select the rarest
        # phrases instead of the most discussed ones.
        most_frequent = sorted(
            phrase_counts.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:5]
        aspects = [phrase for phrase, _ in most_frequent]
        return review_nps, aspects
if __name__ == "__main__":
    # Demo: load the reviews, pick the reviews of selected businesses and
    # print the extracted aspects and the aspect-based summary for each.
    review_path = "data/review.json"
    reviews = []
    # The file holds one JSON object per line; blank lines are skipped.
    # Lines are streamed so the raw file is never held in memory as a whole.
    with open(review_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            reviews.append(json.loads(line))
    review_df = pd.DataFrame(reviews)
    print(len(review_df))
    print(review_df.head())
    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
    for business_id in business_ids:
        current_review_df = review_df[review_df.business_id == business_id]
        print(current_review_df.head())
        print(len(current_review_df))
        if current_review_df.empty:
            # Business() reads the first review row, so a business without
            # any review cannot be summarised.
            print("No reviews found for business", business_id)
            continue
        business = Business(current_review_df)
        print("Aspects", business.aspects)
        print(business.aspect_based_summary())
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment