# encoding: utf-8
"""Aspect-based summaries of the reviews written about a business."""
import json
from collections import Counter

import pandas as pd
from tqdm import tqdm

from model_training import SentimentModel
from gen_id2business import id2business
from sentence import Sentence

model_path = "model/model.pkl"
vector_path = "model/vector.pkl"

# The trained model lives in the files above; their paths are handed to
# predict() on every call.
SENTIMENT_MODEL = SentimentModel()

# Number of most frequent noun phrases kept as aspects.
_MAX_ASPECTS = 5
# Number of example reviews returned per aspect and polarity.
_MAX_EXAMPLES = 5
# Ranked example lists longer than this are thinned to roughly this many
# entries before the examples are taken.
_SAMPLE_BUCKETS = 100


class Business(object):
    """The reviews of one business plus the aspects extracted from them.

    Attributes:
        review_df: the review DataFrame passed to the constructor; the code
            reads its ``business_id``, ``text`` and ``stars`` columns.
        business_id: ``business_id`` of the first review row.
        business_name: ``id2business[business_id]["name"]``.
        review_nps: one list of noun phrases per review, in row order.
        aspects: the most frequent noun phrases over all reviews.
    """

    def __init__(self, review_df):
        """Store the reviews and extract the aspects right away.

        Args:
            review_df: reviews of a single business. Must contain at least
                one row, because the business id is read from row 0
                (``IndexError`` otherwise).
        """
        self.review_df = review_df
        self.business_id = self.review_df.iloc[0].business_id
        self.business_name = id2business[self.business_id]["name"]
        self.review_nps, self.aspects = self.extract_aspects()

    def aspect_based_summary(self, threshold=0.5):
        """Return a summary of the business, broken down by aspect.

        Every review that mentions at least one aspect is scored once with
        ``SENTIMENT_MODEL``. ``score[0][1]`` is compared against
        ``threshold``: at or above it the review counts as positive for each
        aspect it mentions, below it as negative.

        Args:
            threshold: minimum ``score[0][1]`` for a review to be treated as
                positive.

        Returns:
            A dict with ``business_id``, ``business_name``,
            ``business_rating`` (mean stars over all reviews) and
            ``aspect_summary``: one dict per aspect holding ``aspect``,
            ``rating`` (mean stars of the reviews mentioning it), ``pos`` and
            ``neg`` (up to five review texts each, strongest first).
        """
        texts = self.review_df["text"].tolist()
        stars = self.review_df["stars"].tolist()

        aspect_info = {
            aspect: {"stars": [], "pos": [], "neg": []}
            for aspect in self.aspects
        }

        rows = zip(texts, stars, self.review_nps)
        for idx, (text, star, phrases) in enumerate(tqdm(rows, total=len(texts))):
            phrases = set(phrases)
            mentioned = [aspect for aspect in aspect_info if aspect in phrases]
            if not mentioned:
                continue

            # The prediction depends only on the review text, so it is made
            # once per review rather than once per matching aspect.
            _, score = SENTIMENT_MODEL.predict(
                [text], model_path=model_path, vectorizer_path=vector_path
            )
            # Keep a scalar: ordering whole score rows is ambiguous if the
            # model returns arrays (NOTE(review): return type of predict()
            # is not visible here).
            positive_score = float(score[0][1])
            polarity = "pos" if positive_score >= threshold else "neg"

            for aspect in mentioned:
                aspect_info[aspect]["stars"].append(star)
                aspect_info[aspect][polarity].append((idx, positive_score))

        detail = []
        for aspect in self.aspects:
            info = aspect_info[aspect]
            detail.append({
                "aspect": aspect,
                # Aspects produced by extract_aspects() occur in at least one
                # review, so the star list is not empty.
                "rating": sum(info["stars"]) / len(info["stars"]),
                "pos": self._example_texts(texts, info["pos"], highest_first=True),
                "neg": self._example_texts(texts, info["neg"], highest_first=False),
            })

        # Mean over all reviews. Summing the per-aspect star lists instead
        # would count a review once per aspect it mentions and could push
        # the rating above the star scale.
        business_rating = sum(stars) / len(stars)

        return {
            'business_id': self.business_id,
            'business_name': self.business_name,
            'business_rating': business_rating,
            'aspect_summary': detail,
        }

    @staticmethod
    def _example_texts(texts, scored, highest_first):
        """Pick up to ``_MAX_EXAMPLES`` review texts from ``(idx, score)`` pairs.

        The pairs are ranked by score (descending when ``highest_first``),
        thinned with a stride when there are more than ``_SAMPLE_BUCKETS`` of
        them, and the first entries of the result are returned as texts.
        """
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=highest_first)
        if len(ranked) > _SAMPLE_BUCKETS:
            step = len(ranked) // _SAMPLE_BUCKETS
        else:
            step = 1
        return [texts[idx] for idx, _ in ranked[::step][:_MAX_EXAMPLES]]

    def extract_aspects(self):
        """Extract aspects from the reviews of the business.

        Each review text is wrapped in ``Sentence`` and its noun phrases are
        collected; every occurrence is counted.

        Returns:
            A tuple ``(review_nps, aspects)``: the noun phrases of each
            review in row order, and the ``_MAX_ASPECTS`` most frequent noun
            phrases, most frequent first.
        """
        phrase_counts = Counter()
        review_nps = []
        for text in tqdm(self.review_df["text"]):
            phrases = list(Sentence(text).extract_noun_phrase())
            phrase_counts.update(phrases)
            review_nps.append(phrases)

        # most_common() yields the highest counts first; an ascending sort
        # followed by [:5] would select the rarest phrases instead.
        aspects = [phrase for phrase, _ in phrase_counts.most_common(_MAX_ASPECTS)]
        return review_nps, aspects


def main():
    """Load the reviews (one JSON object per line) and print the summaries."""
    review_path = "data/review.json"
    reviews = []
    with open(review_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                reviews.append(json.loads(line))

    review_df = pd.DataFrame(reviews)
    print(len(review_df))
    print(review_df.head())

    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
    for business_id in business_ids:
        current_review_df = review_df[review_df.business_id == business_id]
        print(current_review_df.head())
        print(len(current_review_df))
        business = Business(current_review_df)
        print("Aspects", business.aspects)
        print(business.aspect_based_summary())


if __name__ == "__main__":
    main()