Replace business.py

cea76185 · 20200203063 · 6251e257 · cea76185
Commit cea76185 authored Sep 01, 2020 by 20200203063
Show whitespace changes
Inline Side-by-side

Showing with 97 additions and 15 deletions

business.py
+97 -15

No files found.
--- a/business.py
+++ b/business.py
+# encoding: utf-8

+from model_training import SentimentModel
+from gen_id2business import id2business
+from tqdm import tqdm
+from sentence import Sentence
+import json
+import pandas as pd

+model_path = "model/model.pkl"  # passed as model_path to SENTIMENT_MODEL.predict
+vector_path = "model/vector.pkl"  # passed as vectorizer_path to SENTIMENT_MODEL.predict
+SENTIMENT_MODEL = SentimentModel() # The already-trained model is stored in a file and loaded from there

 class Business(object):
 	"""
 	用来表示跟business相关的变量和函数
 	"""
-
-	SENTIMENT_MODEL = SentimentModel() # 把已经训练好的模型存放在文件里，并导入进来
-	
-
 	def __init__(self, review_df):
-		# 初始化变量以及函数
+		"""
+		Initialise the per-business state from the reviews of that business.
+
+		The business id is read from the first row of review_df (assumes every row
+		belongs to the same business - TODO confirm against callers), the name is
+		looked up in id2business, and the noun phrases / aspects are extracted once.
+		"""
+		self.review_df = review_df
+		first_review = review_df.iloc[0]
+		self.business_id = first_review.business_id
+		business_record = id2business[self.business_id]
+		self.business_name = business_record["name"]
+		# extract_aspects() reads self.review_df, so it has to run after the assignment above.
+		extracted = self.extract_aspects()
+		self.review_nps,self.aspects = extracted
 		

-	def aspect_based_summary(self):
+	def aspect_based_summary(self,threshold=0.5):
 		"""
-		返回一个business的summary. 针对于每一个aspect计算出它的正面负面情感以及TOP reviews. 
-		具体细节请看给定的文档。 
+		Return the summary of one business: for every aspect, its average star rating
+		and up to five positive and five negative review texts.
+		Details are described in the accompanying document.
+
+		Args:
+			threshold: a review counts as positive when score[0][1] returned by
+				SENTIMENT_MODEL.predict is >= threshold (presumably the
+				positive-class probability - confirm against model_training).
+
+		Returns:
+			dict with 'business_id', 'business_name', 'business_rating' (mean stars
+			over all reviews of the business) and 'aspect_summary', a list holding one
+			{"aspect", "rating", "pos", "neg"} dict per entry of self.aspects.
 		"""
-
-		
-
-		return {'business_id': 
-				'business_name': 
-				'business_rating': 
-				'aspect_summary': 	
+		def pick_texts(scored_texts, best_first):
+			# Rank on the scalar score only; ranking on the whole score[0] row is
+			# ambiguous when that row is array-like.
+			ranked = sorted(scored_texts, key=lambda entry: entry[0], reverse=best_first)
+			# With more than 100 candidates only every (len // 100)-th one is kept, so
+			# the five picks are spread over the ranking instead of being the five extremes.
+			step = len(ranked) // 100 if len(ranked) > 100 else 1
+			return [text for _, text in ranked[::step][:5]]
+
+		aspect_info = [{"aspect": aspect, "stars": [], "pos": [], "neg": []} for aspect in self.aspects]
+		rows = zip(self.review_df["text"], self.review_df["stars"], self.review_nps)
+		for text, stars, current_review_nps in tqdm(rows, total=len(self.review_df)):
+			mentioned = set(current_review_nps)
+			matched = [info for info in aspect_info if info["aspect"] in mentioned]
+			if not matched:
+				continue
+			# One prediction per review: the input is the review text, so the result is
+			# the same for every aspect the review mentions.
+			_, score = SENTIMENT_MODEL.predict([text],model_path=model_path,vectorizer_path=vector_path)
+			positive_score = score[0][1]
+			polarity = "pos" if positive_score >= threshold else "neg"
+			for info in matched:
+				info["stars"].append(stars)
+				info[polarity].append((positive_score, text))
+
+		detail = []
+		for info in aspect_info:
+			aspect_stars = info["stars"]
+			detail.append({"aspect": info["aspect"],
+					"rating": sum(aspect_stars)/len(aspect_stars),
+					# positives: highest score first; negatives: lowest score first
+					"pos": pick_texts(info["pos"], True),
+					"neg": pick_texts(info["neg"], False)})
+
+		# Average over every review exactly once; summing the per-aspect star lists
+		# counts a review once per aspect it mentions and skips reviews with no aspect.
+		business_rating = float(self.review_df["stars"].mean())
+
+		return {'business_id': self.business_id,
+				'business_name': self.business_name,
+				'business_rating': business_rating,
+				'aspect_summary': 	detail
 				}


@@ -32,6 +78,41 @@ class Business(object):
 		"""
-		从一个business的review中抽取aspects
+		Extract the aspects of one business from its reviews.
+
+		Returns:
+			(review_nps, aspects): review_nps[i] lists the noun phrases that
+			Sentence(text).extract_noun_phrase() yields for the i-th review; aspects holds
+			the (at most five) noun phrases that occur most often over all reviews.
 		"""
-
-
-
+		from collections import Counter  # function-scope import keeps this change self-contained
+
+		phrase_counts = Counter()
+		review_nps = []
+		for text in tqdm(self.review_df["text"]):
+			nps = list(Sentence(text).extract_noun_phrase())
+			phrase_counts.update(nps)
+			review_nps.append(nps)
+		# most_common() orders by descending count, so these are the most frequent
+		# phrases; an ascending sort followed by [:5] would pick the rarest ones.
+		aspects = [phrase for phrase, _ in phrase_counts.most_common(5)]
+		return review_nps,aspects
+
+if __name__ == "__main__":
+    # Load the review dump: one JSON object per non-blank line.
+    review_path = "data/review.json"
+    with open(review_path,"r",encoding="utf-8") as f:
+        stripped_lines = [raw.strip() for raw in f]
+    reviews = [json.loads(entry) for entry in stripped_lines if entry]
+    review_df = pd.DataFrame(reviews)
+    print(len(review_df))
+    print(review_df.head())
+
+    # Summarise every selected business from its own slice of the reviews.
+    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
+    for business_id in business_ids:
+        current_review_df = review_df[review_df.business_id==business_id]
+        print(current_review_df.head())
+        print(len(current_review_df))
+        business = Business(current_review_df)
+        print("Aspects",business.aspects)
+        print(business.aspect_based_summary())
\ No newline at end of file