# encoding: utf-8

from model_training import SentimentModel
from gen_id2business import id2business
from tqdm import tqdm
from sentence import Sentence
import json
import pandas as pd

# File paths handed to SENTIMENT_MODEL.predict as its model_path /
# vectorizer_path keyword arguments (presumably pickled artifacts, judging by
# the .pkl suffix -- confirm against model_training).
model_path = "model/model.pkl"
vector_path = "model/vector.pkl"
SENTIMENT_MODEL = SentimentModel() # The already-trained model is stored in files and loaded from them

class Business(object):
    """Variables and functions describing one business and its reviews.

    Attributes:
        review_df: DataFrame of reviews; the ``business_id``, ``text`` and
            ``stars`` columns are read.
        business_id: ``business_id`` of the first row of ``review_df`` (the
            caller is expected to pass reviews of a single business).
        business_name: ``name`` entry looked up in ``id2business``.
        review_nps: one list of extracted noun phrases per review, in the
            same order as the rows of ``review_df``.
        aspects: the most frequently occurring noun phrases (at most five).
    """

    def __init__(self, review_df):
        """Store the reviews and immediately extract their aspects."""
        self.review_df = review_df
        self.business_id = self.review_df.iloc[0].business_id
        self.business_name = id2business[self.business_id]["name"]
        self.review_nps, self.aspects = self.extract_aspects()

    def aspect_based_summary(self, threshold=0.5):
        """Return a summary of the business, broken down by aspect.

        For every aspect the reviews that mention it are collected, their
        star ratings are averaged, and each review is classified as positive
        or negative with ``SENTIMENT_MODEL``.

        Args:
            threshold: a review counts as positive when ``score[0][1]``
                returned by ``SENTIMENT_MODEL.predict`` is >= this value.

        Returns:
            A dict with the keys ``business_id``, ``business_name``,
            ``business_rating`` (mean star rating over all reviews) and
            ``aspect_summary``: a list with one dict per aspect holding
            ``aspect``, ``rating`` (mean stars of the reviews mentioning the
            aspect, ``None`` if there are none), ``pos`` (up to five review
            texts, most positive first) and ``neg`` (up to five review texts,
            most negative first).
        """
        aspect_info = [{"stars": [], "pos": [], "neg": []} for _ in self.aspects]

        for idx in tqdm(range(len(self.review_df))):
            review = self.review_df.iloc[idx]
            # Noun phrases are used as dict keys in extract_aspects, so they
            # are hashable; a set makes each membership test O(1).
            mentioned = set(self.review_nps[idx])
            positive_score = None
            for info, aspect in zip(aspect_info, self.aspects):
                if aspect not in mentioned:
                    continue
                if positive_score is None:
                    # The sentiment depends only on the review text, so it is
                    # predicted once per review rather than once per aspect.
                    _, score = SENTIMENT_MODEL.predict(
                        [review.text],
                        model_path=model_path,
                        vectorizer_path=vector_path,
                    )
                    positive_score = float(score[0][1])
                info["stars"].append(review.stars)
                polarity = "pos" if positive_score >= threshold else "neg"
                # Keep only the scalar score: sorting on the whole score row
                # is ambiguous for arrays and ranks negatives the wrong way.
                info[polarity].append((idx, positive_score))

        detail = []
        for info, aspect in zip(aspect_info, self.aspects):
            stars = info["stars"]
            detail.append({
                "aspect": aspect,
                "rating": sum(stars) / len(stars) if stars else None,
                "pos": self._review_texts(info["pos"], reverse=True),
                "neg": self._review_texts(info["neg"], reverse=False),
            })

        # Average over every review exactly once. Summing per aspect would
        # count a review once for each aspect it mentions and ignore reviews
        # that mention none.
        business_rating = float(self.review_df["stars"].mean())

        return {
            'business_id': self.business_id,
            'business_name': self.business_name,
            'business_rating': business_rating,
            'aspect_summary': detail,
        }

    def extract_aspects(self):
        """Extract aspects from the reviews of the business.

        Returns:
            A tuple ``(review_nps, aspects)``: ``review_nps`` holds the noun
            phrases of each review (one list per row of ``review_df``) and
            ``aspects`` the five noun phrases with the highest total number
            of occurrences, most frequent first.
        """
        phrase_counts = {}
        review_nps = []
        for idx in tqdm(range(len(self.review_df))):
            sentence = Sentence(self.review_df.iloc[idx].text)
            phrases = list(sentence.extract_noun_phrase())
            for phrase in phrases:
                phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
            review_nps.append(phrases)
        return review_nps, self._most_frequent(phrase_counts, 5)

    def _review_texts(self, scored, reverse):
        """Return the texts of the example reviews picked from ``scored``."""
        return [
            self.review_df.iloc[idx].text
            for idx in self._sample_ranked(scored, reverse=reverse)
        ]

    @staticmethod
    def _most_frequent(counts, limit):
        """Return up to ``limit`` keys of ``counts``, highest count first.

        Ties keep their insertion order because ``sorted`` is stable.
        """
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return [phrase for phrase, _ in ranked[:limit]]

    @staticmethod
    def _sample_ranked(scored, reverse, limit=5, base=100):
        """Rank ``(index, score)`` pairs by score and return chosen indices.

        With more than ``base`` candidates only every ``len // base``-th
        ranked entry is considered, which spreads the examples out instead of
        returning near-identical top hits; at most ``limit`` are returned.
        """
        ranked = sorted(scored, key=lambda item: item[1], reverse=reverse)
        stride = len(ranked) // base if len(ranked) > base else 1
        return [idx for idx, _ in ranked[::stride][:limit]]

if __name__ == "__main__":
    # Demo run: load every review (one JSON object per line), then print the
    # aspect-based summary for each selected business.
    review_path = "data/review.json"
    reviews = []
    with open(review_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily; readlines() would hold the whole file in
        # memory on top of the parsed records.
        for line in f:
            line = line.strip()
            if not line:
                continue
            reviews.append(json.loads(line))
    review_df = pd.DataFrame(reviews)
    print(len(review_df))
    print(review_df.head())

    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
    for business_id in business_ids:
        current_review_df = review_df[review_df.business_id == business_id]
        if current_review_df.empty:
            # Business() reads the first row, so it cannot be built without
            # at least one review.
            print("No reviews found for business", business_id)
            continue
        print(current_review_df.head())
        print(len(current_review_df))
        business = Business(current_review_df)
        print("Aspects", business.aspects)
        print(business.aspect_based_summary())