business.py 5.7 KB
Newer Older
20200519016 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
#!/usr/bin/env python
#-*- coding=utf8 -*-
import json
from collections import defaultdict
import os
import io
import nltk
import random


class Business(object):
	"""
	Variables and helpers for working with businesses and their reviews.

	The loaders read files that hold one JSON object per line. The other
	methods build sentiment training data, extract frequent nouns
	("aspects") from review text and assemble per-aspect summaries.
	"""

	# TODO: store the already-trained sentiment model in a file and load it here.
	# SENTIMENT_MODEL = SentimentModel()

	def __init__(self):
		"""Set default values for the instance attributes."""
		self.business_name = ""
		# NOTE: the attribute name is misspelled; it is kept unchanged because
		# code outside this class may already read it.
		self.overall_rateing = 0
		self.business_file = './data/business.json'
		self.business_id = dict()
		# Read by get_aspect_summary as business_id -> iterable of reviews.
		# Nothing in this class fills it yet.
		self.data = dict()

	@staticmethod
	def _as_list(value):
		"""Return value unchanged if it is a list/tuple, else wrap it in a list."""
		if isinstance(value, (list, tuple)):
			return value
		return [value]

	def filter_business_file(self):
		"""
		Open ``self.business_file`` and close it again.

		Filtering itself is not implemented yet; the method only verifies
		that the file can be opened and always returns None.

		Raises:
			IOError/OSError: if the file cannot be opened.
		"""
		with io.open(self.business_file, 'r', encoding='utf-8'):
			# Placeholder: the handle is closed on exit instead of leaking.
			pass

	def load_business_data(self, business_dir, min_review_count=100):
		"""
		Load the names of sufficiently reviewed businesses.

		Args:
			business_dir: path of the business file (despite the name it is
				opened as a file), one JSON object per line with the keys
				'business_id', 'name' and 'review_count'.
			min_review_count: businesses whose 'review_count' is below this
				value are skipped.

		Returns:
			dict mapping business_id to name. When a business_id occurs more
			than once the first name is kept and a message is printed.
		"""
		business_name = dict()
		with io.open(business_dir, 'r', encoding='utf-8') as f:
			print(business_dir)
			# Iterate lazily so the whole file never has to sit in memory.
			for line in f:
				line = line.strip()
				if not line:
					# A blank line would make json.loads fail.
					continue
				dic = json.loads(line)
				if dic['review_count'] < min_review_count:
					continue
				if dic['business_id'] not in business_name:
					business_name[dic['business_id']] = dic['name']
				else:
					print("business_id %s has different names %s" %(dic['business_id'], dic['name']))

		return business_name

	def load_review_data(self, review_dir, business_name, max_reviews=999):
		"""
		Load reviews and index them by business, review and user.

		Args:
			review_dir: path of the review file (opened as a file), one JSON
				object per line with the keys 'review_id', 'user_id',
				'business_id', 'stars' and 'text'.
			business_name: container of the business ids to keep (for
				example the dict returned by load_business_data).
			max_reviews: number of reviews read from the start of the file;
				None reads everything. The default keeps the 999-review cap
				the original code used for testing.

		Returns:
			A 6-tuple of dicts:
			business_review (business_id -> set of review_ids),
			business_stars (business_id -> list of stars),
			review_text (review_id -> list of texts),
			user_star (user_id -> list of stars),
			review_user (review_id -> list of user_ids),
			review_star (review_id -> list of stars).
			The first three only cover businesses found in business_name;
			the last three cover every review that was read.
		"""
		business_review = dict()
		business_stars = dict()
		review_text = dict()
		user_star = dict()
		review_user = dict()
		review_star = dict()
		loaded = 0
		with io.open(review_dir, 'r', encoding='utf-8') as f:
			print(review_dir)
			# Stream the file: stop reading as soon as the cap is reached
			# instead of loading every line first.
			for line in f:
				if max_reviews is not None and loaded >= max_reviews:
					break
				line = line.strip()
				if not line:
					continue
				loaded += 1
				dic = json.loads(line)

				review_id = dic['review_id']
				user_id = dic['user_id']
				business_id = dic['business_id']
				stars = dic['stars']

				# User and review statistics are collected for every review,
				# including those of businesses that are filtered out below.
				user_star.setdefault(user_id, []).append(stars)
				review_user.setdefault(review_id, []).append(user_id)
				review_star.setdefault(review_id, []).append(stars)

				if business_id not in business_name:
					continue

				business_review.setdefault(business_id, set()).add(review_id)
				business_stars.setdefault(business_id, []).append(stars)
				# Keyed by review_id: the original initialised this entry by
				# review_id but appended by business_id, raising KeyError.
				review_text.setdefault(review_id, []).append(dic['text'])

		return business_review, business_stars, review_text, user_star, review_user, review_star

	def generate_model_data(self, review_text, review_user, user_star, review_star, train_ratio=0.9):
		"""
		Build shuffled, labelled train/test samples for a sentiment model.

		A review is labelled 1 when its stars are at least 0.5 above the
		average stars of its author, 0 when they are at least 0.5 below,
		and is dropped otherwise.

		Args:
			review_text: review_id -> text or list of texts.
			review_user: review_id -> user_id or list of user_ids.
			user_star: user_id -> list of stars given by that user.
			review_star: review_id -> stars or list of stars.
			train_ratio: fraction of the samples put in the training set.

		Returns:
			(train_data, test_data), two lists of (text, label) tuples.
		"""
		data = []
		for review_id, texts in review_text.items():
			# The loaders store lists per review_id; scalars are accepted too.
			texts = self._as_list(texts)
			users = self._as_list(review_user[review_id])
			stars = self._as_list(review_star[review_id])
			for text, user, star in zip(texts, users, stars):
				ratings = user_star[user]
				if not ratings:
					# No rating history, so no average to compare against.
					continue
				# float() keeps true division on Python 2 as well.
				difference = star - float(sum(ratings)) / len(ratings)
				if difference >= 0.5:
					data.append((text, 1))
				elif difference <= -0.5:
					data.append((text, 0))
				# Otherwise the review is too close to the average: drop it.

		random.shuffle(data)
		# Slice bounds must be integers.
		split = int(len(data) * train_ratio)
		return data[:split], data[split:]

	def get_aspect_summary(self, business_id, aspect):
		"""
		Summarise the reviews of a business that mention an aspect.

		Args:
			business_id: key into ``self.data``.
			aspect: aspect to look for in each review's text.

		Returns:
			dict with 'rating' (mean stars of the matching reviews, 0.0 when
			none match), 'pos' and 'neg' (review segments scored above /
			not above the threshold).

		Raises:
			KeyError: if business_id is not in ``self.data``.
		"""
		# NOTE(review): get_segment, sentiment_model and threshold are not
		# defined in the visible part of this file - confirm where they come
		# from. If review.text is a plain str it has no .contains method and
		# the check should be `aspect in review.text` - TODO confirm.
		pos_sents, neg_sents = [], []
		stars = 0.0
		reviews = self.data[business_id]
		for review in reviews:
			if not review.text.contains(aspect):
				continue

			review_segment = get_segment(review, aspect)
			score = sentiment_model.predict(review_segment)
			stars += review.stars
			if score > threshold:
				pos_sents.append(review_segment)
			else:
				neg_sents.append(review_segment)

		matched = len(pos_sents) + len(neg_sents)
		# Avoid dividing by zero when no review mentions the aspect.
		stars = stars / matched if matched else 0.0

		return dict(rating=stars, pos=pos_sents, neg=neg_sents)

	def aspect_based_summary(self, business_id):
		"""
		Return the summary of one business.

		For every aspect of the business the positive and negative
		sentiment and the top reviews are computed. See the provided
		document for the details.

		Returns:
			dict with 'business_id', 'business_name' (always ''),
			'business_rating' and 'aspect_summary' (aspect -> result of
			get_aspect_summary).
		"""
		# NOTE(review): get_business_score and get_business_aspects are not
		# defined in the visible part of this class - confirm they exist.
		business_rating = self.get_business_score(business_id)

		aspect_summary = defaultdict(dict)
		aspects = self.get_business_aspects(business_id)
		for aspect in aspects:
			aspect_summary[aspect] = self.get_aspect_summary(business_id, aspect)

		return dict(business_id=business_id,
					business_name='',
					business_rating=business_rating,
					aspect_summary=aspect_summary)

	def extract_aspects(self, business_text, top_n=5):
		"""
		Extract the most frequent nouns from the reviews of each business.

		Args:
			business_text: business_id -> iterable of review texts.
			top_n: number of most frequent nouns kept per business.

		Returns:
			dict mapping business_id to a list of at most top_n words tagged
			'NN' by nltk, most frequent first.
		"""
		noun_tags = ['NN']
		business_top_aspect = dict()
		for business_id, texts in business_text.items():
			tag_count = defaultdict(int)
			for text in texts:
				token = nltk.word_tokenize(text)
				for (word, tag) in nltk.pos_tag(token):
					if tag in noun_tags:
						# Count words one by one; the original stored whole
						# lists, which cannot be used as dict keys.
						tag_count[word] += 1
			sorted_tagcount = sorted(tag_count.items(), key=lambda item: item[1], reverse=True)
			business_top_aspect[business_id] = [word for word, _ in sorted_tagcount[:top_n]]

		return business_top_aspect


if __name__ == '__main__':
	# Script entry point: load the businesses and their reviews, then
	# extract the aspects of every business.
	business = Business()
	business_name = business.load_business_data('./data/business.json')
	# load_review_data returns six mappings; only the first three are needed
	# here, so slice instead of unpacking into too few names.
	review_data = business.load_review_data('./data/review.json', business_name)
	business_review, business_stars, review_text = review_data[:3]
	# extract_aspects expects business_id -> list of texts, so gather the
	# texts of every review that belongs to a business.
	business_text = dict()
	for business_id, review_ids in business_review.items():
		texts = []
		for review_id in review_ids:
			texts.extend(review_text.get(review_id, []))
		business_text[business_id] = texts
	business.extract_aspects(business_text)
	print('end')