Commit 441670aa by 20200519016

add project2

parent d5cfc57d
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (Project1-master-5db594a1ca8abe8d7c541c2cce831979640929fc)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1.iml" filepath="$PROJECT_DIR$/.idea/project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7 (Project1-master-5db594a1ca8abe8d7c541c2cce831979640929fc)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
import os
import io
import nltk
import random
class Business(object):
"""
用来表示跟business相关的变量和函数
"""
# SENTIMENT_MODEL = SentimentModel() # 把已经训练好的模型存放在文件里,并导入进来
def __init__(self):
        # initialize member variables
        self.business_name = ""
        self.overall_rating = 0
self.business_file = './data/business.json'
self.business_id = dict()
    def filter_business_file(self):
        # placeholder: pre-filtering of the raw business file is not implemented yet
        # business_dict = json.load(open(self.business_file, 'r', encoding='utf-8'))
        # print(business_dict)
        pass
    def load_business_data(self, business_dir):
        print(business_dir)
        business_all = []
        business_name = dict()
        # business.json is in JSON-lines format: one JSON object per line
        with io.open(business_dir, 'r', encoding='utf-8') as f:
            for line in f:
                dic = json.loads(line)
                business_all.append(dic)
        for dic in business_all:
            if dic['review_count'] < 100:
                continue  # skip businesses with fewer than 100 reviews
            if dic['business_id'] not in business_name:
                business_name[dic['business_id']] = dic['name']
            else:
                print("business_id %s has different names %s" % (dic['business_id'], dic['name']))
        return business_name
    def load_review_data(self, review_dir, business_name):
        print(review_dir)
        review_all = []
        business_review = dict()
        business_stars = dict()
        review_text = dict()
        user_star = dict()
        review_user = dict()
        review_star = dict()
        count = 0
        # review.json is JSON-lines as well; only read the first 1000 lines for testing
        with io.open(review_dir, 'r', encoding='utf-8') as f:
            for line in f:
                count += 1
                if count >= 1000:
                    break
                dic = json.loads(line)
                review_all.append(dic)
for dic in review_all:
if dic['user_id'] not in user_star.keys():
user_star[dic['user_id']] = []
user_star[dic['user_id']].append(dic['stars'])
if dic['review_id'] not in review_user.keys():
review_user[dic['review_id']] = []
review_user[dic['review_id']].append(dic['user_id'])
if dic['review_id'] not in review_star.keys():
review_star[dic['review_id']] = []
review_star[dic['review_id']].append(dic['stars'])
if dic['business_id'] not in business_name.keys():
continue
if dic['business_id'] not in business_review.keys():
business_review[dic['business_id']] = set()
business_stars[dic['business_id']] = []
if dic['review_id'] not in review_text.keys():
review_text[dic['review_id']] = []
business_review[dic['business_id']].add(dic['review_id'])
business_stars[dic['business_id']].append(dic['stars'])
            review_text[dic['review_id']].append(dic['text'])
return business_review, business_stars, review_text, user_star, review_user, review_star
    def generate_model_data(self, review_text, review_user, user_star, review_star):
        # label a review 1 (positive) if its stars are at least 0.5 above the
        # author's average rating, 0 (negative) if at least 0.5 below; drop the rest
        data = []
        for review_id, texts in review_text.items():
            text = ' '.join(texts)  # each review normally carries a single text
            user = review_user[review_id][0]
            ave_star = sum(user_star[user]) / len(user_star[user])
            diff = review_star[review_id][0] - ave_star
            if diff >= 0.5:
                data.append((text, 1))
            elif diff <= -0.5:
                data.append((text, 0))
            # reviews close to the user's average are dropped as ambiguous
        random.shuffle(data)
        split = int(len(data) * 0.9)  # slice indices must be integers
        train_data, test_data = data[:split], data[split:]
        return train_data, test_data
    def get_aspect_summary(self, business_id, aspect):
        # NOTE: skeleton code -- self.data, get_segment, sentiment_model and
        # threshold are placeholders expected to be provided elsewhere
        pos_sents, neg_sents = [], []
        stars = 0.0
        reviews = self.data[business_id]
        for review in reviews:
            if aspect not in review.text:
                continue
            review_segment = get_segment(review, aspect)
            score = sentiment_model.predict(review_segment)
            stars += review.stars
            if score > threshold:
                pos_sents.append(review_segment)
            else:
                neg_sents.append(review_segment)
        total = len(pos_sents) + len(neg_sents)
        stars = stars / total if total else 0.0  # avoid dividing by zero
        return dict(rating=stars, pos=pos_sents, neg=neg_sents)
def aspect_based_summary(self, business_id):
"""
返回一个business的summary. 针对于每一个aspect计算出它的正面负面情感以及TOP reviews.
具体细节请看给定的文档。
"""
business_rating = self.get_business_score(business_id)
aspect_summary = defaultdict(dict)
aspects = self.get_business_aspects(business_id)
for aspect in aspects:
aspect_summary[aspect] = self.get_aspect_summary(business_id, aspect)
return dict(business_id=business_id,
business_name='',
business_rating=business_rating,
aspect_summary=aspect_summary)
def extract_aspects(self, business_text):
"""
从一个business的review中抽取aspects
"""
business_aspect = dict()
noun_tags = ['NN']
for business_id, texts in business_text.items():
for text in texts:
token = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(token)
np_tags = []
for (word, tag) in pos_tags:
if tag in noun_tags:
np_tags.append(word)
                if business_id not in business_aspect:
                    business_aspect[business_id] = []
                business_aspect[business_id].extend(np_tags)  # collect nouns per business
        business_top_aspect = dict()
        for business_id, tags in business_aspect.items():
            # count how often each noun occurs for this business
            tag_count = dict()
            for tag in tags:
                if tag not in tag_count:
                    tag_count[tag] = 0
                tag_count[tag] += 1
            # keep the five most frequent nouns as the business's aspects
            sorted_tagcount = sorted(tag_count.items(), key=lambda item: item[1], reverse=True)
            business_top_aspect[business_id] = [tag for tag, _ in sorted_tagcount[:5]]
        return business_top_aspect
if __name__ == '__main__':
    business = Business()
    business_name = business.load_business_data('./data/business.json')
    (business_review, business_stars, review_text,
     user_star, review_user, review_star) = business.load_review_data('./data/review.json', business_name)
    # group review texts by business for aspect extraction
    business_text = {bid: [t for rid in rids for t in review_text.get(rid, [])]
                     for bid, rids in business_review.items()}
    business.extract_aspects(business_text)
    print('end')
\ No newline at end of file
Unzip the data into this folder.
business.json:
format:
business_id,name,address,postal_code,latitude,longitude,
stars,review_count,is_open,attributes
review.json:
format:
review_id,user_id,business_id,stars,useful,funny,cool,text,date
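Both files are in JSON-lines format: one JSON object per line. A minimal sketch of reading a single record, assuming review.json has been unzipped into this folder:
import json
# read and parse the first record of review.json (one JSON object per line)
with open('review.json', 'r', encoding='utf-8') as f:
    record = json.loads(f.readline())
print(record['business_id'], record['stars'])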
1. Manually downloading the nltk punkt tokenizer data:
Direct download link for punkt: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
2. Extracting:
After downloading, unzip punkt into the local nltk_data/tokenizers directory.
3. Test case:
import nltk
word = 'hello world'
words = nltk.word_tokenize(word)
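Alternatively, if the machine has network access, punkt can be installed with nltk's built-in downloader instead of the manual steps above:
import nltk
nltk.download('punkt')  # fetches punkt into the default nltk_data directory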
\ No newline at end of file
{"review_id":"F7POrJsNbhu493DSTMPXjw","user_id":"nsS4oDfOsl20QdWc6XcOkw","business_id":"gnKjwL_1w79qoiV3IC_xQQ","stars":2.0,"useful":1,"funny":0,"cool":0,"text":"Husband was craving Chicken Teriyaki & gyoza, so we found Musashi. I was very unimpressed. We started with gyoza and edamame. Neither were anything special. We then ordered a chicken teriyaki plate and a few sushi rolls. The chicken teriyaki was nothing more than some boiled chicken smothered in teriyaki sauce. Was not good at all. The sushi was mediocre at best. While they were friendly and the service was pretty good - I will not be back.","date":"2014-02-24 02:51:56"}
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import model  # FeatureBuilder and the classifiers live in model.py
def get_review_summary_for_business(biz_id):
    # get the review summary for a single business
    pass  # TODO: implement
def main():
    # NOTE: skeleton code -- BusinessManager, model_path and get_business_ids
    # are placeholders expected to be defined or imported elsewhere
    mgr = BusinessManager('data/')
    train_data, test_data = mgr.generate_model_data()
    feature_builder = model.FeatureBuilder('tfidf')
    X_train, y_train, X_test, y_test = feature_builder.get_feature(train_data, test_data)
    lrmodel = model.LinearModel()
    lrmodel.train(X_train, y_train)
    lrmodel.save(model_path)
    mgr.set_sentiment_model(lrmodel)
    business_ids = get_business_ids()
    for bid in business_ids:
        summary = mgr.aspect_based_summary(bid)  # summarize one business per call
        print(summary)
if __name__ == "__main__":
main()
This folder stores the trained models (the sentiment-analysis model and/or others). Please train the models ahead of time, serialize them to a file, and load that file directly at run time.
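As one way to do that serialization, here is a minimal sketch using joblib; the file name sentiment_lr.joblib and the stand-in classifier are illustrative only, not part of the project:
from joblib import dump, load
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()         # stands in for the trained sentiment model
dump(clf, 'sentiment_lr.joblib')   # after training: persist into this folder
clf = load('sentiment_lr.joblib')  # at run time: load back without retraining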
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This file covers model training: given the dataset, train the sentiment
# classifier and store the model file in the model folder.
import os, sys
import numpy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
class FeatureBuilder():
    def __init__(self, method='tfidf'):
        self.method = method
    def get_feature(self, train_data, test_data, tokenizer=None):
        if self.method == 'tfidf':
            return self.get_tfidf_feature(train_data, test_data)
        elif self.method == 'sentence piece':
            return self.get_bert_feature(train_data, test_data, tokenizer)
    def get_tfidf_feature(self, train_data, test_data):
        X_train_data, y_train = zip(*train_data)
        X_test_data, y_test = zip(*test_data)
        vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer
        X_train = vectorizer.fit_transform(X_train_data)  # features for the training data
        X_test = vectorizer.transform(X_test_data)  # features for the test data
        return X_train, y_train, X_test, y_test
    def get_bert_feature(self, train_data, test_data, tokenizer):
        return tokenizer.encode(train_data), tokenizer.encode(test_data)
class LinearModel():
    def __init__(self):
        self.algorithm = 'LR'
        grid = {"C": numpy.logspace(-3, 3, 7)}  # regularization strengths to search
        self.logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
        self.logreg_cv = GridSearchCV(self.logreg, grid, cv=10, scoring='f1')
    def train(self, X_train, y_train):
        self.logreg_cv.fit(X_train, y_train)
    def predict(self, X_test):
        return self.logreg_cv.predict(X_test)
    def evaluate(self, X_test, y_test):
        # report precision/recall/F1 on held-out data
        print(classification_report(y_test, self.predict(X_test)))
    def save(self, model_path):
        # serialize the fitted model so main() can store it in the model folder
        joblib.dump(self.logreg_cv, model_path)
class NNModel():
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
        self.init_model()
    def init_model(self):
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    def get_tokenizer(self):
        return self.tokenizer
    def train(self, X_train, y_train):
        # NOTE: valid_dataset is a placeholder expected to be provided elsewhere;
        # the Keras model takes TensorFlow tensors, not torch tensors
        history = self.model.fit(tf.constant(X_train), tf.constant(y_train),
                                 epochs=2, steps_per_epoch=115,
                                 validation_data=valid_dataset, validation_steps=7)
        self.model.save_pretrained('./save/')
    def predict(self, X_test):
        logits = self.model(tf.constant(X_test)).logits  # classification head returns logits
        return int(tf.argmax(logits, axis=-1)[0])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
class Sentence(object):
    # NOTE: MyPottsTokenizer is a project-specific tokenizer, not part of nltk;
    # it must be supplied alongside this skeleton
    WORD_TOKENIZER = nltk.MyPottsTokenizer(preserve_case=False)
    LEMMATIZER = nltk.WordNetLemmatizer()
    # extracts the aspects contained in each sentence
    ASP_EXTRACTOR = None  # TODO: plug in the aspect extractor
    def __init__(self):
        pass  # TODO
    def word_tokenize(self):
        pass  # TODO
    def pos_tag(self):
        pass  # TODO
    def lemmatize(self):
        pass  # TODO
    def contain_aspect(self):
        pass  # TODO
\ No newline at end of file
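For reference, a minimal sketch of the standard nltk calls these stubs would presumably wrap (MyPottsTokenizer above is project-specific and not covered here):
import nltk
from nltk.stem import WordNetLemmatizer

# assumes the punkt, averaged_perceptron_tagger and wordnet data are installed
text = 'The noodles were great but the service was slow'
tokens = nltk.word_tokenize(text)   # ['The', 'noodles', 'were', ...]
tags = nltk.pos_tag(tokens)         # [('The', 'DT'), ('noodles', 'NNS'), ...]
lemmas = [WordNetLemmatizer().lemmatize(w.lower()) for w in tokens]  # 'noodles' -> 'noodle'
print('noodle' in lemmas)           # crude aspect-containment check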