Commit 441670aa by 20200519016

add project2

parent d5cfc57d
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (Project1-master-5db594a1ca8abe8d7c541c2cce831979640929fc)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1.iml" filepath="$PROJECT_DIR$/.idea/project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7 (Project1-master-5db594a1ca8abe8d7c541c2cce831979640929fc)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
import os
import io
import nltk
import random
class Business(object):
"""
用来表示跟business相关的变量和函数
"""
# SENTIMENT_MODEL = SentimentModel() # 把已经训练好的模型存放在文件里,并导入进来
def __init__(self):
        # initialize member variables
        self.business_name = ""
        self.overall_rating = 0
self.business_file = './data/business.json'
self.business_id = dict()
    def filter_business_file(self):
        # placeholder: pre-filtering of the raw business file is not implemented yet
        # business_dict = json.load(open(self.business_file, 'r', encoding='utf-8'))
        # print(business_dict)
        pass
    def load_business_data(self, business_dir):
        print(business_dir)
        business_all = []
        business_name = dict()
        # business.json is in JSON-lines format: one JSON object per line
        with io.open(business_dir, 'r', encoding='utf-8') as f:
            for line in f:
                dic = json.loads(line)
                business_all.append(dic)
        for dic in business_all:
            if dic['review_count'] < 100:
                continue  # skip businesses with fewer than 100 reviews
            if dic['business_id'] not in business_name:
                business_name[dic['business_id']] = dic['name']
            else:
                print("business_id %s has different names %s" % (dic['business_id'], dic['name']))
        return business_name
    def load_review_data(self, review_dir, business_name):
        print(review_dir)
        review_all = []
        business_review = dict()
        business_stars = dict()
        review_text = dict()
        user_star = dict()
        review_user = dict()
        review_star = dict()
        count = 0
        # review.json is JSON-lines as well; only read the first 1000 lines for testing
        with io.open(review_dir, 'r', encoding='utf-8') as f:
            for line in f:
                count += 1
                if count >= 1000:
                    break
                dic = json.loads(line)
                review_all.append(dic)
for dic in review_all:
if dic['user_id'] not in user_star.keys():
user_star[dic['user_id']] = []
user_star[dic['user_id']].append(dic['stars'])
if dic['review_id'] not in review_user.keys():
review_user[dic['review_id']] = []
review_user[dic['review_id']].append(dic['user_id'])
if dic['review_id'] not in review_star.keys():
review_star[dic['review_id']] = []
review_star[dic['review_id']].append(dic['stars'])
if dic['business_id'] not in business_name.keys():
continue
if dic['business_id'] not in business_review.keys():
business_review[dic['business_id']] = set()
business_stars[dic['business_id']] = []
if dic['review_id'] not in review_text.keys():
review_text[dic['review_id']] = []
business_review[dic['business_id']].add(dic['review_id'])
business_stars[dic['business_id']].append(dic['stars'])
            review_text[dic['review_id']].append(dic['text'])
return business_review, business_stars, review_text, user_star, review_user, review_star
    def generate_model_data(self, review_text, review_user, user_star, review_star):
        # label a review 1 (positive) if its stars are at least 0.5 above the
        # author's average rating, 0 (negative) if at least 0.5 below; drop the rest
        data = []
        for review_id, texts in review_text.items():
            text = ' '.join(texts)  # each review normally carries a single text
            user = review_user[review_id][0]
            ave_star = sum(user_star[user]) / len(user_star[user])
            diff = review_star[review_id][0] - ave_star
            if diff >= 0.5:
                data.append((text, 1))
            elif diff <= -0.5:
                data.append((text, 0))
            # reviews close to the user's average are dropped as ambiguous
        random.shuffle(data)
        split = int(len(data) * 0.9)  # slice indices must be integers
        train_data, test_data = data[:split], data[split:]
        return train_data, test_data
    def get_aspect_summary(self, business_id, aspect):
        # NOTE: skeleton code -- self.data, get_segment, sentiment_model and
        # threshold are placeholders expected to be provided elsewhere
        pos_sents, neg_sents = [], []
        stars = 0.0
        reviews = self.data[business_id]
        for review in reviews:
            if aspect not in review.text:
                continue
            review_segment = get_segment(review, aspect)
            score = sentiment_model.predict(review_segment)
            stars += review.stars
            if score > threshold:
                pos_sents.append(review_segment)
            else:
                neg_sents.append(review_segment)
        total = len(pos_sents) + len(neg_sents)
        stars = stars / total if total else 0.0  # avoid dividing by zero
        return dict(rating=stars, pos=pos_sents, neg=neg_sents)
def aspect_based_summary(self, business_id):
"""
返回一个business的summary. 针对于每一个aspect计算出它的正面负面情感以及TOP reviews.
具体细节请看给定的文档。
"""
business_rating = self.get_business_score(business_id)
aspect_summary = defaultdict(dict)
aspects = self.get_business_aspects(business_id)
for aspect in aspects:
aspect_summary[aspect] = self.get_aspect_summary(business_id, aspect)
return dict(business_id=business_id,
business_name='',
business_rating=business_rating,
aspect_summary=aspect_summary)
def extract_aspects(self, business_text):
"""
从一个business的review中抽取aspects
"""
business_aspect = dict()
noun_tags = ['NN']
for business_id, texts in business_text.items():
for text in texts:
token = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(token)
np_tags = []
for (word, tag) in pos_tags:
if tag in noun_tags:
np_tags.append(word)
                if business_id not in business_aspect:
                    business_aspect[business_id] = []
                business_aspect[business_id].extend(np_tags)  # collect nouns per business
        business_top_aspect = dict()
        for business_id, tags in business_aspect.items():
            # count how often each noun occurs for this business
            tag_count = dict()
            for tag in tags:
                if tag not in tag_count:
                    tag_count[tag] = 0
                tag_count[tag] += 1
            # keep the five most frequent nouns as the business's aspects
            sorted_tagcount = sorted(tag_count.items(), key=lambda item: item[1], reverse=True)
            business_top_aspect[business_id] = [tag for tag, _ in sorted_tagcount[:5]]
        return business_top_aspect
if __name__ == '__main__':
    business = Business()
    business_name = business.load_business_data('./data/business.json')
    (business_review, business_stars, review_text,
     user_star, review_user, review_star) = business.load_review_data('./data/review.json', business_name)
    # group review texts by business for aspect extraction
    business_text = {bid: [t for rid in rids for t in review_text.get(rid, [])]
                     for bid, rids in business_review.items()}
    business.extract_aspects(business_text)
    print('end')
\ No newline at end of file
Unzip the data into this folder.
business.json:
format:
business_id,name,address,postal_code,latitude,longitude,
stars,review_count,is_open,attributes
review.json:
format:
review_id,user_id,business_id,stars,useful,funny,cool,text,date
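Both files are in JSON-lines format: one JSON object per line. A minimal sketch of reading a single record, assuming review.json has been unzipped into this folder:
import json
# read and parse the first record of review.json (one JSON object per line)
with open('review.json', 'r', encoding='utf-8') as f:
    record = json.loads(f.readline())
print(record['business_id'], record['stars'])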
1. Manually downloading the nltk punkt tokenizer data:
Direct download link for punkt: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
2. Extracting:
After downloading, unzip punkt into the local nltk_data/tokenizers directory.
3. Test case:
import nltk
word = 'hello world'
words = nltk.word_tokenize(word)
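Alternatively, if the machine has network access, punkt can be installed with nltk's built-in downloader instead of the manual steps above:
import nltk
nltk.download('punkt')  # fetches punkt into the default nltk_data directory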
\ No newline at end of file
{"review_id":"F7POrJsNbhu493DSTMPXjw","user_id":"nsS4oDfOsl20QdWc6XcOkw","business_id":"gnKjwL_1w79qoiV3IC_xQQ","stars":2.0,"useful":1,"funny":0,"cool":0,"text":"Husband was craving Chicken Teriyaki & gyoza, so we found Musashi. I was very unimpressed. We started with gyoza and edamame. Neither were anything special. We then ordered a chicken teriyaki plate and a few sushi rolls. The chicken teriyaki was nothing more than some boiled chicken smothered in teriyaki sauce. Was not good at all. The sushi was mediocre at best. While they were friendly and the service was pretty good - I will not be back.","date":"2014-02-24 02:51:56"}
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import model  # FeatureBuilder and the classifiers live in model.py
def get_review_summary_for_business(biz_id):
    # get the review summary for a single business
    pass  # TODO: implement
def main():
    # NOTE: skeleton code -- BusinessManager, model_path and get_business_ids
    # are placeholders expected to be defined or imported elsewhere
    mgr = BusinessManager('data/')
    train_data, test_data = mgr.generate_model_data()
    feature_builder = model.FeatureBuilder('tfidf')
    X_train, y_train, X_test, y_test = feature_builder.get_feature(train_data, test_data)
    lrmodel = model.LinearModel()
    lrmodel.train(X_train, y_train)
    lrmodel.save(model_path)
    mgr.set_sentiment_model(lrmodel)
    business_ids = get_business_ids()
    for bid in business_ids:
        summary = mgr.aspect_based_summary(bid)  # summarize one business per call
        print(summary)
if __name__ == "__main__":
main()
This folder stores the trained models (the sentiment-analysis model and/or others). Please train the models ahead of time, serialize them to a file, and load that file directly at run time.
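As one way to do that serialization, here is a minimal sketch using joblib; the file name sentiment_lr.joblib and the stand-in classifier are illustrative only, not part of the project:
from joblib import dump, load
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()         # stands in for the trained sentiment model
dump(clf, 'sentiment_lr.joblib')   # after training: persist into this folder
clf = load('sentiment_lr.joblib')  # at run time: load back without retraining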
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This file covers model training: given the dataset, train the sentiment
# classifier and store the model file in the model folder.
import os, sys
import numpy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
class FeatureBuilder():
    def __init__(self, method='tfidf'):
        self.method = method
    def get_feature(self, train_data, test_data, tokenizer=None):
        if self.method == 'tfidf':
            return self.get_tfidf_feature(train_data, test_data)
        elif self.method == 'sentence piece':
            return self.get_bert_feature(train_data, test_data, tokenizer)
    def get_tfidf_feature(self, train_data, test_data):
        X_train_data, y_train = zip(*train_data)
        X_test_data, y_test = zip(*test_data)
        vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer
        X_train = vectorizer.fit_transform(X_train_data)  # features for the training data
        X_test = vectorizer.transform(X_test_data)  # features for the test data
        return X_train, y_train, X_test, y_test
    def get_bert_feature(self, train_data, test_data, tokenizer):
        return tokenizer.encode(train_data), tokenizer.encode(test_data)
class LinearModel():
    def __init__(self):
        self.algorithm = 'LR'
        grid = {"C": numpy.logspace(-3, 3, 7)}  # regularization strengths to search
        self.logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
        self.logreg_cv = GridSearchCV(self.logreg, grid, cv=10, scoring='f1')
    def train(self, X_train, y_train):
        self.logreg_cv.fit(X_train, y_train)
    def predict(self, X_test):
        return self.logreg_cv.predict(X_test)
    def evaluate(self, X_test, y_test):
        # report precision/recall/F1 on held-out data
        print(classification_report(y_test, self.predict(X_test)))
    def save(self, model_path):
        # serialize the fitted model so main() can store it in the model folder
        joblib.dump(self.logreg_cv, model_path)
class NNModel():
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
        self.init_model()
    def init_model(self):
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    def get_tokenizer(self):
        return self.tokenizer
    def train(self, X_train, y_train):
        # NOTE: valid_dataset is a placeholder expected to be provided elsewhere;
        # the Keras model takes TensorFlow tensors, not torch tensors
        history = self.model.fit(tf.constant(X_train), tf.constant(y_train),
                                 epochs=2, steps_per_epoch=115,
                                 validation_data=valid_dataset, validation_steps=7)
        self.model.save_pretrained('./save/')
    def predict(self, X_test):
        logits = self.model(tf.constant(X_test)).logits  # classification head returns logits
        return int(tf.argmax(logits, axis=-1)[0])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
class Sentence(object):
    # NOTE: MyPottsTokenizer is a project-specific tokenizer, not part of nltk;
    # it must be supplied alongside this skeleton
    WORD_TOKENIZER = nltk.MyPottsTokenizer(preserve_case=False)
    LEMMATIZER = nltk.WordNetLemmatizer()
    # extracts the aspects contained in each sentence
    ASP_EXTRACTOR = None  # TODO: plug in the aspect extractor
    def __init__(self):
        pass  # TODO
    def word_tokenize(self):
        pass  # TODO
    def pos_tag(self):
        pass  # TODO
    def lemmatize(self):
        pass  # TODO
    def contain_aspect(self):
        pass  # TODO
\ No newline at end of file
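For reference, a minimal sketch of the standard nltk calls these stubs would presumably wrap (MyPottsTokenizer above is project-specific and not covered here):
import nltk
from nltk.stem import WordNetLemmatizer

# assumes the punkt, averaged_perceptron_tagger and wordnet data are installed
text = 'The noodles were great but the service was slow'
tokens = nltk.word_tokenize(text)   # ['The', 'noodles', 'were', ...]
tags = nltk.pos_tag(tokens)         # [('The', 'DT'), ('noodles', 'NNS'), ...]
lemmas = [WordNetLemmatizer().lemmatize(w.lower()) for w in tokens]  # 'noodles' -> 'noodle'
print('noodle' in lemmas)           # crude aspect-containment check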