# Commit 41de1f9c by 20200519040 — Project 2.
# (Commit-page header captured by the scraper, converted to a comment so the
# file remains valid Python.)
import nltk,os
from collections import Counter,defaultdict
import yaml
import json
# Absolute path to the business metadata JSON file.
# NOTE(review): hard-coded, machine-specific Windows path, and it is not
# referenced anywhere in the visible code — confirm whether it can be removed
# or should be made configurable (e.g. a CLI argument or environment variable).
businessfilepath = r'C:\Users\goooo\OneDrive\Coding\greed\project3-master-04600864b6ccdfe409baaf26c0daea95b3bb45c1\data\business.json'
class Review():
    """One customer review of a business.

    Wraps a single record parsed from a reviews JSON file and provides
    helpers for extracting candidate "aspects" (singular-noun topics)
    from review text via NLTK.
    """

    def __init__(self, data):
        """Initialize from a parsed JSON dict.

        Expected keys: 'text', 'stars', 'user_id', 'business_id'.
        """
        self.text = data['text']
        self.stars = data['stars']
        # BUG FIX: the original read data['usesr_id'] into self.usesr_id,
        # but every caller in this file accesses review.user_id.  Accept
        # either key so data written with the old typo still loads.
        self.user_id = data.get('user_id', data.get('usesr_id'))
        self.business_id = data['business_id']

    def extract_aspects(self, sent):
        """Extract aspect candidates (tokens POS-tagged 'NN') from one sentence."""
        aspects = set()
        for word, tag in nltk.pos_tag(nltk.word_tokenize(sent)):
            if tag == 'NN':
                aspects.add(word)
        return aspects

    def compute_doc_aspects(self, doc, topk=5):
        """Return the topk most frequent aspects across all lines of *doc*.

        doc: iterable of text lines; each line is sentence-tokenized first.
        Returns a tuple of aspect strings (empty tuple if nothing was found).
        """
        sents = []
        for line in doc:
            sents.extend(nltk.sent_tokenize(line))
        topic = Counter()
        for sent in sents:
            # BUG FIX: extract_aspects was called as a bare name (NameError);
            # it is a method on this class.
            topic.update(self.extract_aspects(sent))
        if not topic:
            # BUG FIX: zip(*[]) would raise ValueError on unpacking.
            return ()
        aspects, _freq = zip(*topic.most_common(topk))
        return aspects
class BusinessManager(object):
    """Holds reviews grouped by business and builds aspect-based summaries.

    Responsibilities: load line-delimited JSON review files, track per-user
    average star ratings, compute per-business aspect lists, and produce
    sentiment-scored summaries per aspect.
    """

    def __init__(self, init_dir=None):
        self.data = defaultdict(list)       # business_id -> list[Review]
        self.aspects = defaultdict(list)    # business_id -> aspect strings
        self.user_stars = {}                # user_id -> that user's mean stars
        self.sentiment_model = None         # set via set_sentiment_model()
        if init_dir:
            self.load_data(init_dir)

    def load_data(self, review_dir):
        """Load every file under *review_dir* (one JSON review per line)."""
        user_stars = defaultdict(float)
        user_counts = defaultdict(int)
        for review_file in os.listdir(review_dir):
            review_path = os.path.join(review_dir, review_file)
            # BUG FIX: use a context manager so the handle is closed, and keep
            # every line — the original parsed all lines but then used only
            # review_data_temp[0], silently dropping the rest of the file.
            with open(review_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    review = Review(json.loads(line))
                    # BUG FIX: defaultdict.get() does NOT create the default
                    # entry (it returns None, then .append crashed); index to
                    # trigger the factory.
                    self.data[review.business_id].append(review)
                    user_stars[review.user_id] += review.stars
                    user_counts[review.user_id] += 1
        # BUG FIX: average each user's stars over that user's own review
        # count, not over the total number of distinct users.
        self.user_stars = {uid: total / user_counts[uid]
                           for uid, total in user_stars.items()}

    def get_business_ids(self):
        # BUG FIX: original signature was missing `self`.
        return list(self.data.keys())

    def get_business_reviews(self, business_id):
        """Return the reviews for one business ([] if unknown)."""
        return self.data.get(business_id, [])

    def load_aspects(self, aspect_config):
        """Load a precomputed business_id -> aspects mapping from a YAML file."""
        assert os.path.exists(aspect_config)
        # BUG FIX: the original passed the *path string* to yaml.safe_load,
        # which would parse the path text itself; open the file instead.
        with open(aspect_config, 'r', encoding='utf-8') as f:
            self.aspects = yaml.safe_load(f)

    def build_aspects(self):
        """Compute the top aspects for every loaded business."""
        for business_id, reviews in self.data.items():
            if not reviews:
                continue
            doc = [review.text for review in reviews]
            # BUG FIX: compute_doc_aspects is a Review method; the original
            # called it as a bare name (NameError).
            self.aspects[business_id] = reviews[0].compute_doc_aspects(doc, topk=5)

    def get_business_aspects(self, business_id):
        if business_id not in self.aspects:
            print('not find business_id')
            return []
        return self.aspects.get(business_id)

    def get_all_reviews(self):
        # BUG FIX: the original comprehension had its for-clauses reversed
        # (`for review in reviews` before `reviews` was bound — NameError).
        return [review for reviews in self.data.values() for review in reviews]

    def get_business_score(self, business_id):
        """Mean star rating for one business (0.0 if it has no reviews)."""
        scores = [review.stars for review in self.data[business_id]]
        return sum(scores) / len(scores) if scores else 0.0

    def get_user_score(self, user_id):
        """Mean star rating given by one user (0.0 if they have no reviews)."""
        scores = [r.stars for r in self.get_all_reviews() if r.user_id == user_id]
        return sum(scores) / len(scores) if scores else 0.0

    def get_aspect_summary(self, business_id, aspect, threshold=0.5):
        """Collect positive/negative segments and a mean rating for *aspect*.

        threshold: sentiment-model score above which a segment counts as
        positive (new keyword with a default — backward compatible; the
        original referenced an undefined bare name `threshold`).
        """
        pos_sents, neg_sents = [], []
        stars = 0.0
        for review in self.data[business_id]:
            # BUG FIX: str has no .contains() method; use the `in` operator.
            if aspect not in review.text:
                continue
            # NOTE(review): get_segment is defined elsewhere in the project;
            # presumably it extracts the aspect-relevant snippet — confirm.
            review_segment = get_segment(review, aspect)
            # BUG FIX: sentiment_model was an unqualified bare name.
            score = self.sentiment_model.predict(review_segment)
            stars += review.stars
            if score > threshold:
                pos_sents.append(review_segment)
            else:
                neg_sents.append(review_segment)
        matched = len(pos_sents) + len(neg_sents)
        # Guard against ZeroDivisionError when no review mentions the aspect.
        stars = stars / matched if matched else 0.0
        return dict(rating=stars, pos=pos_sents, neg=neg_sents)

    def aspect_based_summary(self, business_id):
        """Return a summary for one business: per-aspect sentiment and top reviews."""
        business_rating = self.get_business_score(business_id)
        aspect_summary = defaultdict(dict)
        for aspect in self.get_business_aspects(business_id):
            aspect_summary[aspect] = self.get_aspect_summary(business_id, aspect)
        return dict(business_id=business_id,
                    business_name='',
                    business_rating=business_rating,
                    aspect_summary=aspect_summary)

    def generate_model_data(self):
        """Build (text, label) pairs and split 90/10 into train/test.

        Label 1 = review is >= 0.5 stars above the user's own average,
        label 0 = review is <= 0.5 stars below it; reviews near the user's
        average are dropped as ambiguous.
        """
        import random  # local import: `random` is not in this file's imports

        assert self.user_stars, "please load review data at first"
        data = []
        for review in self.get_all_reviews():
            ave_star = self.user_stars.get(review.user_id, 0.0)
            delta = review.stars - ave_star
            # BUG FIX: the second branch must be `elif` — with two separate
            # `if`s the final `else` paired only with the second test, so
            # positive examples were also dropped.
            if delta >= 0.5:
                data.append((review.text, 1))
            elif delta <= -0.5:
                data.append((review.text, 0))
        random.shuffle(data)
        # BUG FIX: list indices must be ints; len(data)*0.9 is a float.
        split = int(len(data) * 0.9)
        return data[:split], data[split:]

    def set_sentiment_model(self, sentiment_model):
        self.sentiment_model = sentiment_model
import os, sys
import json
from collections import defaultdict
from business import BusinessManager
import model
def main():
    """Train a sentiment model on the review data, then print an
    aspect-based summary for every business.
    """
    mgr = BusinessManager('data/')
    train_data, test_data = mgr.generate_model_data()
    feature_builder = model.FeatureBuilder('tfidf')
    X_train, y_train, X_test, y_test = feature_builder.get_feature(train_data, test_data)
    lrmodel = model.LinearModel()
    lrmodel.train(X_train, y_train)
    # BUG FIX: model_path was an undefined bare name; give an explicit
    # save location.  NOTE(review): confirm LinearModel exposes save().
    lrmodel.save('./save/lrmodel.pkl')
    mgr.set_sentiment_model(lrmodel)
    # BUG FIX: get_business_ids is a method of the manager (bare call was a
    # NameError), and the summary must be built per individual id — the
    # original passed the whole `business_ids` list on every iteration.
    for bid in mgr.get_business_ids():
        summary = mgr.aspect_based_summary(bid)
        print(summary)


if __name__ == "__main__":
    main()
import os
import sys

import numpy
import sklearn
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
import tensorflow_datasets
import torch
from transformers import *
class FeatureBuilder():
    """Turns (text, label) pairs into feature matrices for a classifier.

    method: 'tfidf' for TF-IDF bag-of-words features, or 'sentence piece'
    to encode texts with a caller-supplied (BERT) tokenizer.
    """

    def __init__(self, method='tfidf'):
        # BUG FIX: the constructor was misspelled `__init_`, so it was never
        # invoked and self.method was never set.
        self.method = method

    def get_feature(self, train_data, test_data, tokenizer=None):
        """Dispatch to the featurizer selected by self.method.

        Returns (X_train, y_train, X_test, y_test) for 'tfidf'.
        Raises ValueError for an unknown method (the original silently
        returned None).
        """
        # BUG FIX: `self` was missing from this signature and the helper
        # calls below were bare names.
        if self.method == 'tfidf':
            return self.get_tfidf_feature(train_data, test_data)
        elif self.method == 'sentence piece':
            # BUG FIX: the tokenizer argument was never forwarded.
            return self.get_bert_feature(train_data, test_data, tokenizer)
        raise ValueError('unknown feature method: %r' % (self.method,))

    def get_tfidf_feature(self, train_data, test_data):
        """Fit a TF-IDF vectorizer on the train texts; transform both splits."""
        X_train_data, y_train = zip(*train_data)
        # BUG FIX: the original unpacked *train_data* again here, so the
        # "test" features were really the training texts.
        X_test_data, y_test = zip(*test_data)
        vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
        X_train = vectorizer.fit_transform(X_train_data)  # fit on train only
        X_test = vectorizer.transform(X_test_data)        # no leakage
        return X_train, y_train, X_test, y_test

    def get_bert_feature(self, train_data, test_data, tokenizer):
        """Encode raw texts with the supplied tokenizer."""
        return tokenizer.encode(train_data), tokenizer.encode(test_data)
class LinearModel():
    """Logistic-regression sentiment classifier with a grid-searched C."""

    def __init__(self):
        self.algorithm = 'LR'
        # BUG FIX: `numpy` was an unqualified bare name (not imported at the
        # top of this script) and GridSearchCV was handed the bare name
        # `logreg` instead of the instance attribute.
        grid = {"C": numpy.logspace(-3, 3, 7)}
        self.logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
        self.logreg_cv = sklearn.model_selection.GridSearchCV(
            self.logreg, grid, cv=10, scoring='f1')

    def train(self, X_train, y_train):
        """Fit the grid-searched model on the training split."""
        self.logreg_cv.fit(X_train, y_train)
        # BUG FIX: the original printed a report on undefined y_test/y_pred.
        # Report on the training split so the call is well-defined; hold-out
        # evaluation belongs with the caller, which owns the test split.
        y_pred = self.logreg_cv.predict(X_train)
        print(sklearn.metrics.classification_report(y_train, y_pred))

    def predict(self, X_test):
        """Return predicted labels for X_test."""
        # BUG FIX: `logreg_cv` was a bare name and the prediction was
        # computed but never returned.
        return self.logreg_cv.predict(X_test)
class NNModel():
    """BERT sequence classifier built on TensorFlow + huggingface transformers."""

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
        self.init_model()

    def init_model(self):
        """Compile the Keras model with optimizer, loss, and accuracy metric."""
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        # BUG FIX: `model` was an undefined bare name; compile the instance's
        # model.
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    def get_tokenizer(self):
        return self.tokenizer

    def train(self, X_train, y_train):
        """Fine-tune on tokenized inputs X_train with integer labels y_train."""
        # BUG FIX: the original wrapped the inputs in torch.tensor (this is a
        # TF/Keras model), ignored y_train entirely, and referenced an
        # undefined `valid_dataset`.  Feed TF tensors and the labels instead.
        # NOTE(review): epochs/steps kept minimal — confirm training schedule.
        history = self.model.fit(tf.constant(X_train), tf.constant(y_train),
                                 epochs=2)
        self.model.save_pretrained('./save/')
        return history

    def predict(self, X_test):
        """Return the argmax class index for one tokenized input."""
        # BUG FIX: torch tensors cannot be fed to a TF model, and the TF
        # classifier's output object exposes logits rather than being a
        # tensor itself.
        logits = self.model(tf.constant(X_test)).logits
        return int(tf.argmax(logits, axis=-1).numpy()[0])
# (End of source. The trailing lines here were GitLab comment-box boilerplate
# captured by the scraper — "Markdown is supported", "Please register or to
# comment", etc. — not program code; converted to this comment.)