Commit 0ec5485c by 20200913050

add files

parent e0fb400e
from model_training import SentimentModel
from sentence import Sentence


class Business(object):
    """
    Holds the variables and functions related to a business.
    """
    SENTIMENT_MODEL = SentimentModel()  # load the pretrained model that was saved to file

    def __init__(self, review_df):
        # keep the full review dataframe
        self.reviews = review_df

    def aspect_based_summary(self, biz_id, sentiment_model):
        """
        Return a summary for one business: for each aspect, compute its
        positive/negative sentiment and its top reviews.
        See the accompanying document for details.
        """
        review = self.reviews[self.reviews.business_id == biz_id]  # reviews of this business
        business_name = review.name.unique()[0]  # business name
        business_rating = review.stars.mean()  # overall business rating
        biz_info = dict()
        biz_info['business_id'] = biz_id
        biz_info['business_name'] = business_name
        biz_info['business_rating'] = business_rating
        # the top-5 aspects extracted from this business's reviews
        aspects = list(self.main_aspect_extract(biz_id, review)[biz_id].keys())
        for asp in aspects:
            # create an empty bucket for each of the 5 aspects
            biz_info[asp] = {'rating': [], 'pos': {}, 'neg': {}}
        for stars, rw in review[['stars', 'text']].values:
            rw = rw.lower()
            # assign the review to the first aspect it mentions, predict its
            # sentiment, and record the star rating for the corresponding aspect
            for asp in aspects:
                if asp in rw:
                    sent_pred, sent_score_pred = sentiment_model.predict([rw])
                    if sent_pred[0] == 'positive' and sent_score_pred[0][2] > 0.7:
                        biz_info[asp]['pos'][rw] = sent_score_pred[0][2]
                    elif sent_pred[0] == 'negative' and sent_score_pred[0][0] > 0.7:
                        biz_info[asp]['neg'][rw] = sent_score_pred[0][0]
                    # neutral or low-confidence reviews are not kept
                    biz_info[asp]['rating'].append(stars)
                    break
        for asp in aspects:
            # for every aspect keep the 5 reviews with the highest prediction score (from the sentiment model)
            biz_info[asp]['pos'] = dict(sorted(biz_info[asp]['pos'].items(), key=lambda item: item[1], reverse=True)[:5])
            biz_info[asp]['neg'] = dict(sorted(biz_info[asp]['neg'].items(), key=lambda item: item[1], reverse=True)[:5])
            ratings = biz_info[asp]['rating']
            biz_info[asp]['rating'] = round(sum(ratings) / len(ratings), 2) if ratings else 0
        return biz_info
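    # The returned dict has this shape (aspect names and values below are
    # hypothetical):
    #
    #     {'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
    #      'business_name': 'Some Restaurant',
    #      'business_rating': 4.1,
    #      'pizza': {'rating': 4.3,
    #                'pos': {'best pizza in town ...': 0.97, ...},
    #                'neg': {'the pizza was cold ...': 0.84, ...}},
    #      ...}  # one entry per extracted aspect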
    def main_aspect_extract(self, biz_id, review):
        """
        Extract the main aspects from one business's reviews.
        """
        bus_aspect = dict()
        sent = Sentence()
        clean_words = sent.clean_symbols(review.text.to_list())  # strip symbols and digits
        clean_words = sent.extract_aspect(clean_words)  # extract all candidate aspects (nouns)
        clean_words = sent.words_cnt(clean_words)  # count the frequency of each aspect
        aspects = dict(sorted(clean_words.items(), key=lambda item: item[1], reverse=True)[:5])  # pick the top five aspects
        bus_aspect[biz_id] = aspects
        return bus_aspect
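# main_aspect_extract returns a nested dict keyed by business id, e.g.
# (hypothetical counts):
#
#     {'ujmEBvifdJM6h6RLv4wQIg': {'pizza': 412, 'service': 305, 'place': 290,
#                                 'food': 277, 'staff': 194}}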
import pandas as pd
from model_training import SentimentModel
from business import Business


def processed_review():
    # dataset paths
    review_json_path = 'data/review.json'
    business_json_path = 'data/business.json'
    # load the datasets
    df_r = pd.read_json(review_json_path, lines=True)
    df_b = pd.read_json(business_json_path, lines=True)
    # count the reviews of each business
    review_cnt = df_r.groupby('business_id').agg({'review_id': pd.Series.nunique})
    review_cnt = review_cnt.reset_index()
    # keep only businesses with more than 100 reviews
    filter_review_cnt = review_cnt[review_cnt.review_id > 100]
    df_filter_review = df_r[df_r.business_id.isin(filter_review_cnt.business_id)]
    # attach the business name
    df_reviews = pd.merge(df_filter_review, df_b[['business_id', 'name']], on='business_id')
    return df_reviews
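# One-off caching step (a sketch, assuming write access to data/): persist the
# processed reviews so that main() below can read them back from CSV:
#
#     processed_review().to_csv('data/cleaned_reviews.csv', index=False)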
def get_review_summary_for_business(biz_id, reviews):
    # get the review summary of one business
    sm = SentimentModel()  # initialise the SentimentModel class
    business = Business(reviews)  # initialise the Business class
    aspect_sum = business.aspect_based_summary(biz_id, sm)  # get the aspect summary
    return aspect_sum
def main():
    # reviews = processed_review()  # uncomment this to process the raw review data
    df_reviews = pd.read_csv('data/cleaned_reviews.csv')  # output of processed_review()
    bus_ids = ['ujmEBvifdJM6h6RLv4wQIg', 'eU_713ec6fTGNO4BegRaww']  # a few hand-picked business ids
    for bus_id in bus_ids:
        print('\n')
        print("Working on biz_id %s" % bus_id)
        # start = time.time()
        summary = get_review_summary_for_business(bus_id, df_reviews)
        for i, (key, value) in enumerate(summary.items()):
            if i < 3:
                # business_id, business_name and business_rating
                print(key, ':', value)
            else:
                # the remaining entries are the extracted aspects
                print(f'Aspect {i - 2}')
                print(key, ':', value)


if __name__ == "__main__":
    main()
\ No newline at end of file
# This file contains the model training. Given a dataset, it trains the
# sentiment classifier and stores the model files in the model folder.
import pickle
import re

import numpy as np
import xgboost as xgb
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


class SentimentModel(object):
    def __init__(self):
        # load the pretrained models that were trained on 500000 reviews
        self.vect = pickle.load(open('pretrained_model/vect_model.sav', 'rb'))
        self.tfidf = pickle.load(open('pretrained_model/tfidf_model.sav', 'rb'))
        self.xgb_multi = pickle.load(open('pretrained_model/xgb_model.sav', 'rb'))

    def load_data(self, df):
        X = df.text.values
        y = df.sentiment.values
        return X, y
    def tokenize(self, text):
        symbols = re.compile(r'[\s;"",.!?$\\/\[\]\{\}\(\)-]+')
        digits = re.compile(r'\d+')
        clean_words = symbols.sub(' ', text)
        clean_words = digits.sub(' ', clean_words)
        tokens = word_tokenize(clean_words)
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for tok in tokens:
            clean_tok = lemmatizer.lemmatize(tok).lower().strip()
            clean_tokens.append(clean_tok)
        return clean_tokens
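    # e.g. tokenize("The 2 burgers were amazing!!") returns roughly
    # ['the', 'burger', 'were', 'amazing'] (exact lemmas depend on WordNet)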
    def display_results(self, y_test, y_pred):
        labels = np.unique(y_pred)
        confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
        accuracy = (y_pred == y_test).mean()
        print("Labels:", labels)
        print("Confusion Matrix:\n", confusion_mat)
        print("Accuracy:", accuracy)
    def model_building(self, df_review, sent):
        # label the sentiment of each review from its star rating
        df_review['sentiment'] = df_review.stars.apply(
            lambda x: 'positive' if x >= 4 else ('negative' if x <= 2 else 'neutral'))
        X = df_review.text.values
        y = df_review.sentiment.values
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        vect = CountVectorizer(tokenizer=sent.tokenize)
        tfidf = TfidfTransformer()
        # define the xgboost multi-class classifier
        # (the sklearn wrapper infers the number of classes from y)
        xgb_multi = xgb.XGBClassifier(objective='multi:softmax', n_jobs=-1,
                                      reg_alpha=0.9, reg_lambda=0.1,
                                      n_estimators=1000, learning_rate=0.01)
        # train the classifier
        X_train_counts = vect.fit_transform(X_train)
        X_train_tfidf = tfidf.fit_transform(X_train_counts)
        xgb_multi.fit(X_train_tfidf, y_train)
        # predict on the test data
        X_test_counts = vect.transform(X_test)
        X_test_tfidf = tfidf.transform(X_test_counts)
        y_pred = xgb_multi.predict(X_test_tfidf)
        # display the results
        sent.display_results(y_test, y_pred)
        return vect, tfidf, xgb_multi
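    # Persistence sketch (an assumed step; the pretrained_model/*.sav files
    # that __init__ loads are expected to have been produced like this):
    #
    #     vect, tfidf, xgb_multi = sm.model_building(df_review, sm)
    #     pickle.dump(vect, open('pretrained_model/vect_model.sav', 'wb'))
    #     pickle.dump(tfidf, open('pretrained_model/tfidf_model.sav', 'wb'))
    #     pickle.dump(xgb_multi, open('pretrained_model/xgb_model.sav', 'wb'))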
    def predict(self, text):
        text_counts = self.vect.transform(text)
        text_tfidf = self.tfidf.transform(text_counts)
        sent_pred = self.xgb_multi.predict(text_tfidf)
        sent_score_pred = self.xgb_multi.predict_proba(text_tfidf)
        return (sent_pred, sent_score_pred)
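# Usage sketch (assumes the pretrained_model/*.sav files exist):
#
#     sm = SentimentModel()
#     labels, scores = sm.predict(['the food was amazing'])
#     # labels[0] is 'positive'/'neutral'/'negative'; scores[0] is a length-3
#     # probability vector in alphabetical class order, i.e. index 0 is
#     # 'negative' and index 2 is 'positive' (the indices used in business.py)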
\ No newline at end of file
# dependencies
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


class Sentence(object):
    def clean_symbols(self, reviews, replace=' '):
        """Strip symbols and digits from each review."""
        symbols = re.compile(r'[\s;"",.!?$\\/\[\]\{\}\(\)-]+')
        digits = re.compile(r'\d+')
        clean_symbols = []
        for review in reviews:
            review = symbols.sub(replace, review)
            review = digits.sub(replace, review)
            clean_symbols.append(review)
        return clean_symbols
    def get_wordnet_pos(self, word):
        """Map a word's POS tag to the matching wordnet constant."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J'):
            tag = wordnet.ADJ
        elif tag.startswith('V'):
            tag = wordnet.VERB
        elif tag.startswith('N'):
            tag = wordnet.NOUN
        elif tag.startswith('R'):
            tag = wordnet.ADV
        else:
            tag = None
        return tag
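    # e.g. get_wordnet_pos('food') returns wordnet.NOUN (the string 'n'),
    # which is what extract_aspect below filters on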
    def extract_aspect(self, clean_words):
        """Keep the nouns in each sentence and lemmatize them; only these are
        candidates for the top (most frequent) aspects."""
        aspect = []
        for i in clean_words:
            word_token = word_tokenize(i.lower())  # lower-case the words
            rmv_stopwords = [w.strip() for w in word_token if w not in stop_words]  # remove stop words
            # keep nouns that are longer than 3 characters and lemmatize them
            word_lem = [lemmatizer.lemmatize(w, self.get_wordnet_pos(w))
                        for w in rmv_stopwords
                        if self.get_wordnet_pos(w) == wordnet.NOUN and len(w) > 3]
            aspect.append(word_lem)
        return aspect
    def words_cnt(self, clean_words):
        # count the frequency of each word
        vocab_count = dict()
        for words in clean_words:
            for w in words:
                if w in vocab_count:
                    vocab_count[w] += 1
                else:
                    vocab_count[w] = 1
        return vocab_count
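# Usage sketch (hypothetical reviews; requires the nltk corpora punkt,
# averaged_perceptron_tagger, stopwords and wordnet to be downloaded):
#
#     sent = Sentence()
#     clean = sent.clean_symbols(['Great pizza, friendly staff!', 'The pizza was amazing.'])
#     nouns = sent.extract_aspect(clean)   # e.g. [['pizza', 'staff'], ['pizza']]
#     counts = sent.words_cnt(nouns)       # e.g. {'pizza': 2, 'staff': 1}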
\ No newline at end of file