Commit 0ec5485c by 20200913050

add files

parent e0fb400e
from model_training import SentimentModel
from sentence import Sentence


class Business(object):
    """
    Holds the variables and functions related to a business.
    """
    SENTIMENT_MODEL = SentimentModel()  # load the pretrained model that was saved to file

    def __init__(self, review_df):
        # keep the full review dataframe
        self.reviews = review_df

    def aspect_based_summary(self, biz_id, sentiment_model):
        """
        Return a summary for one business: for each aspect, compute its
        positive/negative sentiment and its top reviews.
        See the accompanying document for details.
        """
        review = self.reviews[self.reviews.business_id == biz_id]  # reviews of this business
        business_name = review.name.unique()[0]  # business name
        business_rating = review.stars.mean()  # overall business rating
        biz_info = dict()
        biz_info['business_id'] = biz_id
        biz_info['business_name'] = business_name
        biz_info['business_rating'] = business_rating
        # the top-5 aspects extracted from this business's reviews
        aspects = list(self.main_aspect_extract(biz_id, review)[biz_id].keys())
        for asp in aspects:
            # create an empty bucket for each of the 5 aspects
            biz_info[asp] = {'rating': [], 'pos': {}, 'neg': {}}
        for stars, rw in review[['stars', 'text']].values:
            rw = rw.lower()
            # assign the review to the first aspect it mentions, predict its
            # sentiment, and record the star rating for the corresponding aspect
            for asp in aspects:
                if asp in rw:
                    sent_pred, sent_score_pred = sentiment_model.predict([rw])
                    if sent_pred[0] == 'positive' and sent_score_pred[0][2] > 0.7:
                        biz_info[asp]['pos'][rw] = sent_score_pred[0][2]
                    elif sent_pred[0] == 'negative' and sent_score_pred[0][0] > 0.7:
                        biz_info[asp]['neg'][rw] = sent_score_pred[0][0]
                    # neutral or low-confidence reviews are not kept
                    biz_info[asp]['rating'].append(stars)
                    break
        for asp in aspects:
            # for every aspect keep the 5 reviews with the highest prediction score (from the sentiment model)
            biz_info[asp]['pos'] = dict(sorted(biz_info[asp]['pos'].items(), key=lambda item: item[1], reverse=True)[:5])
            biz_info[asp]['neg'] = dict(sorted(biz_info[asp]['neg'].items(), key=lambda item: item[1], reverse=True)[:5])
            ratings = biz_info[asp]['rating']
            biz_info[asp]['rating'] = round(sum(ratings) / len(ratings), 2) if ratings else 0
        return biz_info
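    # The returned dict has this shape (aspect names and values below are
    # hypothetical):
    #
    #     {'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
    #      'business_name': 'Some Restaurant',
    #      'business_rating': 4.1,
    #      'pizza': {'rating': 4.3,
    #                'pos': {'best pizza in town ...': 0.97, ...},
    #                'neg': {'the pizza was cold ...': 0.84, ...}},
    #      ...}  # one entry per extracted aspect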
    def main_aspect_extract(self, biz_id, review):
        """
        Extract the main aspects from one business's reviews.
        """
        bus_aspect = dict()
        sent = Sentence()
        clean_words = sent.clean_symbols(review.text.to_list())  # strip symbols and digits
        clean_words = sent.extract_aspect(clean_words)  # extract all candidate aspects (nouns)
        clean_words = sent.words_cnt(clean_words)  # count the frequency of each aspect
        aspects = dict(sorted(clean_words.items(), key=lambda item: item[1], reverse=True)[:5])  # pick the top five aspects
        bus_aspect[biz_id] = aspects
        return bus_aspect
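# main_aspect_extract returns a nested dict keyed by business id, e.g.
# (hypothetical counts):
#
#     {'ujmEBvifdJM6h6RLv4wQIg': {'pizza': 412, 'service': 305, 'place': 290,
#                                 'food': 277, 'staff': 194}}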
import pandas as pd
from model_training import SentimentModel
from business import Business


def processed_review():
    # dataset paths
    review_json_path = 'data/review.json'
    business_json_path = 'data/business.json'
    # load the datasets
    df_r = pd.read_json(review_json_path, lines=True)
    df_b = pd.read_json(business_json_path, lines=True)
    # count the reviews of each business
    review_cnt = df_r.groupby('business_id').agg({'review_id': pd.Series.nunique})
    review_cnt = review_cnt.reset_index()
    # keep only businesses with more than 100 reviews
    filter_review_cnt = review_cnt[review_cnt.review_id > 100]
    df_filter_review = df_r[df_r.business_id.isin(filter_review_cnt.business_id)]
    # attach the business name
    df_reviews = pd.merge(df_filter_review, df_b[['business_id', 'name']], on='business_id')
    return df_reviews
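# One-off caching step (a sketch, assuming write access to data/): persist the
# processed reviews so that main() below can read them back from CSV:
#
#     processed_review().to_csv('data/cleaned_reviews.csv', index=False)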
def get_review_summary_for_business(biz_id, reviews):
    # get the review summary of one business
    sm = SentimentModel()  # initialise the SentimentModel class
    business = Business(reviews)  # initialise the Business class
    aspect_sum = business.aspect_based_summary(biz_id, sm)  # get the aspect summary
    return aspect_sum
def main():
    # reviews = processed_review()  # uncomment this to process the raw review data
    df_reviews = pd.read_csv('data/cleaned_reviews.csv')  # output of processed_review()
    bus_ids = ['ujmEBvifdJM6h6RLv4wQIg', 'eU_713ec6fTGNO4BegRaww']  # a few hand-picked business ids
    for bus_id in bus_ids:
        print('\n')
        print("Working on biz_id %s" % bus_id)
        # start = time.time()
        summary = get_review_summary_for_business(bus_id, df_reviews)
        for i, (key, value) in enumerate(summary.items()):
            if i < 3:
                # business_id, business_name and business_rating
                print(key, ':', value)
            else:
                # the remaining entries are the extracted aspects
                print(f'Aspect {i - 2}')
                print(key, ':', value)


if __name__ == "__main__":
    main()
\ No newline at end of file
# This file contains the model training. Given a dataset, it trains the
# sentiment classifier and stores the model files in the model folder.
import pickle
import re

import numpy as np
import xgboost as xgb
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


class SentimentModel(object):
    def __init__(self):
        # load the pretrained models that were trained on 500000 reviews
        self.vect = pickle.load(open('pretrained_model/vect_model.sav', 'rb'))
        self.tfidf = pickle.load(open('pretrained_model/tfidf_model.sav', 'rb'))
        self.xgb_multi = pickle.load(open('pretrained_model/xgb_model.sav', 'rb'))

    def load_data(self, df):
        X = df.text.values
        y = df.sentiment.values
        return X, y
    def tokenize(self, text):
        symbols = re.compile(r'[\s;"",.!?$\\/\[\]\{\}\(\)-]+')
        digits = re.compile(r'\d+')
        clean_words = symbols.sub(' ', text)
        clean_words = digits.sub(' ', clean_words)
        tokens = word_tokenize(clean_words)
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for tok in tokens:
            clean_tok = lemmatizer.lemmatize(tok).lower().strip()
            clean_tokens.append(clean_tok)
        return clean_tokens
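    # e.g. tokenize("The 2 burgers were amazing!!") returns roughly
    # ['the', 'burger', 'were', 'amazing'] (exact lemmas depend on WordNet)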
    def display_results(self, y_test, y_pred):
        labels = np.unique(y_pred)
        confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
        accuracy = (y_pred == y_test).mean()
        print("Labels:", labels)
        print("Confusion Matrix:\n", confusion_mat)
        print("Accuracy:", accuracy)
    def model_building(self, df_review, sent):
        # label the sentiment of each review from its star rating
        df_review['sentiment'] = df_review.stars.apply(
            lambda x: 'positive' if x >= 4 else ('negative' if x <= 2 else 'neutral'))
        X = df_review.text.values
        y = df_review.sentiment.values
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        vect = CountVectorizer(tokenizer=sent.tokenize)
        tfidf = TfidfTransformer()
        # define the xgboost multi-class classifier
        # (the sklearn wrapper infers the number of classes from y)
        xgb_multi = xgb.XGBClassifier(objective='multi:softmax', n_jobs=-1,
                                      reg_alpha=0.9, reg_lambda=0.1,
                                      n_estimators=1000, learning_rate=0.01)
        # train the classifier
        X_train_counts = vect.fit_transform(X_train)
        X_train_tfidf = tfidf.fit_transform(X_train_counts)
        xgb_multi.fit(X_train_tfidf, y_train)
        # predict on the test data
        X_test_counts = vect.transform(X_test)
        X_test_tfidf = tfidf.transform(X_test_counts)
        y_pred = xgb_multi.predict(X_test_tfidf)
        # display the results
        sent.display_results(y_test, y_pred)
        return vect, tfidf, xgb_multi
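    # Persistence sketch (an assumed step; the pretrained_model/*.sav files
    # that __init__ loads are expected to have been produced like this):
    #
    #     vect, tfidf, xgb_multi = sm.model_building(df_review, sm)
    #     pickle.dump(vect, open('pretrained_model/vect_model.sav', 'wb'))
    #     pickle.dump(tfidf, open('pretrained_model/tfidf_model.sav', 'wb'))
    #     pickle.dump(xgb_multi, open('pretrained_model/xgb_model.sav', 'wb'))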
    def predict(self, text):
        text_counts = self.vect.transform(text)
        text_tfidf = self.tfidf.transform(text_counts)
        sent_pred = self.xgb_multi.predict(text_tfidf)
        sent_score_pred = self.xgb_multi.predict_proba(text_tfidf)
        return (sent_pred, sent_score_pred)
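# Usage sketch (assumes the pretrained_model/*.sav files exist):
#
#     sm = SentimentModel()
#     labels, scores = sm.predict(['the food was amazing'])
#     # labels[0] is 'positive'/'neutral'/'negative'; scores[0] is a length-3
#     # probability vector in alphabetical class order, i.e. index 0 is
#     # 'negative' and index 2 is 'positive' (the indices used in business.py)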
\ No newline at end of file
# dependencies
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


class Sentence(object):
    def clean_symbols(self, reviews, replace=' '):
        """Strip symbols and digits from each review."""
        symbols = re.compile(r'[\s;"",.!?$\\/\[\]\{\}\(\)-]+')
        digits = re.compile(r'\d+')
        clean_symbols = []
        for review in reviews:
            review = symbols.sub(replace, review)
            review = digits.sub(replace, review)
            clean_symbols.append(review)
        return clean_symbols
    def get_wordnet_pos(self, word):
        """Map a word's POS tag to the matching wordnet constant."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J'):
            tag = wordnet.ADJ
        elif tag.startswith('V'):
            tag = wordnet.VERB
        elif tag.startswith('N'):
            tag = wordnet.NOUN
        elif tag.startswith('R'):
            tag = wordnet.ADV
        else:
            tag = None
        return tag
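    # e.g. get_wordnet_pos('food') returns wordnet.NOUN (the string 'n'),
    # which is what extract_aspect below filters on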
    def extract_aspect(self, clean_words):
        """Keep the nouns in each sentence and lemmatize them; only these are
        candidates for the top (most frequent) aspects."""
        aspect = []
        for i in clean_words:
            word_token = word_tokenize(i.lower())  # lower-case the words
            rmv_stopwords = [w.strip() for w in word_token if w not in stop_words]  # remove stop words
            # keep nouns that are longer than 3 characters and lemmatize them
            word_lem = [lemmatizer.lemmatize(w, self.get_wordnet_pos(w))
                        for w in rmv_stopwords
                        if self.get_wordnet_pos(w) == wordnet.NOUN and len(w) > 3]
            aspect.append(word_lem)
        return aspect
    def words_cnt(self, clean_words):
        # count the frequency of each word
        vocab_count = dict()
        for words in clean_words:
            for w in words:
                if w in vocab_count:
                    vocab_count[w] += 1
                else:
                    vocab_count[w] = 1
        return vocab_count
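# Usage sketch (hypothetical reviews; requires the nltk corpora punkt,
# averaged_perceptron_tagger, stopwords and wordnet to be downloaded):
#
#     sent = Sentence()
#     clean = sent.clean_symbols(['Great pizza, friendly staff!', 'The pizza was amazing.'])
#     nouns = sent.extract_aspect(clean)   # e.g. [['pizza', 'staff'], ['pizza']]
#     counts = sent.words_cnt(nouns)       # e.g. {'pizza': 2, 'staff': 1}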
\ No newline at end of file