# This file contains the model training. Given a dataset, it trains a sentiment
# classification model and stores the model files in the `model` folder.

import xgboost as xgb
import pickle
from pickle import load 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import numpy as np
from sklearn.metrics import confusion_matrix

class SentimentModel(object):
    """Sentiment classifier for review text.

    Wraps a CountVectorizer -> TfidfTransformer -> XGBoost pipeline.
    Pretrained artifacts (trained on 500000 reviews) are loaded from
    ``pretrained_model/`` on construction; :meth:`model_building`
    retrains the pipeline from a labelled review DataFrame.
    """

    # Compiled once at class creation (the originals were rebuilt on every
    # tokenize() call and used non-raw strings, which warn on modern Python).
    _SYMBOLS = re.compile(r'[\s;\"\",.!?$\\/\[\]\{\}\(\)-]+')
    _DIGITS = re.compile(r'\d+')

    def __init__(self):
        # Load the pretrained vectorizer / tfidf transformer / classifier.
        # NOTE(security): pickle.load executes arbitrary code on load — only
        # unpickle model files from a trusted source.
        # `with` ensures the file handles are closed (the originals leaked).
        with open('pretrained_model/vect_model.sav', 'rb') as f:
            self.vect = load(f)
        with open('pretrained_model/tfidf_model.sav', 'rb') as f:
            self.tfidf = load(f)
        with open('pretrained_model/xgb_model.sav', 'rb') as f:
            self.xgb_multi = load(f)

    def load_data(self, df):
        """Extract feature texts and sentiment labels from *df*.

        :param df: DataFrame with ``text`` and ``sentiment`` columns.
        :returns: tuple ``(X, y)`` of numpy arrays — raw texts and labels.
        """
        X = df.text.values
        y = df.sentiment.values
        return X, y

    def tokenize(self, text):
        """Normalize *text* into lowercase, lemmatized word tokens.

        Punctuation and digits are replaced with spaces before NLTK word
        tokenization; each token is WordNet-lemmatized and lowercased.

        :param text: raw review string.
        :returns: list of cleaned token strings.
        """
        clean_words = self._SYMBOLS.sub(' ', text)
        clean_words = self._DIGITS.sub(' ', clean_words)

        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(tok).lower().strip()
                for tok in word_tokenize(clean_words)]

    def display_results(self, y_test, y_pred):
        """Print labels, confusion matrix and accuracy for a test run.

        :param y_test: array of true labels.
        :param y_pred: array of predicted labels.
        """
        # Use the union of true and predicted labels: the original took
        # np.unique(y_pred), which silently dropped any class that was
        # present in y_test but never predicted.
        labels = np.unique(np.concatenate([y_test, y_pred]))
        confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
        accuracy = (y_pred == y_test).mean()

        print("Labels:", labels)
        print("Confusion Matrix:\n", confusion_mat)
        print("Accuracy:", accuracy)

    def model_building(self, df_review, sent=None):
        """Train a vectorizer/tfidf/XGBoost pipeline on review data.

        :param df_review: DataFrame with ``text`` and ``stars`` columns;
            a ``sentiment`` column is derived in place from the star rating.
        :param sent: object providing ``tokenize``/``display_results``/
            ``load_data`` (defaults to ``self``; kept for backward
            compatibility with callers that pass one explicitly).
        :returns: tuple ``(vect, tfidf, xgb_multi)`` of the fitted models.
        """
        if sent is None:
            sent = self

        # Derive sentiment from the star rating: >=4 positive, <=2 negative,
        # 3 neutral.
        df_review['sentiment'] = df_review.stars.apply(
            lambda x: 'positive' if x >= 4 else ('negative' if x <= 2 else 'neutral'))
        # Reuse the existing helper instead of duplicating its logic inline.
        X, y = sent.load_data(df_review)
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        vect = CountVectorizer(tokenizer=sent.tokenize)
        tfidf = TfidfTransformer()

        # Define the xgboost multi-class classifier.
        # Fixes vs. the original parameters:
        #  - `num_classes=3` was not a valid keyword (the sklearn wrapper
        #    infers the number of classes from y, so it is simply dropped);
        #  - `silent=1` was removed from xgboost — `verbosity=0` keeps it quiet;
        #  - `early_stopping_rounds` without an `eval_set` raises on fit() in
        #    xgboost >= 1.6, so it is not set here.
        xgb_multi = xgb.XGBClassifier(objective='multi:softmax', verbosity=0,
                                      nthread=-1, n_jobs=-1,
                                      reg_alpha=0.9, reg_lambda=0.1,
                                      n_estimators=1000, learning_rate=0.01)

        # Train the classifier on the tfidf-weighted token counts.
        X_train_counts = vect.fit_transform(X_train)
        X_train_tfidf = tfidf.fit_transform(X_train_counts)
        xgb_multi.fit(X_train_tfidf, y_train)

        # Predict on held-out data and report the results.
        X_test_counts = vect.transform(X_test)
        X_test_tfidf = tfidf.transform(X_test_counts)
        y_pred = xgb_multi.predict(X_test_tfidf)

        sent.display_results(y_test, y_pred)

        return vect, tfidf, xgb_multi

    def predict(self, text):
        """Predict sentiment labels and class probabilities for texts.

        :param text: iterable of raw review strings.
        :returns: tuple ``(labels, probabilities)`` — predicted class per
            text and the per-class probability matrix.
        """
        text_counts = self.vect.transform(text)
        text_tfidf = self.tfidf.transform(text_counts)
        sent_pred = self.xgb_multi.predict(text_tfidf)
        sent_score_pred = self.xgb_multi.predict_proba(text_tfidf)
        return (sent_pred, sent_score_pred)