# This file contains the model training code. Given a dataset, it trains a
# sentiment classification model and stores the model files in the model folder.
# NOTE(review): nothing in this file writes model files to disk; the constructor
# only reads previously saved ones from pretrained_model/ -- confirm where the
# saving step lives.
import pickle
import re
from pickle import load

import numpy as np
import xgboost as xgb
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Compiled once at import time instead of on every tokenize() call. The raw
# strings spell out exactly the pattern text the original non-raw literals
# produced, so matching behaviour is unchanged.
_SYMBOLS_RE = re.compile(r'[\s;"",.!?$\/\[\]\{\}\(\)-]+')
_DIGITS_RE = re.compile(r'\d+')


class SentimentModel(object):
    """Sentiment classifier built from a count vectorizer, TF-IDF and XGBoost."""

    def __init__(self):
        """Load the three pickled pipeline stages from ``pretrained_model/``."""
        # load pretrained model that is trained using 500000 reviews
        self.vect = self._load_pickle('pretrained_model/vect_model.sav')
        self.tfidf = self._load_pickle('pretrained_model/tfidf_model.sav')
        self.xgb_multi = self._load_pickle('pretrained_model/xgb_model.sav')

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at *path*, closing the file.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as handle:
            return load(handle)

    def load_data(self, df):
        """Return ``(X, y)``: the ``text`` and ``sentiment`` column values of *df*."""
        X = df.text.values
        y = df.sentiment.values
        return X, y

    def tokenize(self, text):
        """Split *text* into lemmatized, lower-cased, stripped tokens.

        Runs of the listed punctuation/whitespace characters and runs of
        digits are each replaced by a single space before tokenizing.
        """
        clean_words = _SYMBOLS_RE.sub(' ', text)
        clean_words = _DIGITS_RE.sub(' ', clean_words)

        lemmatizer = WordNetLemmatizer()
        # Lemmatize first, then lower-case: this order is kept on purpose so
        # the produced tokens stay identical to the previous implementation.
        return [
            lemmatizer.lemmatize(tok).lower().strip()
            for tok in word_tokenize(clean_words)
        ]

    def display_results(self, y_test, y_pred):
        """Print the label set, confusion matrix and accuracy of a prediction.

        Args:
            y_test: true labels.
            y_pred: predicted labels, same length as *y_test*.
        """
        y_test = np.asarray(y_test)
        y_pred = np.asarray(y_pred)
        # Use the union of true and predicted labels: taking labels from the
        # predictions alone drops every true class the model never predicted
        # from the confusion matrix.
        labels = np.union1d(y_test, y_pred)
        confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
        accuracy = (y_pred == y_test).mean()

        print("Labels:", labels)
        print("Confusion Matrix:\n", confusion_mat)
        print("Accuracy:", accuracy)

    def model_building(self, df_review, sent):
        """Train the vectorizer, TF-IDF transformer and XGBoost classifier.

        Args:
            df_review: DataFrame with ``text`` and ``stars`` columns. A
                ``sentiment`` column is added to it in place (side effect
                kept for backward compatibility).
            sent: object providing ``tokenize`` and ``display_results``
                (presumably a ``SentimentModel`` instance -- verify callers).

        Returns:
            Tuple ``(vect, tfidf, xgb_multi)`` of the fitted objects.
        """
        # classify sentiment of each reviews by rating
        df_review['sentiment'] = df_review.stars.apply(
            lambda x: 'positive' if x >= 4 else ('negative' if x <= 2 else 'neutral'))

        X = df_review.text.values
        y = df_review.sentiment.values

        X_train, X_test, y_train, y_test = train_test_split(X, y)

        vect = CountVectorizer(tokenizer=sent.tokenize)
        tfidf = TfidfTransformer()

        ## Define xgboost multi-classifier(s)
        # NOTE(review): the documented XGBoost parameter is `num_class`, not
        # `num_classes`; `silent` has been superseded by `verbosity`; and
        # `early_stopping_rounds` is given without any eval_set in fit().
        # Behaviour of these arguments depends on the installed XGBoost
        # version, so they are left untouched here -- confirm and clean up.
        xgb_multi = xgb.XGBClassifier(objective ='multi:softmax',silent=1, nthread=-1, n_jobs=-1, num_classes=3,
                                      reg_alpha=0.9, reg_lambda=0.1, early_stopping_rounds=15,n_estimators=1000, learning_rate=0.01)

        # train classifier
        X_train_counts = vect.fit_transform(X_train)
        X_train_tfidf = tfidf.fit_transform(X_train_counts)
        xgb_multi.fit(X_train_tfidf, y_train)

        # predict on test data
        X_test_counts = vect.transform(X_test)
        X_test_tfidf = tfidf.transform(X_test_counts)
        y_pred = xgb_multi.predict(X_test_tfidf)

        # display results
        sent.display_results(y_test, y_pred)

        return vect, tfidf, xgb_multi

    def predict(self, text):
        """Predict sentiment for one or more documents.

        Args:
            text: an iterable of raw text documents, or a single ``str``
                (treated as one document).

        Returns:
            Tuple ``(sent_pred, sent_score_pred)``: the classifier's
            ``predict`` output and its ``predict_proba`` output.
        """
        # The vectorizer expects an iterable of documents and rejects a bare
        # string, so wrap a single document into a one-element list.
        if isinstance(text, str):
            text = [text]

        text_counts = self.vect.transform(text)
        text_tfidf = self.tfidf.transform(text_counts)
        sent_pred = self.xgb_multi.predict(text_tfidf)
        sent_score_pred = self.xgb_multi.predict_proba(text_tfidf)
        return (sent_pred, sent_score_pred)