# encoding: utf-8
# This file contains the model training code. Given a dataset, it trains a
# sentiment classification model and stores the model files in the `model`
# folder.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import json
import pandas as pd
from tqdm import tqdm


class SentimentModel(object):
    """Binary sentiment classifier: TF-IDF features + logistic regression.

    The instance can be used in two ways:

    * built from a review DataFrame and then fitted with :meth:`train`;
    * built empty and used for inference with :meth:`predict`, which loads a
      previously pickled classifier and vectorizer on first use.
    """

    def __init__(self, df_review=None, pos_star=4, neg_star=2):
        """Prepare the training corpus, or create an empty model.

        Args:
            df_review: Optional DataFrame with a ``stars`` column and a
                ``text`` column. When ``None`` no training data is prepared.
            pos_star: Reviews with ``stars >= pos_star`` are positive samples.
            neg_star: Reviews with ``stars <= neg_star`` are negative samples.
                Reviews strictly between the two thresholds are ignored.
        """
        # Always define these so predict()/train() can test them safely,
        # regardless of which construction path was taken.
        self.clf = None
        self.vectorizer = None
        self.X = None
        self.y = None
        self.pos_review = []
        self.neg_review = []

        if df_review is None:
            return

        # Iterate the two needed columns directly instead of materialising a
        # row Series per index with .iloc, which is far slower.
        rows = zip(df_review["stars"], df_review["text"])
        for stars, text in tqdm(rows, total=len(df_review)):
            if stars >= pos_star:
                self.pos_review.append(self._normalize(text))
            elif stars <= neg_star:
                self.neg_review.append(self._normalize(text))

        print("样本统计:\n正例: {}\n负例: {}".format(len(self.pos_review), len(self.neg_review)))

        self.vectorizer = TfidfVectorizer()
        corpus = self.pos_review + self.neg_review
        self.X = self.vectorizer.fit_transform(corpus)
        # Labels follow the corpus order: positives (1) first, negatives (0).
        self.y = [1] * len(self.pos_review) + [0] * len(self.neg_review)

    @staticmethod
    def _normalize(text):
        """Tokenize *text* with NLTK and re-join the tokens with spaces."""
        return " ".join(word_tokenize(text))

    def train(self, test_size=0.25, *args, **kwargs):
        """Fit the classifier on a train split and report held-out accuracy.

        Args:
            test_size: Fraction of the samples held out for scoring.
            **kwargs:
                save_model_path: Where to pickle the fitted classifier.
                    Skipped when missing or ``None``.
                save_vectorizer_path: Where to pickle the fitted vectorizer.
                    Skipped when missing or ``None``.

        Raises:
            Exception: If the instance was created without training data.
        """
        # .get() so that an omitted path means "do not save" rather than a
        # KeyError (or an unbound local once training has already finished).
        save_model_path = kwargs.get("save_model_path")
        save_vectorizer_path = kwargs.get("save_vectorizer_path")

        if self.X is None or self.y is None:
            raise Exception("training data missing")

        self.clf = LogisticRegression(random_state=0, C=10)
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=0
        )
        self.clf.fit(X_train, y_train)
        score = self.clf.score(X_test, y_test)

        if save_model_path is not None:
            with open(save_model_path, "wb") as f:
                pickle.dump(self.clf, f)
        if save_vectorizer_path is not None:
            with open(save_vectorizer_path, "wb") as f:
                pickle.dump(self.vectorizer, f)

        print("Finish training, score is {}.".format(str(score)))

    def predict(self, *args, **kwargs):
        """Classify an iterable of raw texts.

        Args:
            *args: The first positional argument is the iterable of texts.
            **kwargs:
                model_path: Pickled classifier to load when none is in memory.
                vectorizer_path: Pickled vectorizer to load when none is in
                    memory.

        Returns:
            A ``(result, score)`` tuple: ``result`` is the output of the
            classifier's ``predict`` and ``score`` the output of its
            ``predict_proba`` for the same inputs.

        Raises:
            Exception: If a classifier/vectorizer must be loaded but the
                corresponding path is missing or ``None``.
            IndexError: If no texts were passed.
        """
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # model files that come from a trusted source.
        if self.clf is None:
            model_path = kwargs.get("model_path")
            if model_path is None:
                raise Exception("model_path missing")
            with open(model_path, "rb") as f:
                self.clf = pickle.load(f)

        if self.vectorizer is None:
            vectorizer_path = kwargs.get("vectorizer_path")
            if vectorizer_path is None:
                raise Exception("vectorizer_path missing")
            with open(vectorizer_path, "rb") as f:
                self.vectorizer = pickle.load(f)

        if not args:
            raise IndexError("predict() needs the texts as its first positional argument")

        # Apply the same tokenize-and-join normalisation used for training.
        data = [self._normalize(d) for d in args[0]]
        features = self.vectorizer.transform(data)
        result = self.clf.predict(features)
        score = self.clf.predict_proba(features)
        return result, score


if __name__ == "__main__":
    review_path = "data/review.json"
    model_path = "model/model.pkl"
    vector_path = "model/vector.pkl"
    # Flip to True to retrain from review_path instead of running the demo.
    is_train = False
    if is_train:
        reviews = []
        # The review file holds one JSON object per line; blank lines are
        # skipped. Stream the file instead of loading every line at once.
        with open(review_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                reviews.append(json.loads(line))
        review_df = pd.DataFrame(reviews)
        print(review_df.head())
        model = SentimentModel(review_df)
        model.train(save_model_path=model_path, save_vectorizer_path=vector_path)
    else:
        model = SentimentModel()
        data = ["Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.", "kkkkkk"]
        result, score = model.predict(data, model_path=model_path, vectorizer_path=vector_path)
        # One line per input: predicted label, probability of that label, text.
        print("\n".join(["{}\t{}\t{}".format(str(label), str(s[label]), d) for label, s, d in zip(result, score, data)]))