model_training.py 4.35 KB
Newer Older
1
# encoding: utf-8
20200203113 committed
2
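# This file contains the model training code: given a dataset, it trains a sentiment classification model and stores the model files in the model folder.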
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import json
import pandas as pd
from tqdm import tqdm

class SentimentModel(object):
    """Binary sentiment classifier using TF-IDF features and logistic regression.

    Two usage modes:

    * Training: pass a review table to the constructor, then call ``train``.
    * Inference only: construct with no arguments and call ``predict`` with
      ``model_path`` / ``vectorizer_path`` pointing at previously pickled
      artifacts.
    """

    def __init__(self, df_review=None, pos_star=4, neg_star=2):
        """Collect labelled reviews and fit the TF-IDF vectorizer on them.

        Args:
            df_review: pandas DataFrame with ``stars`` and ``text`` columns,
                or ``None`` to create an untrained model whose classifier
                and vectorizer are loaded later by ``predict``.
            pos_star: Reviews with ``stars >= pos_star`` become positive
                examples (label 1).
            neg_star: Reviews with ``stars <= neg_star`` become negative
                examples (label 0). Reviews that match neither threshold
                are skipped.
        """
        # Define every attribute up front so train()/predict() can test for
        # None instead of failing with AttributeError.
        self.clf = None
        self.vectorizer = None
        self.X = None
        self.y = None
        self.pos_review = []
        self.neg_review = []
        if df_review is None:
            return

        # Iterate the two needed columns directly; per-row .iloc lookups
        # build a Series for every review and are far slower.
        rows = zip(df_review["stars"], df_review["text"])
        for stars, text in tqdm(rows, total=len(df_review)):
            if stars >= pos_star:
                self.pos_review.append(self._tokenize(text))
            elif stars <= neg_star:
                self.neg_review.append(self._tokenize(text))
        print("样本统计:\n正例: {}\n负例: {}".format(len(self.pos_review),len(self.neg_review)))
        self.vectorizer = TfidfVectorizer()
        corpus = self.pos_review + self.neg_review
        self.X = self.vectorizer.fit_transform(corpus)
        # Labels follow the corpus order: positives first, then negatives.
        self.y = [1] * len(self.pos_review) + [0] * len(self.neg_review)

    @staticmethod
    def _tokenize(text):
        """Return *text* tokenized by ``word_tokenize`` and re-joined with spaces."""
        return " ".join(word_tokenize(text))

    @staticmethod
    def _load_pickle(path, arg_name):
        """Unpickle and return the object stored at *path*.

        Raises:
            Exception: ``"<arg_name> missing"`` when *path* is ``None``.
        """
        if path is None:
            raise Exception("{} missing".format(arg_name))
        # SECURITY: pickle.load can execute arbitrary code. Only load model
        # files that come from a trusted source.
        with open(path, "rb") as f:
            return pickle.load(f)

    def train(self, test_size=0.25, *args, **kwargs):
        """Fit the classifier on a train split and report the held-out score.

        Args:
            test_size: Fraction of the samples held out for scoring.
            **kwargs:
                save_model_path: Where to pickle the fitted classifier.
                    Omitted or ``None`` skips saving.
                save_vectorizer_path: Where to pickle the fitted vectorizer.
                    Omitted or ``None`` skips saving.

        Raises:
            ValueError: If the model was constructed without ``df_review``,
                so there is nothing to train on.
        """
        if self.X is None or self.y is None:
            raise ValueError(
                "no training data: construct SentimentModel with df_review before calling train()"
            )
        # .get() so a missing keyword means "do not save" instead of KeyError.
        save_model_path = kwargs.get("save_model_path")
        save_vectorizer_path = kwargs.get("save_vectorizer_path")

        self.clf = LogisticRegression(random_state=0, C=10)
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=0
        )
        self.clf.fit(X_train, y_train)
        score = self.clf.score(X_test, y_test)

        if save_model_path is not None:
            with open(save_model_path, "wb") as f:
                pickle.dump(self.clf, f)
        if save_vectorizer_path is not None:
            with open(save_vectorizer_path, "wb") as f:
                pickle.dump(self.vectorizer, f)
        print("Finish training, score is {}.".format(str(score)))

    def predict(self, *args, **kwargs):
        """Classify a batch of raw texts.

        Args:
            *args: The first positional argument is an iterable of text
                strings to classify.
            **kwargs:
                model_path: Pickled classifier to load; only consulted when
                    no classifier is attached yet.
                vectorizer_path: Pickled vectorizer to load; only consulted
                    when no vectorizer is attached yet.

        Returns:
            A ``(result, score)`` tuple: the output of the classifier's
            ``predict`` and of its ``predict_proba`` for the given texts.

        Raises:
            Exception: ``"model_path missing"`` / ``"vectorizer_path missing"``
                when the corresponding object is not attached and no path
                was supplied.
        """
        if self.clf is None:
            self.clf = self._load_pickle(kwargs.get("model_path"), "model_path")
        if self.vectorizer is None:
            self.vectorizer = self._load_pickle(
                kwargs.get("vectorizer_path"), "vectorizer_path"
            )
        data = args[0]
        # Apply the same tokenize-and-rejoin step used on the training corpus.
        texts = [self._tokenize(d) for d in data]
        features = self.vectorizer.transform(texts)
        result = self.clf.predict(features)
        score = self.clf.predict_proba(features)
        return result, score
20200203113 committed
73

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
if __name__ == "__main__":
    # Locations of the raw review dump and of the pickled artifacts.
    review_path = "data/review.json"
    model_path = "model/model.pkl"
    vector_path = "model/vector.pkl"

    # Set to True to retrain from review_path; otherwise an empty model is
    # created and predict() is given the pickle paths below.
    is_train = False
    if not is_train:
        model = SentimentModel()
    else:
        # The review file holds one JSON object per line; blank lines are skipped.
        with open(review_path, "r", encoding="utf-8") as f:
            stripped_lines = (raw.strip() for raw in f)
            reviews = [json.loads(text) for text in stripped_lines if text]
        review_df = pd.DataFrame(reviews)
        print(review_df.head())
        model = SentimentModel(review_df)
        model.train(save_model_path=model_path, save_vectorizer_path=vector_path)

    # Smoke test: classify two sample texts and print one tab-separated line
    # per text.
    data = ["Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","kkkkkk"]
    result, score = model.predict(data, model_path=model_path, vectorizer_path=vector_path)
    report = []
    for label, probs, text in zip(result, score, data):
        # The probability row is indexed by the predicted label.
        report.append("{}\t{}\t{}".format(str(label), str(probs[label]), text))
    print("\n".join(report))