diff --git b/.gitignore a/.gitignore
new file mode 100644
index 0000000..07f43b8
--- /dev/null
+++ a/.gitignore
@@ -0,0 +1 @@
+data/*
\ No newline at end of file
diff --git b/business.py a/business.py
new file mode 100644
index 0000000..fe4c528
--- /dev/null
+++ a/business.py
@@ -0,0 +1,119 @@
+# encoding: utf-8
+
+from model_training import SentimentModel
+from gen_id2business import id2business
+from tqdm import tqdm
+from sentence import Sentence
+import json
+import pandas as pd
+
+model_path = "model/model.pkl"
+vector_path = "model/vector.pkl"
+SENTIMENT_MODEL = SentimentModel()  # the trained model is serialized to a file and loaded from there
+
+class Business(object):
+    """
+    Holds the variables and functions related to a single business.
+    """
+    def __init__(self, review_df):
+        # initialize member variables
+        self.review_df = review_df
+        self.business_id = self.review_df.iloc[0].business_id
+        self.business_name = id2business[self.business_id]["name"]
+        self.review_nps,self.aspects = self.extract_aspects()
+
+
+    def aspect_based_summary(self,threshold=0.5):
+        """
+        Return the summary of one business: for every aspect, compute its positive/negative
+        sentiment and the top reviews. See the provided document for details.
+        """
+        aspect_info = []
+        for aspect in self.aspects:
+            aspect_info.append({
+                "aspect" : aspect,
+                "stars" : [],
+                "pos" : [],
+                "neg" : []
+            })
+        for idx in tqdm(range(len(self.review_df))):
+            review = self.review_df.iloc[idx]
+            text = review.text
+            current_review_nps = self.review_nps[idx]
+            for idx_aspect in range(len(self.aspects)):
+                aspect = self.aspects[idx_aspect]
+                if aspect in current_review_nps:
+                    aspect_info[idx_aspect]["stars"].append(review.stars)
+                    data = [text]
+                    result,score = SENTIMENT_MODEL.predict(data,model_path=model_path,vectorizer_path=vector_path)
+                    print(result,score)
+                    if score[0][1] >= threshold:
+                        aspect_info[idx_aspect]["pos"].append((idx,score[0]))
+                    else:
+                        aspect_info[idx_aspect]["neg"].append((idx,score[0]))
+
+        business_rating = 0
+        detail = []
+        for idx_aspect in range(len(self.aspects)):
+            aspect = self.aspects[idx_aspect]
+            business_rating += sum(aspect_info[idx_aspect]["stars"])
+            step_pos = len(aspect_info[idx_aspect]["pos"]) // 100 if len(aspect_info[idx_aspect]["pos"]) > 100 else 1
+            step_neg = len(aspect_info[idx_aspect]["neg"]) // 100 if len(aspect_info[idx_aspect]["neg"]) > 100 else 1
+            info = {"aspect":aspect,
+                    "rating":sum(aspect_info[idx_aspect]["stars"])/len(aspect_info[idx_aspect]["stars"]),
+                    "pos":list(map(lambda y: self.review_df.iloc[y[0]].text,sorted(aspect_info[idx_aspect]["pos"],key=lambda x: x[1][1],reverse=True)[::step_pos][:5])),
+                    "neg":list(map(lambda y: self.review_df.iloc[y[0]].text,sorted(aspect_info[idx_aspect]["neg"],key=lambda x: x[1][1])[::step_neg][:5]))}
+            detail.append(info)
+
+        business_rating = business_rating/len(self.review_df)
+
+        return {'business_id': self.business_id,
+                'business_name': self.business_name,
+                'business_rating': business_rating,
+                'aspect_summary': detail
+                }
+
+
+    def extract_aspects(self):
+        """
+        Extract aspects from the reviews of one business.
+        """
+        np_dict = dict()
+        review_nps = []
+        for idx in tqdm(range(len(self.review_df))):
+            review = self.review_df.iloc[idx]
+            sen = Sentence(review.text)
+            nps = []
+            for np in sen.extract_noun_phrase():
+                print(np)
+                nps.append(np)
+                if np not in np_dict:
+                    np_dict[np] = 0
+                np_dict[np] += 1
+            review_nps.append(nps)
+        sort_np_dict_items_top_5 = sorted(np_dict.items(),key=lambda x: x[1],reverse=True)[:5]
+        aspects = [aspect for aspect,times in sort_np_dict_items_top_5]
+        return review_nps,aspects
+
+if __name__ == "__main__":
+    review_path = "data/review.json"
+    reviews = []
+    with open(review_path,"r",encoding="utf-8") as f:
+        for line in f.readlines():
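+            # each line of review.json is expected to hold one JSON-encoded review (business_id, stars, text)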
+            if len(line.strip()) == 0:
+                continue
+            ele = json.loads(line.strip())
+            reviews.append(ele)
+    review_df = pd.DataFrame(reviews)
+    print(len(review_df))
+    print(review_df.head())
+
+    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
+    for business_id in business_ids:
+        current_review_df = review_df[review_df.business_id==business_id]
+        print(current_review_df.head())
+        print(len(current_review_df))
+        business = Business(current_review_df)
+        print("Aspects",business.aspects)
+        print(business.aspect_based_summary())
\ No newline at end of file
diff --git b/gen_id2business.py a/gen_id2business.py
new file mode 100644
index 0000000..663a53b
--- /dev/null
+++ a/gen_id2business.py
@@ -0,0 +1,33 @@
+# encoding: utf-8
+import json
+import pickle
+import os
+
+business_path = "data/business.json"
+save_id2business_path = "model/id2business.pkl"
+
+def load_id2business():
+    id2business = None
+    if not os.path.exists(save_id2business_path):
+        id2business = dict()
+        with open(business_path,"r",encoding="utf-8") as f:
+            for line in f.readlines():
+                if len(line.strip()) == 0:
+                    continue
+                ele = json.loads(line.strip())
+                business_id = ele["business_id"]
+                if ele["business_id"] in id2business:
+                    print("{} duplicated".format(str(business_id)))
+                    continue
+                id2business[business_id] = ele
+
+        with open(save_id2business_path,'wb') as f:
+            pickle.dump(id2business,f)
+
+    else:
+        with open(save_id2business_path,'rb') as f:
+            id2business = pickle.load(f)
+    return id2business
+
+
+id2business = load_id2business()
\ No newline at end of file
diff --git b/gen_valid_business_id.py a/gen_valid_business_id.py
new file mode 100644
index 0000000..23d607d
--- /dev/null
+++ a/gen_valid_business_id.py
@@ -0,0 +1,30 @@
+#encoding: utf-8
+import pandas as pd
+import json
+from tqdm import tqdm
+
+review_path = "data/review.json"
+valid_business_id_path = "data/valid_business_id.txt"
+def gen_valid_business_id(review_path,count_criterion=100):
+    valid_business = []
+    count_business_id = dict()
+    reviews = []
+    with open(review_path,"r",encoding="utf-8") as f:
+        for line in tqdm(f.readlines()):
+            if len(line.strip()) == 0:
+                continue
+            ele = json.loads(line.strip())
+            if ele["business_id"] not in count_business_id:
+                count_business_id[ele["business_id"]] = 0
+            count_business_id[ele["business_id"]] += 1
+            reviews.append(ele)
+    review_df = pd.DataFrame(reviews)
+    print("total count of business id in {}: {}".format(review_path,len(count_business_id)))
+    for key,value in count_business_id.items():
+        if value >= count_criterion:
+            valid_business.append("{}\t{}".format(str(key),str(value)))
+    return valid_business
+
+valid_business = gen_valid_business_id(review_path,count_criterion=100)
+with open(valid_business_id_path,'w',encoding='utf-8') as f:
+    f.write("\n".join(valid_business))
diff --git b/main.py a/main.py
new file mode 100644
index 0000000..a15bf60
--- /dev/null
+++ a/main.py
@@ -0,0 +1,40 @@
+# encoding: utf-8
+import pandas as pd
+import json
+from business import Business
+import time
+
+def get_review_summary_for_business(biz_id,review_df):
+    # build the review summary for a single business
+    business = Business(review_df)
+    return business.aspect_based_summary()
+
+def main():
+    review_path = "data/review.json"
+    reviews = []
+    with open(review_path,"r",encoding="utf-8") as f:
+        for line in f.readlines():
+            if len(line.strip()) == 0:
+                continue
+            ele = json.loads(line.strip())
+            reviews.append(ele)
+    review_df = pd.DataFrame(reviews)
+    print(len(review_df))
+    print(review_df.head())
+
+    bus_ids = ["4JNXUYY8wbaaDmk3BPzlWw"] # specify a few business ids
+
+    for bus_id in bus_ids:
+        print("Working on biz_id %s" % bus_id)
+        start = time.time()
+
+        summary = get_review_summary_for_business(bus_id,review_df[review_df.business_id==bus_id])
+
+        # format and print....
+        print(summary)
+        print("--------------------------------------------------------")
+
+if __name__ == "__main__":
+    main()
+
+
diff --git b/model/README.md a/model/README.md
new file mode 100644
index 0000000..f640b12
--- /dev/null
+++ a/model/README.md
@@ -0,0 +1 @@
+This directory stores the trained models (the sentiment analysis model or any other model). Train the models in advance, serialize each one to a file, and load it directly at run time.
diff --git b/model/id2business.pkl a/model/id2business.pkl
new file mode 100644
index 0000000..77d2ad1
Binary files /dev/null and a/model/id2business.pkl differ
diff --git b/model/model.pkl a/model/model.pkl
new file mode 100644
index 0000000..bdbc4f9
Binary files /dev/null and a/model/model.pkl differ
diff --git b/model/vector.pkl a/model/vector.pkl
new file mode 100644
index 0000000..f67b569
Binary files /dev/null and a/model/vector.pkl differ
diff --git b/model_training.py a/model_training.py
new file mode 100644
index 0000000..359977b
--- /dev/null
+++ a/model_training.py
@@ -0,0 +1,95 @@
+# encoding: utf-8
+# This file contains the model training: given the dataset, train a sentiment classifier and store the model files in the model folder.
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.tokenize import word_tokenize
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+import pickle
+import json
+import pandas as pd
+from tqdm import tqdm
+
+class SentimentModel(object):
+    def __init__(self,df_review=None,pos_star=4,neg_star=2):
+        if df_review is not None:
+            self.pos_review = []
+            self.neg_review = []
+            for idx in tqdm(range(len(df_review))):
+                review = df_review.iloc[idx]
+                if review.stars >= pos_star:
+                    self.pos_review.append(" ".join([ word for word in word_tokenize(review.text)]))
+                elif review.stars <= neg_star:
+                    self.neg_review.append(" ".join([ word for word in word_tokenize(review.text)]))
+            print("Sample counts:\npositive: {}\nnegative: {}".format(len(self.pos_review),len(self.neg_review)))
+            # print("Sample examples:\npositive: {}\nnegative: {}".format("\n".join(self.pos_review[:2]),"\n".join(self.neg_review[:2])))
+            self.vectorizer = TfidfVectorizer()
+            corpus = self.pos_review + self.neg_review
+            self.X = self.vectorizer.fit_transform(corpus)
+            self.y = [1] * len(self.pos_review) + [0] * len(self.neg_review)
+        else:
+            self.clf = None
+            self.vectorizer = None
+
+
+    def train(self,test_size=0.25,*args,**kwargs):
+        if kwargs.get("save_model_path") is not None:
+            save_model_path = kwargs["save_model_path"]
+        if kwargs.get("save_vectorizer_path") is not None:
+            save_vectorizer_path = kwargs["save_vectorizer_path"]
+        self.clf = LogisticRegression(random_state=0,C=10)
+        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=0)
+        self.clf.fit(X_train,y_train)
+        score = self.clf.score(X_test,y_test)
+        with open(save_model_path,"wb") as f:
+            pickle.dump(self.clf,f)
+        with open(save_vectorizer_path,"wb") as f:
+            pickle.dump(self.vectorizer,f)
+        print("Finish training, score is {}.".format(str(score)))
+
+    def predict(self,*args,**kwargs):
+        if self.clf is None:
+            if kwargs.get("model_path") is not None:
+                model_path = kwargs["model_path"]
+                with open(model_path,'rb') as f:
+                    self.clf = pickle.load(f)
+            else:
+                raise Exception("model_path missing")
+        if self.vectorizer is None:
+            if kwargs.get("vectorizer_path") is not None:
+                vectorizer_path = kwargs["vectorizer_path"]
+                with open(vectorizer_path,'rb') as f:
+                    self.vectorizer = pickle.load(f)
+            else:
+                raise Exception("vectorizer_path missing")
+        data = args[0]
+        tmp = []
+        for d in data:
+            tmp.append(" ".join([word for word in word_tokenize(d)]))
+        data = tmp
+        features = self.vectorizer.transform(data)
+        result = self.clf.predict(features)
+        score = self.clf.predict_proba(features)
+        return result,score
+
+if __name__ == "__main__":
+    review_path = "data/review.json"
+    model_path = "model/model.pkl"
+    vector_path = "model/vector.pkl"
+    is_train = False
+    if is_train:
+        reviews = []
+        with open(review_path,"r",encoding="utf-8") as f:
+            for line in f.readlines():
+                if len(line.strip()) == 0:
+                    continue
+                ele = json.loads(line.strip())
+                reviews.append(ele)
+        review_df = pd.DataFrame(reviews)
+        print(review_df.head())
+        model = SentimentModel(review_df)
+        model.train(save_model_path=model_path,save_vectorizer_path=vector_path)
+    else:
+        model = SentimentModel()
+        data = ["Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","kkkkkk"]
+        result,score = model.predict(data,model_path=model_path,vectorizer_path=vector_path)
+        print("\n".join(["{}\t{}\t{}".format(str(label),str(s[label]),d) for label,s,d in zip(result,score,data)]))
diff --git b/requirements.txt a/requirements.txt
new file mode 100644
index 0000000..17e093a
--- /dev/null
+++ a/requirements.txt
@@ -0,0 +1 @@
+# dependency and version
diff --git b/sentence.py a/sentence.py
new file mode 100644
index 0000000..e97b385
--- /dev/null
+++ a/sentence.py
@@ -0,0 +1,80 @@
+# encoding: utf-8
+import nltk
+from nltk.corpus import stopwords
+stopwords = stopwords.words('english')
+# stopwords = []
+grammar = r"""
+    NBAR:
+        {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
+    NP:
+        {<NBAR>}
+        {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
+"""
+lemmatizer = nltk.WordNetLemmatizer()
+#stemmer = nltk.stem.porter.PorterStemmer()
+
+
+class Sentence(object):
+
+    # WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
+
+    # LEMMATIZER = WordNetLemmatizer()
+
+    # extract aspects from each sentence
+    # ASP_EXTRACTOR =
+
+    def __init__(self,sentence):
+        self.sentence = sentence
+
+
+    # def word_tokenize(self):
+    #     return
+
+    # def pos_tag(self):
+    #     return
+
+    # def lemmatize(self):
+    #     return
+
+    # def contain_aspect(self):
+    #     return
+
+    def extract_noun_phrase(self):
+        tree = self.execute()
+        nps = [np for np in self.get_terms(tree)]
+        return nps
+
+
+    def execute(self):
+        # Taken from Su Nam Kim Paper...
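+        # POS-tag the tokenized sentence and chunk it with the NP grammar defined at module level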
+        chunker = nltk.RegexpParser(grammar)
+        #toks = nltk.regexp_tokenize(text, sentence_re)
+        # #postoks = nltk.tag.pos_tag(toks)
+        toks = nltk.word_tokenize(self.sentence)
+        postoks = nltk.tag.pos_tag(toks)
+        tree = chunker.parse(postoks)
+        return tree
+
+    def leaves(self,tree):
+        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
+        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
+            yield subtree.leaves()
+
+    def normalise(self,word):
+        """Normalises a word to lowercase and lemmatizes it."""
+        word = word.lower()
+        # word = stemmer.stem(word)
+        word = lemmatizer.lemmatize(word)
+        return word
+
+    def acceptable_word(self,word):
+        """Checks conditions for acceptable word: length, stopword."""
+        accepted = bool(2 <= len(word) <= 40 and word.lower() not in stopwords)
+        return accepted
+
+    def get_terms(self,tree):
+        for leaf in self.leaves(tree):
+            term = " ".join([self.normalise(w) for w, t in leaf if self.acceptable_word(w)])
+            yield term
+
diff --git b/zhihu_link.txt a/zhihu_link.txt
new file mode 100644
index 0000000..b1081b3
--- /dev/null
+++ a/zhihu_link.txt
@@ -0,0 +1 @@
+# Please fill in the Zhihu link here
diff --git "b/\346\203\205\346\204\237\345\210\206\346\236\220\347\263\273\347\273\237\346\220\255\345\273\272\350\257\264\346\230\216.pdf" "a/\346\203\205\346\204\237\345\210\206\346\236\220\347\263\273\347\273\237\346\220\255\345\273\272\350\257\264\346\230\216.pdf"
new file mode 100644
index 0000000..69a3da1
Binary files /dev/null and "a/\346\203\205\346\204\237\345\210\206\346\236\220\347\263\273\347\273\237\346\220\255\345\273\272\350\257\264\346\230\216.pdf" differ