Commit 3beaf3bb by 20200203048

first commit

data/*
# encoding: utf-8
from model_training import SentimentModel
from gen_id2business import id2business
from tqdm import tqdm
from sentence import Sentence
import json
import pandas as pd
model_path = "model/model.pkl"
vector_path = "model/vector.pkl"
SENTIMENT_MODEL = SentimentModel()  # the trained model is serialized to disk and loaded here
class Business(object):
    """
    Holds the variables and functions related to a single business.
    """
    def __init__(self, review_df):
        # initialize member variables
        self.review_df = review_df
        self.business_id = self.review_df.iloc[0].business_id
        self.business_name = id2business[self.business_id]["name"]
        self.review_nps, self.aspects = self.extract_aspects()
    def aspect_based_summary(self, threshold=0.5):
        """
        Return a summary for this business: for each aspect, compute its
        positive/negative sentiment and the top reviews.
        See the accompanying documentation for details.
        """
        aspect_info = []
        for aspect in self.aspects:
            aspect_info.append({
                "aspect": aspect,
                "stars": [],
                "pos": [],
                "neg": []
            })
        for idx in tqdm(range(len(self.review_df))):
            review = self.review_df.iloc[idx]
            text = review.text
            current_review_nps = self.review_nps[idx]
            for idx_aspect in range(len(self.aspects)):
                aspect = self.aspects[idx_aspect]
                if aspect in current_review_nps:
                    aspect_info[idx_aspect]["stars"].append(review.stars)
                    data = [text]
                    result, score = SENTIMENT_MODEL.predict(data, model_path=model_path, vectorizer_path=vector_path)
                    # score[0][1] is the positive-class probability; store it for ranking later
                    if score[0][1] >= threshold:
                        aspect_info[idx_aspect]["pos"].append((idx, score[0][1]))
                    else:
                        aspect_info[idx_aspect]["neg"].append((idx, score[0][1]))
        business_rating = 0
        detail = []
        for idx_aspect in range(len(self.aspects)):
            aspect = self.aspects[idx_aspect]
            business_rating += sum(aspect_info[idx_aspect]["stars"])
            # subsample evenly when one side has more than 100 reviews
            step_pos = len(aspect_info[idx_aspect]["pos"]) // 100 if len(aspect_info[idx_aspect]["pos"]) > 100 else 1
            step_neg = len(aspect_info[idx_aspect]["neg"]) // 100 if len(aspect_info[idx_aspect]["neg"]) > 100 else 1
            info = {
                "aspect": aspect,
                "rating": sum(aspect_info[idx_aspect]["stars"]) / len(aspect_info[idx_aspect]["stars"]),
                # most confidently positive / most confidently negative reviews first
                "pos": [self.review_df.iloc[i].text for i, s in sorted(aspect_info[idx_aspect]["pos"], key=lambda x: x[1], reverse=True)[::step_pos][:5]],
                "neg": [self.review_df.iloc[i].text for i, s in sorted(aspect_info[idx_aspect]["neg"], key=lambda x: x[1])[::step_neg][:5]]
            }
            detail.append(info)
        business_rating = business_rating / len(self.review_df)
        return {'business_id': self.business_id,
                'business_name': self.business_name,
                'business_rating': business_rating,
                'aspect_summary': detail}
    def extract_aspects(self):
        """
        Extract aspects from the reviews of a business.
        """
        np_dict = dict()
        review_nps = []
        for idx in tqdm(range(len(self.review_df))):
            review = self.review_df.iloc[idx]
            sen = Sentence(review.text)
            nps = []
            for np in sen.extract_noun_phrase():
                nps.append(np)
                if np not in np_dict:
                    np_dict[np] = 0
                np_dict[np] += 1
            review_nps.append(nps)
        # sort descending by frequency and keep the five most common noun phrases
        sort_np_dict_items_top_5 = sorted(np_dict.items(), key=lambda x: x[1], reverse=True)[:5]
        aspects = [aspect for aspect, times in sort_np_dict_items_top_5]
        return review_nps, aspects
if __name__ == "__main__":
    review_path = "data/review.json"
    reviews = []
    with open(review_path, "r", encoding="utf-8") as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            ele = json.loads(line.strip())
            reviews.append(ele)
    review_df = pd.DataFrame(reviews)
    print(len(review_df))
    print(review_df.head())
    business_ids = ["ujmEBvifdJM6h6RLv4wQIg"]
    for business_id in business_ids:
        current_review_df = review_df[review_df.business_id == business_id]
        print(current_review_df.head())
        print(len(current_review_df))
        business = Business(current_review_df)
        print("Aspects", business.aspects)
        print(business.aspect_based_summary())
# encoding: utf-8
import json
import pickle
import os
business_path = "data/business.json"
save_id2business_path = "model/id2business.pkl"
def load_id2business():
    # Build a business_id -> business mapping, cached as a pickle.
    if not os.path.exists(save_id2business_path):
        id2business = dict()
        with open(business_path, "r", encoding="utf-8") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                ele = json.loads(line.strip())
                business_id = ele["business_id"]
                if business_id in id2business:
                    print("{} duplicated".format(str(business_id)))
                    continue
                id2business[business_id] = ele
        with open(save_id2business_path, 'wb') as f:
            pickle.dump(id2business, f)
    else:
        with open(save_id2business_path, 'rb') as f:
            id2business = pickle.load(f)
    return id2business

id2business = load_id2business()
# encoding: utf-8
import json
from tqdm import tqdm

review_path = "data/review.json"
valid_business_id_path = "data/valid_business_id.txt"

def gen_valid_business_id(review_path, count_criterion=100):
    # Keep only businesses with at least count_criterion reviews.
    valid_business = []
    count_business_id = dict()
    with open(review_path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            if len(line.strip()) == 0:
                continue
            ele = json.loads(line.strip())
            if ele["business_id"] not in count_business_id:
                count_business_id[ele["business_id"]] = 0
            count_business_id[ele["business_id"]] += 1
    print("total count of business ids in {}: {}".format(review_path, len(count_business_id)))
    for key, value in count_business_id.items():
        if value >= count_criterion:
            valid_business.append("{}\t{}".format(str(key), str(value)))
    return valid_business

valid_business = gen_valid_business_id(review_path, count_criterion=100)
with open(valid_business_id_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(valid_business))
# encoding: utf-8
import pandas as pd
import json
from business import Business
import time

def get_review_summary_for_business(biz_id, review_df):
    # Build the review summary for a single business.
    business = Business(review_df)
    return business.aspect_based_summary()

def main():
    review_path = "data/review.json"
    reviews = []
    with open(review_path, "r", encoding="utf-8") as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            ele = json.loads(line.strip())
            reviews.append(ele)
    review_df = pd.DataFrame(reviews)
    print(len(review_df))
    print(review_df.head())
    bus_ids = ["4JNXUYY8wbaaDmk3BPzlWw"]  # specify a few business ids
    for bus_id in bus_ids:
        print("Working on biz_id %s" % bus_id)
        start = time.time()
        summary = get_review_summary_for_business(bus_id, review_df[review_df.business_id == bus_id])
        # format and print....
        print(summary)
        print("took {:.1f}s".format(time.time() - start))
        print("--------------------------------------------------------")

if __name__ == "__main__":
    main()
This folder stores the trained models (the sentiment model and any others). Train the models ahead of time and serialize each one to a file, so it can be loaded and used directly at run time.
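A minimal sketch of that serialize-then-load workflow, assuming pickle is used (as elsewhere in this repo) and reusing the model/model.pkl and model/vector.pkl paths from the code above; the tiny training run exists only to have objects to serialize:

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hypothetical tiny training run, just to produce objects to serialize.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(["good food", "bad service"])
clf = LogisticRegression().fit(X, [1, 0])

# Serialize to the model/ folder ahead of time...
with open("model/model.pkl", "wb") as f:
    pickle.dump(clf, f)
with open("model/vector.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# ...then load back directly at run time.
with open("model/model.pkl", "rb") as f:
    clf = pickle.load(f)
with open("model/vector.pkl", "rb") as f:
    vectorizer = pickle.load(f)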
# encoding: utf-8
# This file contains the model training: given the dataset, train a sentiment
# classifier and store the model files in the model/ folder.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import json
import pandas as pd
from tqdm import tqdm
class SentimentModel(object):
    def __init__(self, df_review=None, pos_star=4, neg_star=2):
        if df_review is not None:
            # Label reviews by star rating: >= pos_star is positive, <= neg_star is negative.
            self.pos_review = []
            self.neg_review = []
            for idx in tqdm(range(len(df_review))):
                review = df_review.iloc[idx]
                if review.stars >= pos_star:
                    self.pos_review.append(" ".join(word_tokenize(review.text)))
                elif review.stars <= neg_star:
                    self.neg_review.append(" ".join(word_tokenize(review.text)))
            print("Sample counts:\npositive: {}\nnegative: {}".format(len(self.pos_review), len(self.neg_review)))
            self.vectorizer = TfidfVectorizer()
            corpus = self.pos_review + self.neg_review
            self.X = self.vectorizer.fit_transform(corpus)
            self.y = [1] * len(self.pos_review) + [0] * len(self.neg_review)
            self.clf = None
        else:
            self.clf = None
            self.vectorizer = None
    def train(self, test_size=0.25, *args, **kwargs):
        save_model_path = kwargs.get("save_model_path")
        save_vectorizer_path = kwargs.get("save_vectorizer_path")
        self.clf = LogisticRegression(random_state=0, C=10)
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=0)
        self.clf.fit(X_train, y_train)
        score = self.clf.score(X_test, y_test)
        if save_model_path is not None:
            with open(save_model_path, "wb") as f:
                pickle.dump(self.clf, f)
        if save_vectorizer_path is not None:
            with open(save_vectorizer_path, "wb") as f:
                pickle.dump(self.vectorizer, f)
        print("Finished training, test accuracy is {}.".format(str(score)))
    def predict(self, *args, **kwargs):
        # Lazily load the serialized classifier and vectorizer if needed.
        if self.clf is None:
            model_path = kwargs.get("model_path")
            if model_path is None:
                raise Exception("model_path missing")
            with open(model_path, 'rb') as f:
                self.clf = pickle.load(f)
        if self.vectorizer is None:
            vectorizer_path = kwargs.get("vectorizer_path")
            if vectorizer_path is None:
                raise Exception("vectorizer_path missing")
            with open(vectorizer_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
        data = [" ".join(word_tokenize(d)) for d in args[0]]
        features = self.vectorizer.transform(data)
        result = self.clf.predict(features)
        score = self.clf.predict_proba(features)
        return result, score
if __name__ == "__main__":
    review_path = "data/review.json"
    model_path = "model/model.pkl"
    vector_path = "model/vector.pkl"
    is_train = False
    if is_train:
        reviews = []
        with open(review_path, "r", encoding="utf-8") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                ele = json.loads(line.strip())
                reviews.append(ele)
        review_df = pd.DataFrame(reviews)
        print(review_df.head())
        model = SentimentModel(review_df)
        model.train(save_model_path=model_path, save_vectorizer_path=vector_path)
    else:
        model = SentimentModel()
        data = ["Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.",
                "kkkkkk"]
        result, score = model.predict(data, model_path=model_path, vectorizer_path=vector_path)
        print("\n".join(["{}\t{}\t{}".format(str(label), str(s[label]), d) for label, s, d in zip(result, score, data)]))
# dependency and version
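The requirements file is otherwise empty in this commit; the list below is inferred from the imports in the code above, with versions left unpinned since none are recorded here:
nltk
pandas
scikit-learn
tqdm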
# encoding: utf-8
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

# Chunk grammar: an NBAR is a run of nouns/adjectives terminated by a noun;
# an NP is a bare NBAR, or two NBARs connected by a preposition (in/of/etc.).
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # nouns and adjectives, terminated with nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # above, connected with in/of/etc...
"""
lemmatizer = nltk.WordNetLemmatizer()

class Sentence(object):
    # Extracts aspect candidates (noun phrases) from a single piece of text.

    def __init__(self, sentence):
        self.sentence = sentence

    def extract_noun_phrase(self):
        tree = self.execute()
        nps = [np for np in self.get_terms(tree)]
        return nps

    def execute(self):
        # Chunk grammar taken from the Su Nam Kim paper.
        chunker = nltk.RegexpParser(grammar)
        toks = nltk.word_tokenize(self.sentence)
        postoks = nltk.tag.pos_tag(toks)
        tree = chunker.parse(postoks)
        return tree

    def leaves(self, tree):
        """Finds NP (noun phrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def normalise(self, word):
        """Normalises a word to lowercase and lemmatizes it."""
        word = word.lower()
        word = lemmatizer.lemmatize(word)
        return word

    def acceptable_word(self, word):
        """Checks conditions for an acceptable word: length and stopword membership."""
        accepted = bool(2 <= len(word) <= 40 and word.lower() not in stopwords)
        return accepted

    def get_terms(self, tree):
        for leaf in self.leaves(tree):
            term = " ".join([self.normalise(w) for w, t in leaf if self.acceptable_word(w)])
            yield term
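A minimal usage sketch for Sentence (a hypothetical demo block, not part of the original file; the sample sentence and expected output are illustrative):

if __name__ == "__main__":
    # Hypothetical demo: extract noun-phrase aspects from one review sentence.
    sen = Sentence("The garlic bread was amazing but the service was slow.")
    print(sen.extract_noun_phrase())  # expected along the lines of ['garlic bread', 'service']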
# Please fill in the Zhihu link here