start_code.ipynb
In [177]:
import json
In [178]:
import pandas as pd
In [179]:
df_all = pd.read_json('review.json', encoding='utf-8', lines=True)
In [186]:
print(df_all.shape, df_all.columns)
Out [186]:
(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'], dtype='object')
In [181]:
print(df_all.head())
Out [181]:
                review_id                 user_id             business_id  \
0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA  ujmEBvifdJM6h6RLv4wQIg
1  GJXCdrto3ASJOqKeVWPi6Q  yXQM5uF2jS6es16SJzNHfg  NZnhc2sEQy3RmzKTZnqtwQ
2  2TzJjDVDEuAW6MR5Vuc1ug  n6-Gk65cPZL6Uz8qRm3NYw  WTqjgwHlXbSFevF32_DJVw
3  yi0R0Ugj_xUx_Nek0-_Qig  dacAIZ6fTM6mqwW5uxkskg  ikCg8xy5JIg_NGPx-MSIDA
4  11a8sVPMUFtaC7_ABRkmtw  ssoyf2_x0EQMed6fgHeMyQ  b1b1eb3uo-w561D0ZfCEiQ

   stars  useful  funny  cool  \
0      1       6      1     0
1      5       0      0     0
2      5       3      0     0
3      5       0      0     0
4      1       7      0     0

                                                text                 date
0  Total bill for this horrible service? Over $8G...  2013-05-07 04:34:36
1  I *adore* Travis at the Hard Rock's new Kelly ...  2017-01-14 21:30:33
2  I have to say that this office really has it t...  2016-11-09 20:09:03
3  Went in for a lunch. Steak sandwich was delici...  2018-01-09 20:56:38
4  Today was my second out of three sessions I ha...  2018-01-30 23:07:38
1. Group the reviews by business and aggregate aspect information
In [182]:
# Class used to extract noun-phrase aspects from a review text
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}   # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}   # Above, connected with in/of/etc...
"""

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()


class NounPhraseExtractor(object):

    def __init__(self, sentence):
        self.sentence = sentence

    def execute(self):
        # Taken from Su Nam Kim Paper...
        chunker = nltk.RegexpParser(grammar)
        toks = nltk.word_tokenize(self.sentence)
        postoks = nltk.tag.pos_tag(toks)
        tree = chunker.parse(postoks)
        return tree

    def leaves(self, tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def normalise(self, word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def acceptable_word(self, word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40
                        and word.lower() not in stopwords)
        return accepted

    def get_terms(self, tree):
        for leaf in self.leaves(tree):
            term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]
            yield term

    def extract(self):
        terms = self.get_terms(self.execute())
        matches = []
        for term in terms:
            for word in term:
                matches.append(word)
        return matches
In [187]:
# example
document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'
extract = NounPhraseExtractor(document)
extract.extract()
Out [187]:
['bimorph', 'forc', 'sensor']
In [184]:
group_bus_id = df_all.groupby(['business_id'])
In [185]:
import time
from collections import defaultdict
import json

f = open('result_dict.txt', 'a+')
g_ct = 0
for business_id, df in group_bus_id:  # iterate over each business group
    start = time.time()
    business_info = defaultdict(dict)
    g_ct += 1
    if g_ct <= 192606:  # resume point: skip groups that were already processed
        continue
    length = len(df)
    print(g_ct, length)
    if length < 100:  # only keep businesses with at least 100 reviews
        continue
    scores = []
    aspect_ct = defaultdict(int)
    # inverted index within this group: aspect -> list of (review_id, text)
    aspect_reverse_index = defaultdict(list)
    for _, row in df.iterrows():
        try:
            stars = row['stars']
            review_id = row['review_id']
            text = row['text']
            scores.append(stars)
            # extract the aspects (noun phrases) mentioned in this review
            extract_aspects = NounPhraseExtractor(text).extract()
            for extract_aspect in extract_aspects:
                aspect_reverse_index[extract_aspect].append((review_id, text))
                aspect_ct[extract_aspect] += 1
        except Exception as e:
            print(e)
    # keep the 5 most frequent aspects for this business
    sorted_aspect_ct = sorted(aspect_ct.items(), key=lambda x: x[1], reverse=True)[:5]
    business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]
    for k, _ in sorted_aspect_ct:
        business_info[business_id][k] = aspect_reverse_index[k]
    business_info[business_id]['scores'] = scores
    f.write('{}\n'.format(json.dumps(business_info)))
    print(g_ct, length, time.time() - start)
f.close()
2. Train the model
df_train = df_all[['text', 'stars']]
In [192]:
df_train = pd.read_csv('dataset.csv', sep='\t', nrows=3000)
In [193]:
text_list = df_train['text'].tolist()
In [199]:
# todo: split into training and test sets
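A minimal sketch of how the split could be done, assuming scikit-learn's train_test_split; the variable names below (train_texts, test_texts, ...) are illustrative and not part of the original notebook:

from sklearn.model_selection import train_test_split

# hold out 20% of the reviews for evaluation; stratifying on the star rating
# keeps roughly the same rating distribution in both splits
train_texts, test_texts, train_stars, test_stars = train_test_split(
    df_train['text'].tolist(), df_train['stars'].tolist(),
    test_size=0.2, random_state=42, stratify=df_train['stars'])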
In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform(text_list)
Out [194]:
<3000x14900 sparse matrix of type '<class 'numpy.float64'>' with 213295 stored elements in Compressed Sparse Row format>
In [196]:
train_X = tfidf_vectorizer.transform(text_list).toarray()
train_y = df_train['stars'].tolist()
In [198]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf = clf.fit(train_X, train_y)
y_pred = clf.predict(train_X)
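As a quick sanity check, the classifier could be scored on its own training data (an optimistic estimate; the held-out test set from the todo above gives a more honest number). This sketch uses scikit-learn's accuracy_score:

from sklearn.metrics import accuracy_score

# training-set accuracy only; expect this to overestimate real performance
print(accuracy_score(train_y, y_pred))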
3. Read the extracted information and predict results
# todo: how to determine whether a review contains a given aspect
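One possible answer, sketched here as an assumption rather than the notebook's own method: re-run the same NounPhraseExtractor on the review and test membership, since the aspects stored in result_dict.txt are already in the normalised (stemmed/lemmatised) form it produces. The helper name contains_aspect is illustrative:

def contains_aspect(text, aspect):
    # re-extract noun phrases with the same pipeline used to build result_dict.txt,
    # then check whether the given (normalised) aspect is among them
    return aspect in NounPhraseExtractor(text).extract()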
In [169]:
f = open('result_dict.txt', 'r')
<img src="./line.png" width = "300" height = "200" align=center />
Each line of result_dict.txt is a dict keyed by business_id; the value holds the 5 aspects and, for each aspect, its corresponding (review_id, text) pairs.
In [201]:
for line in f:
    info = json.loads(line)
    # todo: parse
f.close()
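A minimal sketch of the parsing/prediction step, reusing the tfidf_vectorizer and clf trained in section 2; the structure of each line follows the writer loop in section 1 (an 'aspects' list plus per-aspect (review_id, text) pairs), and the loop below is an assumption about how it might be used rather than the notebook's own implementation:

f = open('result_dict.txt', 'r')
for line in f:
    info = json.loads(line)
    for business_id, detail in info.items():
        for aspect in detail['aspects']:
            # reviews that mention this aspect, stored as (review_id, text) pairs
            texts = [text for _, text in detail[aspect]]
            # score each review with the trained star-rating classifier
            X = tfidf_vectorizer.transform(texts).toarray()
            preds = clf.predict(X)
            print(business_id, aspect, preds.mean())
f.close()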