start_code.ipynb 12.8 KB
In [177]:
import json
In [178]:
import pandas as pd
In [179]:
df_all = pd.read_json('review.json', encoding='utf-8', lines=True)
In [186]:
print(df_all.shape, df_all.columns)
Out [186]:
(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
In [181]:
print(df_all.head())
Out [181]:
                review_id                 user_id             business_id  \
0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA  ujmEBvifdJM6h6RLv4wQIg   
1  GJXCdrto3ASJOqKeVWPi6Q  yXQM5uF2jS6es16SJzNHfg  NZnhc2sEQy3RmzKTZnqtwQ   
2  2TzJjDVDEuAW6MR5Vuc1ug  n6-Gk65cPZL6Uz8qRm3NYw  WTqjgwHlXbSFevF32_DJVw   
3  yi0R0Ugj_xUx_Nek0-_Qig  dacAIZ6fTM6mqwW5uxkskg  ikCg8xy5JIg_NGPx-MSIDA   
4  11a8sVPMUFtaC7_ABRkmtw  ssoyf2_x0EQMed6fgHeMyQ  b1b1eb3uo-w561D0ZfCEiQ   

   stars  useful  funny  cool  \
0      1       6      1     0   
1      5       0      0     0   
2      5       3      0     0   
3      5       0      0     0   
4      1       7      0     0   

                                                text                date  
0  Total bill for this horrible service? Over $8G... 2013-05-07 04:34:36  
1  I *adore* Travis at the Hard Rock's new Kelly ... 2017-01-14 21:30:33  
2  I have to say that this office really has it t... 2016-11-09 20:09:03  
3  Went in for a lunch. Steak sandwich was delici... 2018-01-09 20:56:38  
4  Today was my second out of three sessions I ha... 2018-01-30 23:07:38  

1. 分组统合信息

In [182]:
# 用于提取信息的类

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
grammar = r"""
 NBAR:
    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
 NP:
    {<NBAR>}
    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

class NounPhraseExtractor(object):

    def __init__(self, sentence):
        self.sentence = sentence

    def execute(self):
        # Taken from Su Nam Kim Paper...
        chunker = nltk.RegexpParser(grammar)
        #toks = nltk.regexp_tokenize(text, sentence_re)
        # #postoks = nltk.tag.pos_tag(toks)
        toks = nltk.word_tokenize(self.sentence)
        postoks = nltk.tag.pos_tag(toks)
        tree = chunker.parse(postoks)
        return tree

    def leaves(self, tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def normalise(self, word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def acceptable_word(self, word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40
                    and word.lower() not in stopwords)
        return accepted

    def get_terms(self,tree):
        for leaf in self.leaves(tree):
            term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]
        yield term

    def extract(self):
        terms = self.get_terms(self.execute())
        matches = []
        for term in terms:
            for word in term:
                matches.append(word)
        return matches
In [187]:
# example
document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'
extract = NounPhraseExtractor(document)
extract.extract()
Out [187]:
['bimorph', 'forc', 'sensor']
In [184]:
group_bus_id = df_all.groupby(['business_id'])
In [185]:
import time
from collections import defaultdict
import json

f = open('result_dict.txt', 'a+')


g_ct = 0

for business_id, df in group_bus_id:  # 遍历每个组
    start = time.time()
    business_info = defaultdict(dict)
    
    g_ct += 1
    if g_ct <= 192606:  # 控制读取个数
        continue
    
    print(g_ct, length)
    length = len(df)
    if length < 100:
        continue
    
    scores = []
    aspect_ct = defaultdict(int)
    aspect_reverse_index = defaultdict(list)  # 组内每个aspect对于(text, business_id的倒排表)
    
    for _, row in df.iterrows():
#         print('_'*32)
        
        
        try:
            starts = row['stars']
            review_id = row['review_id']
            text = row['text']
            
#             print(starts, review_id, text)
            scores.append(starts)
            extract_aspects = NounPhraseExtractor(text).extract()  # 对于每个评论生成aspect
            for extract_aspect in extract_aspects:
                aspect_reverse_index[extract_aspect].append((review_id, text))
                aspect_ct[extract_aspect] += 1
        except Exception as e:
            print(e)
            pass
    
    sorted_aspect_ct = sorted(aspect_ct.items(), key = lambda x: x[1], reverse=True)[:5]
    business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]
    for k, _ in sorted_aspect_ct:
        business_info[business_id][k] = aspect_reverse_index[k]
    business_info[business_id]['scores'] = scores
            
#     print(business_info)
    f.write('{}\n'.format(json.dumps(business_info)))
    print(g_ct, length, time.time() - start)
    
f.close()

2. 训练模型

df_train = df_all[['text', 'stars']]
In [192]:
df_train = pd.read_csv('dataset.csv', sep='\t', nrows=3000)
In [193]:
text_list = df_train['text'].tolist()
In [199]:
# todo: 划分训练, 测试集

In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform(text_list)
Out [194]:
<3000x14900 sparse matrix of type '<class 'numpy.float64'>'
	with 213295 stored elements in Compressed Sparse Row format>
In [196]:
train_X = tfidf_vectorizer.transform(text_list).toarray()
train_y = df_train['stars'].tolist()
In [198]:
from sklearn import datasets
iris = datasets.load_iris()

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf = clf.fit(train_X, train_y)
y_pred=clf.predict(train_X)

3. 读取信息, 预测结果

# todo: 何判断一个评语有没有包含指定的 aspect
In [169]:
f = open('result_dict.txt', 'r')

<img src="./line.png" width = "300" height = "200" align=center />

result_dict.txt中每一行为一个字典, business_id为key, value为5个aspects以及每个aspects对应的(review_id, text)
In [201]:
for line in f:
    info = json.loads(line)
    # todo: 解析
    
f.close()