{ "cells": [ { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [], "source": [ "df_all = pd.read_json('review.json', encoding='utf-8', lines=True)" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',\n", " 'cool', 'text', 'date'],\n", " dtype='object')\n" ] } ], "source": [ "print(df_all.shape, df_all.columns)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " review_id user_id business_id \\\n", "0 Q1sbwvVQXV2734tPgoKj4Q hG7b0MtEbXx5QzbzE6C_VA ujmEBvifdJM6h6RLv4wQIg \n", "1 GJXCdrto3ASJOqKeVWPi6Q yXQM5uF2jS6es16SJzNHfg NZnhc2sEQy3RmzKTZnqtwQ \n", "2 2TzJjDVDEuAW6MR5Vuc1ug n6-Gk65cPZL6Uz8qRm3NYw WTqjgwHlXbSFevF32_DJVw \n", "3 yi0R0Ugj_xUx_Nek0-_Qig dacAIZ6fTM6mqwW5uxkskg ikCg8xy5JIg_NGPx-MSIDA \n", "4 11a8sVPMUFtaC7_ABRkmtw ssoyf2_x0EQMed6fgHeMyQ b1b1eb3uo-w561D0ZfCEiQ \n", "\n", " stars useful funny cool \\\n", "0 1 6 1 0 \n", "1 5 0 0 0 \n", "2 5 3 0 0 \n", "3 5 0 0 0 \n", "4 1 7 0 0 \n", "\n", " text date \n", "0 Total bill for this horrible service? Over $8G... 2013-05-07 04:34:36 \n", "1 I *adore* Travis at the Hard Rock's new Kelly ... 2017-01-14 21:30:33 \n", "2 I have to say that this office really has it t... 2016-11-09 20:09:03 \n", "3 Went in for a lunch. Steak sandwich was delici... 2018-01-09 20:56:38 \n", "4 Today was my second out of three sessions I ha... 2018-01-30 23:07:38 \n" ] } ], "source": [ "print(df_all.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 1. 
Group and Consolidate Information" ] },
{ "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [], "source": [ "# Class for extracting noun-phrase aspects from review text\n", "\n", "import nltk\n", "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words('english'))\n", "grammar = r\"\"\"\n", "    NBAR:\n", "        {<NN.*|JJ>*<NN.*>}  # nouns and adjectives, terminated with nouns\n", "    NP:\n", "        {<NBAR>}\n", "        {<NBAR><IN><NBAR>}  # NBARs connected with in/of/etc.\n", "\"\"\"\n", "lemmatizer = nltk.WordNetLemmatizer()\n", "stemmer = nltk.stem.porter.PorterStemmer()\n", "\n", "class NounPhraseExtractor(object):\n", "\n", "    def __init__(self, sentence):\n", "        self.sentence = sentence\n", "\n", "    def execute(self):\n", "        # Chunk grammar taken from the Su Nam Kim paper\n", "        chunker = nltk.RegexpParser(grammar)\n", "        toks = nltk.word_tokenize(self.sentence)\n", "        postoks = nltk.tag.pos_tag(toks)\n", "        tree = chunker.parse(postoks)\n", "        return tree\n", "\n", "    def leaves(self, tree):\n", "        \"\"\"Finds NP (noun phrase) leaf nodes of a chunk tree.\"\"\"\n", "        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):\n", "            yield subtree.leaves()\n", "\n", "    def normalise(self, word):\n", "        \"\"\"Normalises a word: lowercases, stems, and lemmatizes it.\"\"\"\n", "        word = word.lower()\n", "        word = stemmer.stem(word)\n", "        word = lemmatizer.lemmatize(word)\n", "        return word\n", "\n", "    def acceptable_word(self, word):\n", "        \"\"\"Checks conditions for an acceptable word: length, stopword.\"\"\"\n", "        accepted = bool(2 <= len(word) <= 40\n", "                        and word.lower() not in stop_words)\n", "        return accepted\n", "\n", "    def get_terms(self, tree):\n", "        for leaf in self.leaves(tree):\n", "            term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]\n", "            yield term\n", "\n", "    def extract(self):\n", "        terms = self.get_terms(self.execute())\n", "        matches = []\n", "        for term in terms:\n", "            for word in term:\n", "                matches.append(word)\n", "        return matches\n" ] },
{ "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bimorph', 'forc', 'sensor']" ] }, "execution_count": 187, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example\n", "document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'\n", "extract = NounPhraseExtractor(document)\n", "extract.extract()" ] },
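{ "cell_type": "markdown", "metadata": {}, "source": [ "The extractor above assumes a few NLTK resources are already installed. A minimal sketch of the one-time downloads it relies on (tokenizer, POS tagger, stopword list, and WordNet for the lemmatizer):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# One-time NLTK resource downloads used by NounPhraseExtractor\n", "import nltk\n", "for resource in ['punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet']:\n", "    nltk.download(resource)" ] },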
{ "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "group_bus_id = df_all.groupby(['business_id'])" ] },
{ "cell_type": "code", "execution_count": 185, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import time\n", "from collections import defaultdict\n", "import json\n", "\n", "f = open('result_dict.txt', 'a+')\n", "\n", "g_ct = 0\n", "\n", "for business_id, df in group_bus_id:  # iterate over each business group\n", "    start = time.time()\n", "    business_info = defaultdict(dict)\n", "\n", "    g_ct += 1\n", "    if g_ct <= 192606:  # skip groups already processed, so an interrupted run can resume\n", "        continue\n", "\n", "    length = len(df)\n", "    if length < 100:  # ignore businesses with fewer than 100 reviews\n", "        continue\n", "\n", "    scores = []\n", "    aspect_ct = defaultdict(int)\n", "    aspect_reverse_index = defaultdict(list)  # per-group inverted index: aspect -> list of (review_id, text)\n", "\n", "    for _, row in df.iterrows():\n", "        try:\n", "            stars = row['stars']\n", "            review_id = row['review_id']\n", "            text = row['text']\n", "\n", "            scores.append(stars)\n", "            extract_aspects = NounPhraseExtractor(text).extract()  # extract aspects from each review\n", "            for extract_aspect in extract_aspects:\n", "                aspect_reverse_index[extract_aspect].append((review_id, text))\n", "                aspect_ct[extract_aspect] += 1\n", "        except Exception as e:\n", "            print(e)\n", "\n", "    # keep the 5 most frequent aspects for this business\n", "    sorted_aspect_ct = sorted(aspect_ct.items(), key=lambda x: x[1], reverse=True)[:5]\n", "    business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]\n", "    for k, _ in sorted_aspect_ct:\n", "        business_info[business_id][k] = aspect_reverse_index[k]\n", "    business_info[business_id]['scores'] = scores\n", "\n", "    f.write('{}\\n'.format(json.dumps(business_info)))\n", "    print(g_ct, length, time.time() - start)\n", "\n", "f.close()" ] },
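{ "cell_type": "markdown", "metadata": {}, "source": [ "To sanity-check the extraction without a full pass over every group, a minimal sketch that counts aspects for just the first group (assumes `group_bus_id` and `NounPhraseExtractor` from the cells above; the 20-review cap is an arbitrary choice to keep it fast):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Count aspects for the first business group only, as a quick smoke test\n", "from collections import defaultdict\n", "\n", "business_id, df = next(iter(group_bus_id))\n", "aspect_ct = defaultdict(int)\n", "for text in df['text'].head(20):  # cap the number of reviews processed\n", "    for aspect in NounPhraseExtractor(text).extract():\n", "        aspect_ct[aspect] += 1\n", "print(business_id, sorted(aspect_ct.items(), key=lambda x: x[1], reverse=True)[:5])" ] },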
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Train the Model" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# use the review text and star rating as training data\n", "df_train = df_all[['text', 'stars']]" ] },
{ "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "# alternatively, load a prepared sample (this overrides the previous cell)\n", "df_train = pd.read_csv('dataset.csv', sep='\\t', nrows=3000)" ] },
{ "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [], "source": [ "text_list = df_train['text'].tolist()" ] },
{ "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [], "source": [ "# todo: split into training and test sets\n", "# e.g. with sklearn.model_selection.train_test_split\n" ] },
{ "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<3000x14900 sparse matrix of type '<class 'numpy.float64'>'\n", "\twith 213295 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 194, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tfidf_vectorizer = TfidfVectorizer()\n", "tfidf_vectorizer.fit_transform(text_list)" ] },
{ "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [], "source": [ "# GaussianNB needs dense input, hence .toarray()\n", "train_X = tfidf_vectorizer.transform(text_list).toarray()\n", "train_y = df_train['stars'].tolist()" ] },
{ "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [], "source": [ "from sklearn.naive_bayes import GaussianNB\n", "clf = GaussianNB()\n", "clf = clf.fit(train_X, train_y)\n", "y_pred = clf.predict(train_X)  # predictions on the training set itself\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Read Information and Predict Results" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# todo: how to decide whether a review mentions a given aspect" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "<img src=\"./line.png\" width=\"300\" height=\"200\" align=\"center\" />" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "##### Each line of result_dict.txt is a dict keyed by business_id; its value holds the top 5 aspects and, for each aspect, the matching (review_id, text) pairs" ] },
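{ "cell_type": "markdown", "metadata": {}, "source": [ "One hedged sketch for the two todos in this section: treat a review as mentioning an aspect when the aspect term occurs in the lowercased text (a deliberately simple assumption; matching against `NounPhraseExtractor` output would be stricter), and reuse `tfidf_vectorizer` and `clf` from section 2 for star prediction:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: aspect containment check plus star prediction for a batch of reviews.\n", "# contains_aspect uses a plain substring test (an assumption, not the only option);\n", "# predict_stars reuses the vectorizer and classifier trained in section 2.\n", "def contains_aspect(text, aspect):\n", "    return aspect.lower() in text.lower()\n", "\n", "def predict_stars(texts):\n", "    return clf.predict(tfidf_vectorizer.transform(texts).toarray())" ] },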
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read the per-business aspect file back, one JSON object per line;\n", "# the context manager avoids the closed-file errors of a separate open()/close()\n", "with open('result_dict.txt', 'r') as f:\n", "    for line in f:\n", "        info = json.loads(line)\n", "        # todo: parse the aspects, reviews, and scores in info\n" ] } ], "metadata": { "kernelspec": { "display_name": "py37", "language": "python", "name": "py37" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }