{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all = pd.read_json('review.json', encoding='utf-8', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',\n",
      "       'cool', 'text', 'date'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "print(df_all.shape, df_all.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                review_id                 user_id             business_id  \\\n",
      "0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA  ujmEBvifdJM6h6RLv4wQIg   \n",
      "1  GJXCdrto3ASJOqKeVWPi6Q  yXQM5uF2jS6es16SJzNHfg  NZnhc2sEQy3RmzKTZnqtwQ   \n",
      "2  2TzJjDVDEuAW6MR5Vuc1ug  n6-Gk65cPZL6Uz8qRm3NYw  WTqjgwHlXbSFevF32_DJVw   \n",
      "3  yi0R0Ugj_xUx_Nek0-_Qig  dacAIZ6fTM6mqwW5uxkskg  ikCg8xy5JIg_NGPx-MSIDA   \n",
      "4  11a8sVPMUFtaC7_ABRkmtw  ssoyf2_x0EQMed6fgHeMyQ  b1b1eb3uo-w561D0ZfCEiQ   \n",
      "\n",
      "   stars  useful  funny  cool  \\\n",
      "0      1       6      1     0   \n",
      "1      5       0      0     0   \n",
      "2      5       3      0     0   \n",
      "3      5       0      0     0   \n",
      "4      1       7      0     0   \n",
      "\n",
      "                                                text                date  \n",
      "0  Total bill for this horrible service? Over $8G... 2013-05-07 04:34:36  \n",
      "1  I *adore* Travis at the Hard Rock's new Kelly ... 2017-01-14 21:30:33  \n",
      "2  I have to say that this office really has it t... 2016-11-09 20:09:03  \n",
      "3  Went in for a lunch. Steak sandwich was delici... 2018-01-09 20:56:38  \n",
      "4  Today was my second out of three sessions I ha... 2018-01-30 23:07:38  \n"
     ]
    }
   ],
   "source": [
    "print(df_all.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1. 分组统合信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 用于提取信息的类\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "stopwords = stopwords.words('english')\n",
    "grammar = r\"\"\"\n",
    " NBAR:\n",
    "    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns\n",
    " NP:\n",
    "    {<NBAR>}\n",
    "    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...\n",
    "\"\"\"\n",
    "lemmatizer = nltk.WordNetLemmatizer()\n",
    "stemmer = nltk.stem.porter.PorterStemmer()\n",
    "\n",
    "class NounPhraseExtractor(object):\n",
    "\n",
    "    def __init__(self, sentence):\n",
    "        self.sentence = sentence\n",
    "\n",
    "    def execute(self):\n",
    "        # Taken from Su Nam Kim Paper...\n",
    "        chunker = nltk.RegexpParser(grammar)\n",
    "        #toks = nltk.regexp_tokenize(text, sentence_re)\n",
    "        # #postoks = nltk.tag.pos_tag(toks)\n",
    "        toks = nltk.word_tokenize(self.sentence)\n",
    "        postoks = nltk.tag.pos_tag(toks)\n",
    "        tree = chunker.parse(postoks)\n",
    "        return tree\n",
    "\n",
    "    def leaves(self, tree):\n",
    "        \"\"\"Finds NP (nounphrase) leaf nodes of a chunk tree.\"\"\"\n",
    "        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):\n",
    "            yield subtree.leaves()\n",
    "\n",
    "    def normalise(self, word):\n",
    "        \"\"\"Normalises words to lowercase and stems and lemmatizes it.\"\"\"\n",
    "        word = word.lower()\n",
    "        word = stemmer.stem(word)\n",
    "        word = lemmatizer.lemmatize(word)\n",
    "        return word\n",
    "\n",
    "    def acceptable_word(self, word):\n",
    "        \"\"\"Checks conditions for acceptable word: length, stopword.\"\"\"\n",
    "        accepted = bool(2 <= len(word) <= 40\n",
    "                    and word.lower() not in stopwords)\n",
    "        return accepted\n",
    "\n",
    "    def get_terms(self,tree):\n",
    "        for leaf in self.leaves(tree):\n",
    "            term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]\n",
    "        yield term\n",
    "\n",
    "    def extract(self):\n",
    "        terms = self.get_terms(self.execute())\n",
    "        matches = []\n",
    "        for term in terms:\n",
    "            for word in term:\n",
    "                matches.append(word)\n",
    "        return matches\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bimorph', 'forc', 'sensor']"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example\n",
    "document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'\n",
    "extract = NounPhraseExtractor(document)\n",
    "extract.extract()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_bus_id = df_all.groupby(['business_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "from collections import defaultdict\n",
    "import json\n",
    "\n",
    "f = open('result_dict.txt', 'a+')\n",
    "\n",
    "\n",
    "g_ct = 0\n",
    "\n",
    "for business_id, df in group_bus_id:  # 遍历每个组\n",
    "    start = time.time()\n",
    "    business_info = defaultdict(dict)\n",
    "    \n",
    "    g_ct += 1\n",
    "    if g_ct <= 192606:  # 控制读取个数\n",
    "        continue\n",
    "    \n",
    "    print(g_ct, length)\n",
    "    length = len(df)\n",
    "    if length < 100:\n",
    "        continue\n",
    "    \n",
    "    scores = []\n",
    "    aspect_ct = defaultdict(int)\n",
    "    aspect_reverse_index = defaultdict(list)  # 组内每个aspect对于(text, business_id的倒排表)\n",
    "    \n",
    "    for _, row in df.iterrows():\n",
    "#         print('_'*32)\n",
    "        \n",
    "        \n",
    "        try:\n",
    "            starts = row['stars']\n",
    "            review_id = row['review_id']\n",
    "            text = row['text']\n",
    "            \n",
    "#             print(starts, review_id, text)\n",
    "            scores.append(starts)\n",
    "            extract_aspects = NounPhraseExtractor(text).extract()  # 对于每个评论生成aspect\n",
    "            for extract_aspect in extract_aspects:\n",
    "                aspect_reverse_index[extract_aspect].append((review_id, text))\n",
    "                aspect_ct[extract_aspect] += 1\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            pass\n",
    "    \n",
    "    sorted_aspect_ct = sorted(aspect_ct.items(), key = lambda x: x[1], reverse=True)[:5]\n",
    "    business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]\n",
    "    for k, _ in sorted_aspect_ct:\n",
    "        business_info[business_id][k] = aspect_reverse_index[k]\n",
    "    business_info[business_id]['scores'] = scores\n",
    "            \n",
    "#     print(business_info)\n",
    "    f.write('{}\\n'.format(json.dumps(business_info)))\n",
    "    print(g_ct, length, time.time() - start)\n",
    "    \n",
    "f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = df_all[['text', 'stars']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('dataset.csv', sep='\\t', nrows=3000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_list = df_train['text'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [],
   "source": [
    "# todo: 划分训练, 测试集\n",
    "\n"
   ]
  },
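  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch for the split todo above, using scikit-learn's `train_test_split`. The names `texts_train`, `texts_test`, `y_train`, and `y_test` are illustrative assumptions; the cells below still fit on the full `text_list`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: hold out 20% of the reviews for evaluation.\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "texts_train, texts_test, y_train, y_test = train_test_split(\n",
    "    df_train['text'].tolist(), df_train['stars'].tolist(),\n",
    "    test_size=0.2, random_state=42)"
   ]
  },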
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3000x14900 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 213295 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 194,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "tfidf_vectorizer = TfidfVectorizer()\n",
    "tfidf_vectorizer.fit_transform(text_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X = tfidf_vectorizer.transform(text_list).toarray()\n",
    "train_y = df_train['stars'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "iris = datasets.load_iris()\n",
    "\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "clf = GaussianNB()\n",
    "clf = clf.fit(train_X, train_y)\n",
    "y_pred=clf.predict(train_X)\n"
   ]
  },
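  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the fit above. Note this is accuracy on the training set itself; scoring the held-out split sketched earlier would give a fairer estimate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: training-set accuracy of the classifier fitted above.\n",
    "from sklearn.metrics import accuracy_score\n",
    "print('train accuracy:', accuracy_score(train_y, y_pred))"
   ]
  },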
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 读取信息, 预测结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# todo: 何判断一个评语有没有包含指定的 aspect"
   ]
  },
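  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One possible approach for the todo above (a sketch, not a settled method): normalise the review's tokens with the same lower/stem/lemmatize steps as `NounPhraseExtractor.normalise`, then test membership, since the aspects produced by `extract()` are already normalised. `contains_aspect` is a hypothetical helper introduced here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: does a review mention a given (normalised) aspect?\n",
    "# contains_aspect is a hypothetical helper, reusing the stemmer/lemmatizer defined above.\n",
    "def contains_aspect(text, aspect):\n",
    "    tokens = {lemmatizer.lemmatize(stemmer.stem(w.lower())) for w in nltk.word_tokenize(text)}\n",
    "    return aspect in tokens\n",
    "\n",
    "contains_aspect('The sensors were great', 'sensor')  # True"
   ]
  },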
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"./line.png\" width = \"300\" height = \"200\" align=center />"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### result_dict.txt中每一行为一个字典, business_id为key, value为5个aspects以及每个aspects对应的(review_id, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "I/O operation on closed file.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-201-022a4aafbf13>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0minfo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;31m# todo: 解析\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: I/O operation on closed file."
     ]
    }
   ],
   "source": [
    "for line in f:\n",
    "    info = json.loads(line)\n",
    "    # todo: 解析\n",
    "    "
   ]
  },
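  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal parsing sketch for the todo above, assuming each line follows the format written in section 1: business_id maps to a dict with an `aspects` list, one key per aspect holding its (review_id, text) pairs, and a `scores` list (`json.dumps` stores the tuples as lists)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: unpack the first line of result_dict.txt.\n",
    "with open('result_dict.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        info = json.loads(line)\n",
    "        for business_id, details in info.items():\n",
    "            print(business_id, details['aspects'], len(details['scores']))\n",
    "            for aspect in details['aspects']:\n",
    "                for review_id, text in details[aspect]:  # stored as [review_id, text] lists\n",
    "                    pass  # todo: feed these texts to the trained classifier\n",
    "        break  # only the first line, for illustration"
   ]
  }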
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py37",
   "language": "python",
   "name": "py37"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}