Commit 4083d8eb by 20200913012

project2 unfinished

parent 1f320c49
{
"cells": [
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.read_json('review.json', encoding='utf-8', lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',\n",
" 'cool', 'text', 'date'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(df_all.shape, df_all.columns)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review_id user_id business_id \\\n",
"0 Q1sbwvVQXV2734tPgoKj4Q hG7b0MtEbXx5QzbzE6C_VA ujmEBvifdJM6h6RLv4wQIg \n",
"1 GJXCdrto3ASJOqKeVWPi6Q yXQM5uF2jS6es16SJzNHfg NZnhc2sEQy3RmzKTZnqtwQ \n",
"2 2TzJjDVDEuAW6MR5Vuc1ug n6-Gk65cPZL6Uz8qRm3NYw WTqjgwHlXbSFevF32_DJVw \n",
"3 yi0R0Ugj_xUx_Nek0-_Qig dacAIZ6fTM6mqwW5uxkskg ikCg8xy5JIg_NGPx-MSIDA \n",
"4 11a8sVPMUFtaC7_ABRkmtw ssoyf2_x0EQMed6fgHeMyQ b1b1eb3uo-w561D0ZfCEiQ \n",
"\n",
" stars useful funny cool \\\n",
"0 1 6 1 0 \n",
"1 5 0 0 0 \n",
"2 5 3 0 0 \n",
"3 5 0 0 0 \n",
"4 1 7 0 0 \n",
"\n",
" text date \n",
"0 Total bill for this horrible service? Over $8G... 2013-05-07 04:34:36 \n",
"1 I *adore* Travis at the Hard Rock's new Kelly ... 2017-01-14 21:30:33 \n",
"2 I have to say that this office really has it t... 2016-11-09 20:09:03 \n",
"3 Went in for a lunch. Steak sandwich was delici... 2018-01-09 20:56:38 \n",
"4 Today was my second out of three sessions I ha... 2018-01-30 23:07:38 \n"
]
}
],
"source": [
"print(df_all.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. 分组统合信息"
]
},
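{
"cell_type": "markdown",
"metadata": {},
"source": [
"The extractor defined below relies on several NLTK resources (tokenizer models, POS tagger, stopword list, WordNet). If they are not already installed, a one-time download along these lines should work; this cell is an added reproducibility note, not part of the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-time NLTK downloads used by the extractor below:\n",
"# punkt for word_tokenize, averaged_perceptron_tagger for pos_tag,\n",
"# stopwords for the stopword list, wordnet for the lemmatizer\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')"
]
},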
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"# 用于提取信息的类\n",
"\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"stopwords = stopwords.words('english')\n",
"grammar = r\"\"\"\n",
" NBAR:\n",
" {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns\n",
" NP:\n",
" {<NBAR>}\n",
" {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...\n",
"\"\"\"\n",
"lemmatizer = nltk.WordNetLemmatizer()\n",
"stemmer = nltk.stem.porter.PorterStemmer()\n",
"\n",
"class NounPhraseExtractor(object):\n",
"\n",
" def __init__(self, sentence):\n",
" self.sentence = sentence\n",
"\n",
" def execute(self):\n",
" # Taken from Su Nam Kim Paper...\n",
" chunker = nltk.RegexpParser(grammar)\n",
" #toks = nltk.regexp_tokenize(text, sentence_re)\n",
" # #postoks = nltk.tag.pos_tag(toks)\n",
" toks = nltk.word_tokenize(self.sentence)\n",
" postoks = nltk.tag.pos_tag(toks)\n",
" tree = chunker.parse(postoks)\n",
" return tree\n",
"\n",
" def leaves(self, tree):\n",
" \"\"\"Finds NP (nounphrase) leaf nodes of a chunk tree.\"\"\"\n",
" for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):\n",
" yield subtree.leaves()\n",
"\n",
" def normalise(self, word):\n",
" \"\"\"Normalises words to lowercase and stems and lemmatizes it.\"\"\"\n",
" word = word.lower()\n",
" word = stemmer.stem(word)\n",
" word = lemmatizer.lemmatize(word)\n",
" return word\n",
"\n",
" def acceptable_word(self, word):\n",
" \"\"\"Checks conditions for acceptable word: length, stopword.\"\"\"\n",
" accepted = bool(2 <= len(word) <= 40\n",
" and word.lower() not in stopwords)\n",
" return accepted\n",
"\n",
" def get_terms(self,tree):\n",
" for leaf in self.leaves(tree):\n",
" term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]\n",
" yield term\n",
"\n",
" def extract(self):\n",
" terms = self.get_terms(self.execute())\n",
" matches = []\n",
" for term in terms:\n",
" for word in term:\n",
" matches.append(word)\n",
" return matches\n"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bimorph', 'forc', 'sensor']"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# example\n",
"document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'\n",
"extract = NounPhraseExtractor(document)\n",
"extract.extract()"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"group_bus_id = df_all.groupby(['business_id'])"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import time\n",
"from collections import defaultdict\n",
"import json\n",
"\n",
"f = open('result_dict.txt', 'a+')\n",
"\n",
"\n",
"g_ct = 0\n",
"\n",
"for business_id, df in group_bus_id: # 遍历每个组\n",
" start = time.time()\n",
" business_info = defaultdict(dict)\n",
" \n",
" g_ct += 1\n",
" if g_ct <= 192606: # 控制读取个数\n",
" continue\n",
" \n",
" print(g_ct, length)\n",
" length = len(df)\n",
" if length < 100:\n",
" continue\n",
" \n",
" scores = []\n",
" aspect_ct = defaultdict(int)\n",
" aspect_reverse_index = defaultdict(list) # 组内每个aspect对于(text, business_id的倒排表)\n",
" \n",
" for _, row in df.iterrows():\n",
"# print('_'*32)\n",
" \n",
" \n",
" try:\n",
" starts = row['stars']\n",
" review_id = row['review_id']\n",
" text = row['text']\n",
" \n",
"# print(starts, review_id, text)\n",
" scores.append(starts)\n",
" extract_aspects = NounPhraseExtractor(text).extract() # 对于每个评论生成aspect\n",
" for extract_aspect in extract_aspects:\n",
" aspect_reverse_index[extract_aspect].append((review_id, text))\n",
" aspect_ct[extract_aspect] += 1\n",
" except Exception as e:\n",
" print(e)\n",
" pass\n",
" \n",
" sorted_aspect_ct = sorted(aspect_ct.items(), key = lambda x: x[1], reverse=True)[:5]\n",
" business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]\n",
" for k, _ in sorted_aspect_ct:\n",
" business_info[business_id][k] = aspect_reverse_index[k]\n",
" business_info[business_id]['scores'] = scores\n",
" \n",
"# print(business_info)\n",
" f.write('{}\\n'.format(json.dumps(business_info)))\n",
" print(g_ct, length, time.time() - start)\n",
" \n",
"f.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. 训练模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_train = df_all[['text', 'stars']]"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv('dataset.csv', sep='\\t', nrows=3000)"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"text_list = df_train['text'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
"# todo: 划分训练, 测试集\n",
"\n"
]
},
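{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch for the split todo above, assuming scikit-learn's `train_test_split` over the `text` and `stars` columns of `df_train`; the names `train_texts`, `test_texts`, `train_labels`, `test_labels` and the 80/20 ratio are illustrative choices, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: hold out 20% of the reviews for testing.\n",
"# test_size and random_state are arbitrary illustrative values.\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_texts, test_texts, train_labels, test_labels = train_test_split(\n",
"    df_train['text'].tolist(),\n",
"    df_train['stars'].tolist(),\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")"
]
},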
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3000x14900 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 213295 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"tfidf_vectorizer.fit_transform(text_list)"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"train_X = tfidf_vectorizer.transform(text_list).toarray()\n",
"train_y = df_train['stars'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import datasets\n",
"iris = datasets.load_iris()\n",
"\n",
"from sklearn.naive_bayes import GaussianNB\n",
"clf = GaussianNB()\n",
"clf = clf.fit(train_X, train_y)\n",
"y_pred=clf.predict(train_X)\n"
]
},
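{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the fit above, assuming scikit-learn's `accuracy_score`. Since `y_pred` was computed on `train_X`, this is training accuracy only; a fair estimate would use the held-out split sketched earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Training-set accuracy for the GaussianNB fit above (optimistic by construction)\n",
"from sklearn.metrics import accuracy_score\n",
"print('training accuracy:', accuracy_score(train_y, y_pred))"
]
},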
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. 读取信息, 预测结果"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# todo: 何判断一个评语有没有包含指定的 aspect"
]
},
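{
"cell_type": "markdown",
"metadata": {},
"source": [
"One possible answer to the todo above: treat a review as mentioning an aspect when the aspect token appears among the review's extracted, normalised noun-phrase tokens. `review_mentions_aspect` is a hypothetical helper added here, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: aspect membership via the same normalisation the extractor applies,\n",
"# so a stored aspect like 'forc' matches the surface word 'force'.\n",
"# review_mentions_aspect is a hypothetical helper name.\n",
"def review_mentions_aspect(text, aspect):\n",
"    return aspect in NounPhraseExtractor(text).extract()\n",
"\n",
"# example: True, since 'sensor' is among the extracted tokens\n",
"review_mentions_aspect('Equipped with a bimorph force sensor', 'sensor')"
]
},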
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./line.png\" width = \"300\" height = \"200\" align=center />"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### result_dict.txt中每一行为一个字典, business_id为key, value为5个aspects以及每个aspects对应的(review_id, text)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "I/O operation on closed file.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-201-022a4aafbf13>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0minfo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# todo: 解析\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: I/O operation on closed file."
]
}
],
"source": [
"for line in f:\n",
" info = json.loads(line)\n",
" # todo: 解析\n",
" "
]
}
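,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough sketch of the intended final step, assuming `tfidf_vectorizer` and `clf` from section 2 are in scope: for each business, score every stored aspect by predicting stars for the reviews that mention it and averaging. `score_business_aspects` is a hypothetical helper, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: average predicted stars per aspect from one parsed line of result_dict.txt.\n",
"# Assumes tfidf_vectorizer and clf from section 2; score_business_aspects is a\n",
"# hypothetical helper name.\n",
"def score_business_aspects(info):\n",
"    results = {}\n",
"    for business_id, data in info.items():\n",
"        aspect_scores = {}\n",
"        for aspect in data['aspects']:\n",
"            texts = [text for _, text in data[aspect]]\n",
"            X = tfidf_vectorizer.transform(texts).toarray()\n",
"            preds = clf.predict(X)\n",
"            aspect_scores[aspect] = sum(preds) / len(preds)\n",
"        results[business_id] = aspect_scores\n",
"    return results"
]
}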
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "py37"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}