Commit 4083d8eb by 20200913012

project2 unfinished

parent 1f320c49
{
"cells": [
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.read_json('review.json', encoding='utf-8', lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(37, 9) Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',\n",
" 'cool', 'text', 'date'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(df_all.shape, df_all.columns)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review_id user_id business_id \\\n",
"0 Q1sbwvVQXV2734tPgoKj4Q hG7b0MtEbXx5QzbzE6C_VA ujmEBvifdJM6h6RLv4wQIg \n",
"1 GJXCdrto3ASJOqKeVWPi6Q yXQM5uF2jS6es16SJzNHfg NZnhc2sEQy3RmzKTZnqtwQ \n",
"2 2TzJjDVDEuAW6MR5Vuc1ug n6-Gk65cPZL6Uz8qRm3NYw WTqjgwHlXbSFevF32_DJVw \n",
"3 yi0R0Ugj_xUx_Nek0-_Qig dacAIZ6fTM6mqwW5uxkskg ikCg8xy5JIg_NGPx-MSIDA \n",
"4 11a8sVPMUFtaC7_ABRkmtw ssoyf2_x0EQMed6fgHeMyQ b1b1eb3uo-w561D0ZfCEiQ \n",
"\n",
" stars useful funny cool \\\n",
"0 1 6 1 0 \n",
"1 5 0 0 0 \n",
"2 5 3 0 0 \n",
"3 5 0 0 0 \n",
"4 1 7 0 0 \n",
"\n",
" text date \n",
"0 Total bill for this horrible service? Over $8G... 2013-05-07 04:34:36 \n",
"1 I *adore* Travis at the Hard Rock's new Kelly ... 2017-01-14 21:30:33 \n",
"2 I have to say that this office really has it t... 2016-11-09 20:09:03 \n",
"3 Went in for a lunch. Steak sandwich was delici... 2018-01-09 20:56:38 \n",
"4 Today was my second out of three sessions I ha... 2018-01-30 23:07:38 \n"
]
}
],
"source": [
"print(df_all.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. 分组统合信息"
]
},
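{
"cell_type": "markdown",
"metadata": {},
"source": [
"The extractor defined below relies on several NLTK resources (tokenizer models, POS tagger, stopword list, WordNet). If they are not already installed, a one-time download along these lines should work; this cell is an added reproducibility note, not part of the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-time NLTK downloads used by the extractor below:\n",
"# punkt for word_tokenize, averaged_perceptron_tagger for pos_tag,\n",
"# stopwords for the stopword list, wordnet for the lemmatizer\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')"
]
},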
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"# 用于提取信息的类\n",
"\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"stopwords = stopwords.words('english')\n",
"grammar = r\"\"\"\n",
" NBAR:\n",
" {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns\n",
" NP:\n",
" {<NBAR>}\n",
" {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...\n",
"\"\"\"\n",
"lemmatizer = nltk.WordNetLemmatizer()\n",
"stemmer = nltk.stem.porter.PorterStemmer()\n",
"\n",
"class NounPhraseExtractor(object):\n",
"\n",
" def __init__(self, sentence):\n",
" self.sentence = sentence\n",
"\n",
" def execute(self):\n",
" # Taken from Su Nam Kim Paper...\n",
" chunker = nltk.RegexpParser(grammar)\n",
" #toks = nltk.regexp_tokenize(text, sentence_re)\n",
" # #postoks = nltk.tag.pos_tag(toks)\n",
" toks = nltk.word_tokenize(self.sentence)\n",
" postoks = nltk.tag.pos_tag(toks)\n",
" tree = chunker.parse(postoks)\n",
" return tree\n",
"\n",
" def leaves(self, tree):\n",
" \"\"\"Finds NP (nounphrase) leaf nodes of a chunk tree.\"\"\"\n",
" for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):\n",
" yield subtree.leaves()\n",
"\n",
" def normalise(self, word):\n",
" \"\"\"Normalises words to lowercase and stems and lemmatizes it.\"\"\"\n",
" word = word.lower()\n",
" word = stemmer.stem(word)\n",
" word = lemmatizer.lemmatize(word)\n",
" return word\n",
"\n",
" def acceptable_word(self, word):\n",
" \"\"\"Checks conditions for acceptable word: length, stopword.\"\"\"\n",
" accepted = bool(2 <= len(word) <= 40\n",
" and word.lower() not in stopwords)\n",
" return accepted\n",
"\n",
" def get_terms(self,tree):\n",
" for leaf in self.leaves(tree):\n",
" term = [self.normalise(w) for w, t in leaf if self.acceptable_word(w)]\n",
" yield term\n",
"\n",
" def extract(self):\n",
" terms = self.get_terms(self.execute())\n",
" matches = []\n",
" for term in terms:\n",
" for word in term:\n",
" matches.append(word)\n",
" return matches\n"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bimorph', 'forc', 'sensor']"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# example\n",
"document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'\n",
"extract = NounPhraseExtractor(document)\n",
"extract.extract()"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"group_bus_id = df_all.groupby(['business_id'])"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import time\n",
"from collections import defaultdict\n",
"import json\n",
"\n",
"f = open('result_dict.txt', 'a+')\n",
"\n",
"\n",
"g_ct = 0\n",
"\n",
"for business_id, df in group_bus_id: # 遍历每个组\n",
" start = time.time()\n",
" business_info = defaultdict(dict)\n",
" \n",
" g_ct += 1\n",
" if g_ct <= 192606: # 控制读取个数\n",
" continue\n",
" \n",
" print(g_ct, length)\n",
" length = len(df)\n",
" if length < 100:\n",
" continue\n",
" \n",
" scores = []\n",
" aspect_ct = defaultdict(int)\n",
" aspect_reverse_index = defaultdict(list) # 组内每个aspect对于(text, business_id的倒排表)\n",
" \n",
" for _, row in df.iterrows():\n",
"# print('_'*32)\n",
" \n",
" \n",
" try:\n",
" starts = row['stars']\n",
" review_id = row['review_id']\n",
" text = row['text']\n",
" \n",
"# print(starts, review_id, text)\n",
" scores.append(starts)\n",
" extract_aspects = NounPhraseExtractor(text).extract() # 对于每个评论生成aspect\n",
" for extract_aspect in extract_aspects:\n",
" aspect_reverse_index[extract_aspect].append((review_id, text))\n",
" aspect_ct[extract_aspect] += 1\n",
" except Exception as e:\n",
" print(e)\n",
" pass\n",
" \n",
" sorted_aspect_ct = sorted(aspect_ct.items(), key = lambda x: x[1], reverse=True)[:5]\n",
" business_info[business_id]['aspects'] = [k for k, _ in sorted_aspect_ct]\n",
" for k, _ in sorted_aspect_ct:\n",
" business_info[business_id][k] = aspect_reverse_index[k]\n",
" business_info[business_id]['scores'] = scores\n",
" \n",
"# print(business_info)\n",
" f.write('{}\\n'.format(json.dumps(business_info)))\n",
" print(g_ct, length, time.time() - start)\n",
" \n",
"f.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. 训练模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_train = df_all[['text', 'stars']]"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv('dataset.csv', sep='\\t', nrows=3000)"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"text_list = df_train['text'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
"# todo: 划分训练, 测试集\n",
"\n"
]
},
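{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch for the split todo above, assuming scikit-learn's `train_test_split` over the `text` and `stars` columns of `df_train`; the names `train_texts`, `test_texts`, `train_labels`, `test_labels` and the 80/20 ratio are illustrative choices, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: hold out 20% of the reviews for testing.\n",
"# test_size and random_state are arbitrary illustrative values.\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_texts, test_texts, train_labels, test_labels = train_test_split(\n",
"    df_train['text'].tolist(),\n",
"    df_train['stars'].tolist(),\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")"
]
},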
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3000x14900 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 213295 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"tfidf_vectorizer.fit_transform(text_list)"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"train_X = tfidf_vectorizer.transform(text_list).toarray()\n",
"train_y = df_train['stars'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import datasets\n",
"iris = datasets.load_iris()\n",
"\n",
"from sklearn.naive_bayes import GaussianNB\n",
"clf = GaussianNB()\n",
"clf = clf.fit(train_X, train_y)\n",
"y_pred=clf.predict(train_X)\n"
]
},
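{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the fit above, assuming scikit-learn's `accuracy_score`. Since `y_pred` was computed on `train_X`, this is training accuracy only; a fair estimate would use the held-out split sketched earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Training-set accuracy for the GaussianNB fit above (optimistic by construction)\n",
"from sklearn.metrics import accuracy_score\n",
"print('training accuracy:', accuracy_score(train_y, y_pred))"
]
},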
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. 读取信息, 预测结果"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# todo: 何判断一个评语有没有包含指定的 aspect"
]
},
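{
"cell_type": "markdown",
"metadata": {},
"source": [
"One possible answer to the todo above: treat a review as mentioning an aspect when the aspect token appears among the review's extracted, normalised noun-phrase tokens. `review_mentions_aspect` is a hypothetical helper added here, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: aspect membership via the same normalisation the extractor applies,\n",
"# so a stored aspect like 'forc' matches the surface word 'force'.\n",
"# review_mentions_aspect is a hypothetical helper name.\n",
"def review_mentions_aspect(text, aspect):\n",
"    return aspect in NounPhraseExtractor(text).extract()\n",
"\n",
"# example: True, since 'sensor' is among the extracted tokens\n",
"review_mentions_aspect('Equipped with a bimorph force sensor', 'sensor')"
]
},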
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./line.png\" width = \"300\" height = \"200\" align=center />"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### result_dict.txt中每一行为一个字典, business_id为key, value为5个aspects以及每个aspects对应的(review_id, text)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "I/O operation on closed file.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-201-022a4aafbf13>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0minfo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# todo: 解析\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: I/O operation on closed file."
]
}
],
"source": [
"for line in f:\n",
" info = json.loads(line)\n",
" # todo: 解析\n",
" "
]
}
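,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough sketch of the intended final step, assuming `tfidf_vectorizer` and `clf` from section 2 are in scope: for each business, score every stored aspect by predicting stars for the reviews that mention it and averaging. `score_business_aspects` is a hypothetical helper, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: average predicted stars per aspect from one parsed line of result_dict.txt.\n",
"# Assumes tfidf_vectorizer and clf from section 2; score_business_aspects is a\n",
"# hypothetical helper name.\n",
"def score_business_aspects(info):\n",
"    results = {}\n",
"    for business_id, data in info.items():\n",
"        aspect_scores = {}\n",
"        for aspect in data['aspects']:\n",
"            texts = [text for _, text in data[aspect]]\n",
"            X = tfidf_vectorizer.transform(texts).toarray()\n",
"            preds = clf.predict(X)\n",
"            aspect_scores[aspect] = sum(preds) / len(preds)\n",
"        results[business_id] = aspect_scores\n",
"    return results"
]
}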
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "py37"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}