Upload New File

aa1d73db · 20210516036 · 7beb87f3 · aa1d73db
Commit aa1d73db authored Jul 04, 2021 by 20210516036
Show whitespace changes
Inline Side-by-side

Showing with 515 additions and 0 deletions

codes/preprocessor.ipynb
+515 -0

No files found.
--- a/codes/preprocessor.ipynb
+++ b/codes/preprocessor.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 文本的预处理\n",
+    "关于这个模块，你并不需要完成任何任务，所有的模块已经写好，你只需要读一读就可以了。输入为question_answer.txt，最后处理之后的结果存放在question_answer_parse.pkl文件中。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import emoji\n",
+    "import re\n",
+    "import jieba\n",
+    "import os\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 导入原始数据\n",
+    "QApares_df = pd.read_csv('data/question_answer.txt',sep='\\t',header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>买 二份 有没有 少点 呀</td>\n",
+       "      <td>亲亲 真的 不好意思 我们 已经 是 优惠价 了 呢 小本生意 请亲 谅解</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>那 就 等 你们 处理 喽</td>\n",
+       "      <td>好 的 亲退 了</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>那 我 不 喜欢</td>\n",
+       "      <td>颜色 的话 一般 茶刀 茶针 和 二合一 的话 都 是 红木 檀 和 黑木 檀 哦</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>不是 免 运费</td>\n",
+       "      <td>本店 茶具 订单 满 99 包邮除 宁夏 青海 内蒙古 海南 新疆 西藏 满 39 包邮</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>好吃 吗</td>\n",
+       "      <td>好吃 的</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               0                                             1\n",
+       "0  买 二份 有没有 少点 呀         亲亲 真的 不好意思 我们 已经 是 优惠价 了 呢 小本生意 请亲 谅解\n",
+       "1  那 就 等 你们 处理 喽                                      好 的 亲退 了\n",
+       "2       那 我 不 喜欢     颜色 的话 一般 茶刀 茶针 和 二合一 的话 都 是 红木 檀 和 黑木 檀 哦\n",
+       "3        不是 免 运费  本店 茶具 订单 满 99 包邮除 宁夏 青海 内蒙古 海南 新疆 西藏 满 39 包邮\n",
+       "4           好吃 吗                                          好吃 的"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#观察原始数据格式，查看头几个样本\n",
+    "QApares_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question_before_preprocessing</th>\n",
+       "      <th>answer</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>买 二份 有没有 少点 呀</td>\n",
+       "      <td>亲亲 真的 不好意思 我们 已经 是 优惠价 了 呢 小本生意 请亲 谅解</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>那 就 等 你们 处理 喽</td>\n",
+       "      <td>好 的 亲退 了</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>那 我 不 喜欢</td>\n",
+       "      <td>颜色 的话 一般 茶刀 茶针 和 二合一 的话 都 是 红木 檀 和 黑木 檀 哦</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>不是 免 运费</td>\n",
+       "      <td>本店 茶具 订单 满 99 包邮除 宁夏 青海 内蒙古 海南 新疆 西藏 满 39 包邮</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>好吃 吗</td>\n",
+       "      <td>好吃 的</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  question_before_preprocessing                                        answer\n",
+       "0                 买 二份 有没有 少点 呀         亲亲 真的 不好意思 我们 已经 是 优惠价 了 呢 小本生意 请亲 谅解\n",
+       "1                 那 就 等 你们 处理 喽                                      好 的 亲退 了\n",
+       "2                      那 我 不 喜欢     颜色 的话 一般 茶刀 茶针 和 二合一 的话 都 是 红木 檀 和 黑木 檀 哦\n",
+       "3                       不是 免 运费  本店 茶具 订单 满 99 包邮除 宁夏 青海 内蒙古 海南 新疆 西藏 满 39 包邮\n",
+       "4                          好吃 吗                                          好吃 的"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 将 QApares_df列重命名\n",
+    "QApares_df.columns = ['question_before_preprocessing','answer']\n",
+    "QApares_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def rm_space(sentence):\n",
+    "    '''将已分好词的句子去掉词之间的空格变成原句子，如将‘那 就 等 你们 处理 喽’改为‘那就等你们处理喽’'''\n",
+    "    return ''.join(sentence.split())\n",
+    "QApares_df['question'] = QApares_df.question_before_preprocessing.apply(rm_space)\n",
+    "QApares_df['answer'] = QApares_df.answer.apply(rm_space)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question_before_preprocessing</th>\n",
+       "      <th>answer</th>\n",
+       "      <th>question</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>买 二份 有没有 少点 呀</td>\n",
+       "      <td>亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解</td>\n",
+       "      <td>买二份有没有少点呀</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>那 就 等 你们 处理 喽</td>\n",
+       "      <td>好的亲退了</td>\n",
+       "      <td>那就等你们处理喽</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>那 我 不 喜欢</td>\n",
+       "      <td>颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦</td>\n",
+       "      <td>那我不喜欢</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>不是 免 运费</td>\n",
+       "      <td>本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮</td>\n",
+       "      <td>不是免运费</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>好吃 吗</td>\n",
+       "      <td>好吃的</td>\n",
+       "      <td>好吃吗</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  question_before_preprocessing                          answer   question\n",
+       "0                 买 二份 有没有 少点 呀      亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解  买二份有没有少点呀\n",
+       "1                 那 就 等 你们 处理 喽                           好的亲退了   那就等你们处理喽\n",
+       "2                      那 我 不 喜欢      颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦      那我不喜欢\n",
+       "3                       不是 免 运费  本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮      不是免运费\n",
+       "4                          好吃 吗                             好吃的        好吃吗"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "QApares_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "接下来对数据进行预处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 清洗数据\n",
+    "def clean(content):\n",
+    "    #将数据中的emoji表情转化为文字\n",
+    "    content = emoji.demojize(content)\n",
+    "    #过滤其中的html标签\n",
+    "    content = re.sub('<.*>','',content)\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 对每个问题应用上一clean函数进行清洗\n",
+    "QApares_df['question_after_preprocessing'] = QApares_df['question_before_preprocessing'].apply(clean)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按照一般的流程，之后应该是对问题进行分词，但由于我们的数据已经是分好词的数据，所以这里不再进行中文分词，而是直接将分好的词以列表的形式存储。\n",
+    "QApares_df['question_after_preprocessing'] = QApares_df['question_after_preprocessing'].apply(lambda x:x.split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 对question_after_preprocessing列去除单词中的空格回车等符号\n",
+    "def strip(wordList):\n",
+    "    return [word.strip() for word in wordList if word.strip()!='']\n",
+    "QApares_df['question_after_preprocessing'] = QApares_df['question_after_preprocessing'].apply(strip)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 读取data/stopWord.json中保存的的停用词表，并保存在列表中\n",
+    "# 中文停用词表的下载地址:https://github.com/goto456/stopwords/\n",
+    "with open(\"data/stopWord.json\",\"r\") as f:\n",
+    "    stopWords = f.read().split(\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 去除停用词\n",
+    "def rm_stop_word(wordList):\n",
+    "    return [word for word in wordList if word not in stopWords]\n",
+    "QApares_df['question_after_preprocessing'] = QApares_df['question_after_preprocessing'].apply(rm_stop_word)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 去除低频词\n",
+    "allWords = [word for question in QApares_df['question_after_preprocessing'] for word in question] #所有词组成的列表\n",
+    "freWord = Counter(allWords) #统计词频，一个字典，键为词，值为词出现的次数\n",
+    "highFreWords = [word for word in freWord.keys() if freWord[word]>5] # 词频超过5的词列表, 剩下的词去掉\n",
+    "def rm_low_fre_word(content):\n",
+    "    return [word for word in content if word in highFreWords]\n",
+    "\n",
+    "# 去除低频词\n",
+    "QApares_df['question_after_preprocessing'] = QApares_df['question_after_preprocessing'].apply(rm_low_fre_word)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>answer</th>\n",
+       "      <th>question_after_preprocessing</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>买二份有没有少点呀</td>\n",
+       "      <td>亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解</td>\n",
+       "      <td>[买, 二份, 有没有, 少点]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>那就等你们处理喽</td>\n",
+       "      <td>好的亲退了</td>\n",
+       "      <td>[处理]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>那我不喜欢</td>\n",
+       "      <td>颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦</td>\n",
+       "      <td>[喜欢]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>不是免运费</td>\n",
+       "      <td>本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮</td>\n",
+       "      <td>[免, 运费]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>好吃吗</td>\n",
+       "      <td>好吃的</td>\n",
+       "      <td>[好吃]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    question                          answer question_after_preprocessing\n",
+       "0  买二份有没有少点呀      亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解             [买, 二份, 有没有, 少点]\n",
+       "1   那就等你们处理喽                           好的亲退了                         [处理]\n",
+       "2      那我不喜欢      颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦                         [喜欢]\n",
+       "3      不是免运费  本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮                      [免, 运费]\n",
+       "4        好吃吗                             好吃的                         [好吃]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 只保留question,answer和question_after_preprocessing列\n",
+    "QApares_df = QApares_df[['question','answer','question_after_preprocessing']]\n",
+    "#查看预处理之后的数据\n",
+    "QApares_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#由于保存为csv格式会将数据默认保存为str格式，但我们的数据中question_after_preprocessing列中每一项为列表形式，保存为csv格式会增加我们后面处理数据的难度，所以这里我们将保存为pickle形式\n",
+    "import pickle\n",
+    "with open('data/question_answer_pares.pkl','wb') as f:\n",
+    "    pickle.dump(QApares_df,f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:greedyaiqa] *",
+   "language": "python",
+   "name": "conda-env-greedyaiqa-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}