{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/Users/zeyusu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import fool\n",
    "import time\n",
    "import re\n",
    "import random\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "stopwords = {}\n",
    "with open(r'stopword.txt', 'r', encoding='utf-8') as fr:\n",
    "    for word in fr:\n",
    "        stopwords[word.strip()] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class CLF_MODEL:\n",
    "    # 类目标：该类将所有模型训练、预测、数据预处理、意图识别的函数包括其中\n",
    "\n",
    "    # 初始化模块\n",
    "    def __init__(self):\n",
    "        self.model = \"\"  # 成员变量，用于存储模型\n",
    "        self.vectorizer = \"\"  # 成员变量，用于存储tfidf统计值\n",
    "\n",
    "    # 训练模块\n",
    "    def train(self):\n",
    "        # 函数目标：读取训练数据，训练意图分类模型，并将训练好的分类模型赋值给成员变量self.model\n",
    "        # input：无\n",
    "        # output：无\n",
    "\n",
    "        # 从excel文件读取训练样本\n",
    "        d_train = pd.read_excel(\"data_train.xlsx\")\n",
    "        # 对训练数据进行预处理\n",
    "        d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)\n",
    "        print(\"训练样本 = %d\" % len(d_train))\n",
    "\n",
    "        \"\"\"\n",
    "        TODO：利用sklearn中的函数进行训练，将句子转化为特征features\n",
    "        \"\"\"\n",
    "        self.vectorizer = TfidfVectorizer()\n",
    "        X = self.vectorizer.fit_transform(d_train.sentence_train)\n",
    "        self.model = LogisticRegression(penalty='l1', solver='liblinear')\n",
    "\n",
    "        self.model.fit(X, d_train.label)\n",
    "\n",
    "    # 预测模块（使用模型预测）\n",
    "    def predict_model(self, sentence):\n",
    "        # 函数目标：使用意图分类模型预测意图\n",
    "        #  input：sentence（用户输入）\n",
    "        # output：clf_result（意图类别），score（意图分数）\n",
    "\n",
    "        # --------------\n",
    "        # 对样本中没有的特殊情况做特别判断\n",
    "        if sentence in [\"好的\", \"需要\", \"是的\", \"要的\", \"好\", \"要\", \"是\"]:\n",
    "            return 1, 0.8\n",
    "        # --------------\n",
    "\n",
    "        \"\"\"\n",
    "        TODO：利用已训练好的意图分类模型进行意图识别\n",
    "        \"\"\"\n",
    "        sentence = self.fun_clean(sentence)\n",
    "        x = self.vectorizer.transform([sentence])\n",
    "        clf_result = self.model.predict(x)[0]\n",
    "        score = self.model.predict_proba(x)[0][clf_result]\n",
    "\n",
    "\n",
    "        return clf_result, score\n",
    "\n",
    "    # 预测模块（使用规则）\n",
    "    def predict_rule(self, sentence):\n",
    "        # 函数目标：如果模型训练出现异常，可以使用规则进行预测，同时也可以让学员融合\"模型\"及\"规则\"的预测方式\n",
    "        # input：sentence（用户输入）\n",
    "        # output：clf_result（意图类别），score（意图分数）\n",
    "\n",
    "        sentence = sentence.replace(' ', '')\n",
    "        if re.findall(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):\n",
    "            return 2, 0.8\n",
    "        elif re.findall(r'订|定|预定|买|购', sentence) or sentence in [\"好的\",\"需要\",\"是的\",\"要的\",\"好\",\"要\",\"是\"]:\n",
    "            return 1, 0.8\n",
    "        else:\n",
    "            return 0, 0.8\n",
    "\n",
    "    # 预处理函数\n",
    "    def fun_clean(self, sentence):\n",
    "        # 函数目标：预处理函数，将必要的实体转换成统一符号（利于分类准确），去除停用词等\n",
    "        # input：sentence（用户输入语句）\n",
    "        # output：sentence（预处理结果）\n",
    "\n",
    "        \"\"\"\n",
    "        TODO：预处理函数，将必要的实体转换成统一符号（利于分类准确），去除停用词等\n",
    "        \"\"\"\n",
    "        words, ners = fool.analysis(sentence)\n",
    "\n",
    "        res = []\n",
    "        ner_idx = 0\n",
    "        cur_idx = 0\n",
    "        for word in words[0]:\n",
    "            if ner_idx < len(ners[0]) and cur_idx == ners[0][ner_idx][0]:\n",
    "                if ners[0][ner_idx][0] == 'time':\n",
    "                    res.append('DATE')\n",
    "                elif ners[0][ner_idx][0] == 'location':\n",
    "                    res.append('CITY')\n",
    "                elif word[0] not in stopwords:\n",
    "                    res.append(word[0])\n",
    "                ner_idx += 1\n",
    "            elif word[0] not in stopwords:\n",
    "                res.append(word[0])\n",
    "            cur_idx += len(word)\n",
    "                \n",
    "        return ' '.join(res)\n",
    "\n",
    "    # 分类主函数\n",
    "    def fun_clf(self, sentence):\n",
    "        # 函数目标：意图识别主函数\n",
    "        # input：sentence（ 用户输入语句）\n",
    "        # output：clf_result（意图类别），score（意图分数）\n",
    "\n",
    "        # 对用户输入进行预处理\n",
    "        sentence = self.fun_clean(sentence)\n",
    "        # 得到意图分类结果（0为“查询”类别，1为“订票”类别，2为“终止服务”类别）\n",
    "        clf_result, score = self.predict_model(sentence)  # 使用训练的模型进行意图预测\n",
    "        # clf_result, score = self.predict_rule(sentence)  # 使用规则进行意图预测（可与用模型进行意图识别的方法二选一）\n",
    "        return clf_result, score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentence = '帮 我 查 一下 DATE 去 CITY 的 票'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练样本 = 99\n"
     ]
    }
   ],
   "source": [
    "clf_obj = CLF_MODEL()\n",
    "# 完成意图识别模型的训练\n",
    "clf_obj.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clf_result, score = clf_obj.predict_model(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def fun_replace_num(sentence):\n",
    "    # 函数目标：替换时间中的数字（目的是便于实体识别包fool对实体的识别）\n",
    "    # input：sentence\n",
    "    # output：sentence\n",
    "\n",
    "    # 定义要替换的数字\n",
    "    time_num = {\"一\":\"1\",\"二\":\"2\",\"三\":\"3\",\"四\":\"4\",\"五\":\"5\",\"六\":\"6\",\"七\":\"7\",\"八\":\"8\",\"九\":\"9\",\"十\":\"10\",\"十一\":\"11\",\"十二\":\"12\"}\n",
    "    for k, v in time_num.items():\n",
    "        sentence = sentence.replace(k, v)\n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def slot_fill(sentence, key=None):\n",
    "    # 函数目标：填槽函数（该函数从sentence中寻找需要的内容，完成填槽工作）\n",
    "    # input：sentence（用户输入）, key（指定槽位，只对该句话提取指定槽位的信息）\n",
    "    # output：slot（返回填槽的结果，以json格式返回，key为槽位名，value为值）\n",
    "\n",
    "    slot = {}\n",
    "    # 进行实体识别\n",
    "    slot_lst = {\"time\":\"time\", \"date\":\"time\", \"from_city\":'location', \"to_city\":'location'}\n",
    "    words, ners = fool.analysis(sentence)\n",
    "    \"\"\"\n",
    "    TODO：从sentence中寻找需要的内容，完成填槽工作\n",
    "    \"\"\"\n",
    "    if not key:\n",
    "        keys = [\"time\", \"date\", \"from_city\", \"to_city\"]\n",
    "    else:\n",
    "        keys = [key]\n",
    "    for key in keys:\n",
    "        print(key)\n",
    "        if ners != [[]]:\n",
    "            for i in range(len(ners[0])):\n",
    "                if ners[0][i][2] == slot_lst[key]:\n",
    "                    if key == \"time\" and not isTimeFormat(ners[0][i][3]):\n",
    "                        continue\n",
    "                    slot[key] = ners[0][i][3]\n",
    "                    del ners[0][i]\n",
    "                    break\n",
    "    return slot\n",
    "\n",
    "\n",
    "def fun_wait(clf_obj):\n",
    "    # 函数目标：等待，获取用户输入问句\n",
    "    # input：CLF_MODEL类实例化对象\n",
    "    # output：clf_result（用户输入意图类别）, score（意图识别分数）, sentence（用户输入）\n",
    "\n",
    "    # 等待用户输入\n",
    "    print(\"\\n\\n\\n\")\n",
    "    print(\"-------------------------------------------------------------\")\n",
    "    print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n",
    "    print(\"Starting ...\")\n",
    "    sentence = input(\"客服：请问需要什么服务？(时间请用12小时制表示）\\n\")\n",
    "    # 对用户输入进行意图识别\n",
    "    clf_result, score = clf_obj.fun_clf(sentence)\n",
    "    return clf_result, score, sentence\n",
    "\n",
    "\n",
    "def fun_search(clf_result, sentence):\n",
    "    # 函数目标：为用户查询余票\n",
    "    # input：clf_result（意图分类结果）, sentence（用户输入问句）\n",
    "    # output：是否有票\n",
    "\n",
    "    # 定义槽存储空间\n",
    "    name = {\"time\":\"出发时间\", \"date\":\"出发日期\", \"from_city\":\"出发城市\", \"to_city\":\"到达城市\"}\n",
    "    slot = {\"time\":\"\", \"date\":\"\", \"from_city\":\"\", \"to_city\":\"\"}\n",
    "    # 使用用户第一句话进行填槽\n",
    "    sentence = fun_replace_num(sentence)\n",
    "    slot_init = slot_fill(sentence)\n",
    "    for key in slot_init.keys():\n",
    "        slot[key] = slot_init[key]\n",
    "    # 对未填充对槽位，向用户提问，进行针对性填槽\n",
    "    while \"\" in slot.values():\n",
    "        for key in slot.keys():\n",
    "            if slot[key]==\"\":\n",
    "                sentence = input(\"客服：请问%s是？\\n\"%(name[key]))\n",
    "                sentence = fun_replace_num(sentence)\n",
    "                slot_cur = slot_fill(sentence, key)\n",
    "                for key in slot_cur.keys():\n",
    "                    if slot[key]==\"\":\n",
    "                        slot[key] = slot_cur[key]\n",
    "\n",
    "    # 查询是否有票，并答复用户（本次查询是否有票使用随机数完成，实际情况可查询数据库返回）\n",
    "    if random.random()>0.5:\n",
    "        print(\"客服：%s%s从%s到%s的票充足\"%(slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n",
    "        # 返回1表示有票\n",
    "        return 1\n",
    "    else:\n",
    "        print(\"客服：%s%s从%s到%s无票\" % (slot[\"date\"], slot[\"time\"], slot[\"from_city\"], slot[\"to_city\"]))\n",
    "        print(\"End !!!\")\n",
    "        print(\"----*------*-----*-----*----*-----*-----*-----*-----*------\")\n",
    "        print(\"-------------------------------------------------------------\")\n",
    "        # 返回0表示无票\n",
    "        return 0\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 3), match='4pm'>\n"
     ]
    }
   ],
   "source": [
    "time='4pm' \n",
    "print(re.match('^\\d{1,2}([:.]?\\d{1,2})?([ ]?[a|p]m)?$',time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[('4', 'm'), ('pm', 'nx')]]\n",
      "[[]]\n"
     ]
    }
   ],
   "source": [
    "s = '4pm'\n",
    "words, ners = fool.analysis(s)\n",
    "print(words)\n",
    "print(ners)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}