hw2

bc857197 · 20200116311 · ccfb41ea · bc857197 · bc857197 · bc857197
Commit bc857197 authored Feb 22, 2020 by 20200116311
Hide whitespace changes
Inline Side-by-side

Showing with 323 additions and 0 deletions

homework2/.ipynb_checkpoints/spell_correct-checkpoint.ipynb
+165 -0

homework2/bayes_train_text.txt
+0 -0

homework2/spell_correct.ipynb
+158 -0

No files found.
--- a/homework2/.ipynb_checkpoints/spell_correct-checkpoint.ipynb
+++ b/homework2/.ipynb_checkpoints/spell_correct-checkpoint.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import collections #可以使用.defaultdict给字典变量设置一个默认值"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract every word of the corpus file, lower-cased.\n",
+    "def readWords(text):\n",
+    "    \"\"\"Return the lower-cased, punctuation-free words of a text file.\n",
+    "\n",
+    "    text: path of the file to read (opened with the platform default encoding).\n",
+    "    Returns a list of whitespace-separated tokens.\n",
+    "    \"\"\"\n",
+    "    punctuation = '''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\n",
+    "    # The context manager closes the handle even if reading fails.\n",
+    "    with open(text) as corpus:\n",
+    "        content = corpus.read().lower()\n",
+    "    # A single translate() pass deletes every punctuation character.\n",
+    "    return content.translate(str.maketrans('', '', punctuation)).split()\n",
+    "   \n",
+    "# Looking up a word that is not in the model yields a default frequency of 1,\n",
+    "# so its prior probability is never 0.\n",
+    "def train(features):\n",
+    "    \"\"\"Count how many times each word occurs in `features`.\n",
+    "\n",
+    "    Returns a defaultdict mapping each word to its occurrence count;\n",
+    "    indexing it with a missing key yields 1.\n",
+    "    \"\"\"\n",
+    "    frequencies = collections.defaultdict(lambda: 1)\n",
+    "    # Counter keeps first-seen order, so keys are inserted exactly as a\n",
+    "    # manual counting loop would insert them.\n",
+    "    frequencies.update(collections.Counter(features))\n",
+    "    return frequencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All corpus words at edit distance 1 from `word`.\n",
+    "def edits1(word):\n",
+    "    \"\"\"Return the distinct corpus words whose edit distance to `word` is exactly 1.\"\"\"\n",
+    "    # Must be the real corpus path (the one known() reads); a placeholder\n",
+    "    # such as 'a' raises FileNotFoundError.\n",
+    "    corpus = 'E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'\n",
+    "    vocabulary = train(readWords(corpus))\n",
+    "    # Dict keys are already unique, so the result needs no de-duplication.\n",
+    "    return [candidate for candidate in vocabulary if ed(word, candidate) == 1]\n",
+    "\n",
+    "# All corpus words at edit distance 2 from `word`.\n",
+    "def edits2(word):\n",
+    "    \"\"\"Return the distinct corpus words whose edit distance to `word` is exactly 2.\"\"\"\n",
+    "    # Must be the real corpus path (the one known() reads); a placeholder\n",
+    "    # such as 'a' raises FileNotFoundError.\n",
+    "    corpus = 'E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'\n",
+    "    vocabulary = train(readWords(corpus))\n",
+    "    # Dict keys are already unique, so the result needs no de-duplication.\n",
+    "    return [candidate for candidate in vocabulary if ed(word, candidate) == 2]\n",
+    "def ed(word1, word2):\n",
+    "    \"\"\"Return the Levenshtein distance between `word1` and `word2`.\n",
+    "\n",
+    "    Insertions, deletions and substitutions each cost 1.\n",
+    "    \"\"\"\n",
+    "    # previous[col] is the distance between the already-processed prefix of\n",
+    "    # word1 and word2[:col]; only two rows of the table are kept at a time.\n",
+    "    previous = [col for col in range(len(word2) + 1)]\n",
+    "    for row, char1 in enumerate(word1, start=1):\n",
+    "        current = [row]\n",
+    "        for col, char2 in enumerate(word2, start=1):\n",
+    "            substitution = previous[col - 1] + (0 if char1 == char2 else 1)\n",
+    "            deletion = previous[col] + 1\n",
+    "            insertion = current[col - 1] + 1\n",
+    "            current.append(min(deletion, insertion, substitution))\n",
+    "        previous = current\n",
+    "    return previous[len(word2)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter out words that are not in the dictionary: keep only the input\n",
+    "# words that occur in the corpus.\n",
+    "def known(words):\n",
+    "    \"\"\"Return the words of `words` that appear in the corpus.\n",
+    "\n",
+    "    words: a whitespace-separated string, or an iterable of words.\n",
+    "    Returns a new list holding the known words in input order.\n",
+    "    \"\"\"\n",
+    "    corpus = 'E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'\n",
+    "    # Read the corpus once and keep it as a set for O(1) membership tests\n",
+    "    # instead of re-reading the file for every word.\n",
+    "    vocabulary = set(readWords(corpus))\n",
+    "    candidates = words.split() if isinstance(words, str) else list(words)\n",
+    "    # Build a new list: removing items from a list while iterating over it\n",
+    "    # skips the element that follows each removal.\n",
+    "    return [candidate for candidate in candidates if candidate in vocabulary]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Takes a single word: returns None when the corpus contains it; otherwise\n",
+    "# prints the edit-distance-1 and edit-distance-2 candidates.\n",
+    "def correct(word):\n",
+    "    \"\"\"Print correction candidates for `word` unless known(word) is non-empty.\n",
+    "\n",
+    "    Always returns None.\n",
+    "    \"\"\"\n",
+    "    matches = known(word)\n",
+    "    if len(matches) > 0:\n",
+    "        return None\n",
+    "    # Distance-1 candidates are computed and printed before distance-2 ones.\n",
+    "    for finder in (edits1, edits2):\n",
+    "        print(finder(word))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'a'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-18-a678357b20c2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorrect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"het\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorrect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"annd\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m<ipython-input-17-1e542d42bc49>\u001b[0m in \u001b[0;36mcorrect\u001b[1;34m(word)\u001b[0m\n\u001b[0;32m      4\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m         \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0medits1\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m         \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0medits2\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m<ipython-input-13-9b2d397f8757>\u001b[0m in \u001b[0;36medits1\u001b[1;34m(word)\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 编辑距离为1的所有单词\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0medits1\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m     \u001b[0mlist\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreadWords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m     \u001b[0medits1_words\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0ml\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m<ipython-input-11-ddb581f2d976>\u001b[0m in \u001b[0;36mreadWords\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 提取语料库中的所有单词并且转化为小写\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreadWords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m     \u001b[0mfile\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mch\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;34m'''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m         \u001b[0mfile\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'a'"
+     ]
+    }
+   ],
+   "source": [
+    "print(correct(\"het\"))\n",
+    "print(correct(\"annd\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/homework2/bayes_train_text.txt
+++ b/homework2/bayes_train_text.txt
--- a/homework2/spell_correct.ipynb
+++ b/homework2/spell_correct.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import collections #可以使用.defaultdict给字典变量设置一个默认值"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract every word of the corpus, lower-cased.\n",
+    "def readWords(text=\"E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt\"):\n",
+    "    \"\"\"Return the lower-cased, punctuation-free words of a text file.\n",
+    "\n",
+    "    text: path of the file to read; it defaults to the homework corpus, so\n",
+    "        zero-argument calls read the same file as before.\n",
+    "    Returns a list of whitespace-separated tokens.\n",
+    "    \"\"\"\n",
+    "    punctuation = '''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\n",
+    "    # The context manager closes the handle even if reading fails.\n",
+    "    with open(text) as corpus:\n",
+    "        content = corpus.read().lower()\n",
+    "    # A single translate() pass deletes every punctuation character.\n",
+    "    return content.translate(str.maketrans('', '', punctuation)).split()\n",
+    "   \n",
+    "# Looking up a word that is not in the model yields a default frequency of 1,\n",
+    "# so its prior probability is never 0.\n",
+    "def train(features):\n",
+    "    \"\"\"Count how many times each word occurs in `features`.\n",
+    "\n",
+    "    Returns a defaultdict mapping each word to its occurrence count;\n",
+    "    indexing it with a missing key yields 1.\n",
+    "    \"\"\"\n",
+    "    frequencies = collections.defaultdict(lambda: 1)\n",
+    "    # Counter keeps first-seen order, so keys are inserted exactly as a\n",
+    "    # manual counting loop would insert them.\n",
+    "    frequencies.update(collections.Counter(features))\n",
+    "    return frequencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All corpus words at edit distance 1 from `word`.\n",
+    "def edits1(word):\n",
+    "    \"\"\"Return the distinct corpus words whose edit distance to `word` is exactly 1.\"\"\"\n",
+    "    vocabulary = train(readWords())\n",
+    "    # Dict keys are already unique, so the result needs no de-duplication\n",
+    "    # (a per-item `not in` test on the result list would be quadratic).\n",
+    "    return [candidate for candidate in vocabulary if ed(word, candidate) == 1]\n",
+    "\n",
+    "# All corpus words at edit distance 2 from `word`.\n",
+    "def edits2(word):\n",
+    "    \"\"\"Return the distinct corpus words whose edit distance to `word` is exactly 2.\"\"\"\n",
+    "    vocabulary = train(readWords())\n",
+    "    # Dict keys are already unique, so the result needs no de-duplication\n",
+    "    # (a per-item `not in` test on the result list would be quadratic).\n",
+    "    return [candidate for candidate in vocabulary if ed(word, candidate) == 2]\n",
+    "def ed(word1, word2):\n",
+    "    \"\"\"Return the Levenshtein distance between `word1` and `word2`.\n",
+    "\n",
+    "    Insertions, deletions and substitutions each cost 1.\n",
+    "    \"\"\"\n",
+    "    # previous[col] is the distance between the already-processed prefix of\n",
+    "    # word1 and word2[:col]; only two rows of the table are kept at a time.\n",
+    "    previous = [col for col in range(len(word2) + 1)]\n",
+    "    for row, char1 in enumerate(word1, start=1):\n",
+    "        current = [row]\n",
+    "        for col, char2 in enumerate(word2, start=1):\n",
+    "            substitution = previous[col - 1] + (0 if char1 == char2 else 1)\n",
+    "            deletion = previous[col] + 1\n",
+    "            insertion = current[col - 1] + 1\n",
+    "            current.append(min(deletion, insertion, substitution))\n",
+    "        previous = current\n",
+    "    return previous[len(word2)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter out words that are not in the dictionary: keep only the input\n",
+    "# words that occur in the corpus.\n",
+    "def known(words):\n",
+    "    \"\"\"Return the words of `words` that appear in the corpus.\n",
+    "\n",
+    "    words: a whitespace-separated string, or an iterable of words.\n",
+    "    Returns a new list holding the known words in input order.\n",
+    "    \"\"\"\n",
+    "    # Read the corpus once and keep it as a set for O(1) membership tests\n",
+    "    # instead of re-reading the file for every word.\n",
+    "    vocabulary = set(readWords())\n",
+    "    candidates = words.split() if isinstance(words, str) else list(words)\n",
+    "    # Build a new list: removing items from a list while iterating over it\n",
+    "    # skips the element that follows each removal.\n",
+    "    return [candidate for candidate in candidates if candidate in vocabulary]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Takes a single word: returns None when the corpus contains it; otherwise\n",
+    "# reports the corpus words at edit distance 1 and 2 as correction candidates.\n",
+    "def correct(word):\n",
+    "    \"\"\"Suggest corrections for `word`.\n",
+    "\n",
+    "    Returns None when known(word) is non-empty; otherwise a 2-tuple of\n",
+    "    strings listing the edits1(word) and the edits2(word) candidates.\n",
+    "    \"\"\"\n",
+    "    if known(word):\n",
+    "        return None\n",
+    "    # The second message lists edits2() results, so its label must say\n",
+    "    # distance 2 rather than repeat the distance-1 label.\n",
+    "    return ('{}编辑距离为1的单词为：{}'.format(word, edits1(word)),\n",
+    "            '{}编辑距离为2的单词为：{}'.format(word, edits2(word)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(\"het编辑距离为1的单词为：['get', 'set', 'her', 'he', 'yet', 'hot', 'wet', 'let', 'hat', 'hes', 'met', 'hed', 'jet', 'hit', 'net', 'pet', 'bet', 'heh', 'heat', 'aet', 'hey', 'et', 'cet', 'hew', 'hem', 'hut', 'hen']\", \"het编辑距离为1的单词为：['the', 'be', 'when', 'not', 'it', 'at', 'how', 'out', 'she', 'him', 'his', 'sex', 'that', 'felt', 'but', 'has', 'they', 'had', 'led', 'me', 'see', 'lit', 'head', 'chest', 'new', 'then', 'put', 'few', 'left', 'help', 'hear', 'sheet', 'we', 'what', 'e', 't', 'eg', 'ha', 'sent', 'here', 'yes', 'bit', 'sit', 'best', 'feet', 'seat', 'men', 'hum', 'them', 'next', 'st', 'chat', 'sat', 'lent', 'went', 'neat', 'shot', 'eat', 'meet', 'ten', 'hurt', 'hell', 'heart', 'red', 'key', 'got', 'nest', 'rest', 'nee', 'held', 'wit', 'gets', 'shut', 'west', 'beg', 'beat', 'vex', 'hunt', 'heel', 'hint', 'bed', 'kept', 'cest', 'test', 'apt', 'de', 'ned', 'per', 'cent', 'fit', 'pen', 'ft', 'es', 'rat', 'tea', 'rent', 'bent', 'belt', 'tut', 'herd', 'leg', 'cut', 'jot', 'wee', 'text', 'sea', 'lee', 'act', 'web', 'art', 'h', 'den', 'heed', 'lot', 'kent', 'hate', 'heap', 'debt', 'fat', 'hats', 'eh', 'gem', 'host', 'lets', 'shed', 'jem', 'zest', 'cat', 'pets', 'pit', 'en', 'fee', 'lest', 'dew', 'etc', 'pew', 'nut', 'oct', 'hay', 'bee', 'theft', 'mat', 'hid', 'jest', 'feat', 'hers', 'fed', 'hart', 'hw', 'leo', 'poet', 'pot', 'hath', 'hj', 'whit', 'hemp', 'el', 'rev', 'ed', 'mete', 'meat', 'eb', 'ae', 'keg', 'le', 'veto', 'ge', 'jt', 'ye', 'hero', 'sect', 'wept', 'wheat', 'ghent', 'se', 'kit', 'em', 'fe', 'hoes', 'er', 'pe', 'heal', 'bt', 'mt', 'ew', 'hp', 'huts', 'hh', 'hk', 'ev', 'thee', 'bets', 'hr', 'rt', 'vest', 'wt', 'hon', 'des', 'ea', 'heir', '1st', 'ee', 'ex', 'sept', 'dec', 'rep', 'dem', 're', 'hb', '1e8', '1e', '1e1', '1e2', '1e7', '1e9', '1e3', '1e4', '1e5', '1e6', 'ut', 'oe', 'rete', 'hue', 'sets', 'diet', 'gut', 'pea', 'ham', 'hata', 'cret', 'jets', 'hip', 'hens', 'heath', 'pes', 'ext', 'wen', 'med', 'fete', 'beto', 'cher', 'shes', 'est', 'hm', 'zat', 
'ze', 'je', 've', 'zen', 'fret', 'lea', 'dat', 'ce', 'sen', 'ho', 'peg', 'cheat', 'len', 'jew', 'halt', 'vent', 'mot', 'welt', 'wat', 'hi', 'hilt', 'les', 'feu', 'tent', 'herb', 'hop', 'vot', 'melt', 'hethe', 'chef', 'ken', 'herr', 'tit', 'een', 'oer', 'sot', 'chit', 'hast', 'ne', 'cot', 'wed', 'pat', 'gee', 'hewn', 'rut', '6st', 'fez', 'ces', 'ses', 'heah', 'whew', 'nen', 'peu', 'der', 'ist', 'ney', 'hita', 'deft', 'tt', 'rec', 'sew', 'ont', 'ihe', 'hehe', 'pelt', 'hits', 'eut', 'vert', 'ke', 'vat', 'opt', 'shit', 'yep', 'bat', 'dot', 'ant', 'chew', 'hug', 'pest', 'def', 'ety', 'ben', 'yer', 'del', 'shew', 'git', 'oft', 'yea', 'feb', 'te', 'helm', 'gen', 'sez', 'dey', 'ted', 'aft', 'hetty', 'wert', 'meg', 'holt', 'eer', 'nets', 'rot', 'beth', 'http']\")\n",
+      "(\"annd编辑距离为1的单词为：['and', 'anne', 'ann', 'anna']\", \"annd编辑距离为1的单词为：['any', 'find', 'band', 'mind', 'an', 'hand', 'end', 'send', 'nod', 'wind', 'kind', 'fund', 'acid', 'amid', 'sand', 'ned', 'fond', 'aid', 'add', 'land', 'bond', '2nd', 'waned', '22nd', 'inn', 'tend', 'aunt', 'bind', 'aged', 'bend', 'annal', 'anew', 'annes', 'manned', 'fanned', 'canned', 'amend', 'lend', 'inns', 'ana', 'arid', 'hanna', 'ind', 'anus', 'acne', 'ad', 'unna', 'fand', 'anal', 'anel', 'anode', 'ante', 'und', 'enns', 'ants', 'awed', 'hind', 'pond', 'nn', 'mann', 'manna', 'mend', 'kann', 'manand', 'onand', 'annex', 'cond', 'iand', 'annoy', 'ant', 'aint', 'fanny', 'andy', 'nd', 'anon', 'wand']\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(correct(\"het\"))\n",
+    "print(correct(\"annd\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}