Commit bc857197 by 20200116311

hw2

parent ccfb41ea
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import collections #可以使用.defaultdict给字典变量设置一个默认值"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Read a corpus file and return all of its words, lower-cased.\n",
"def readWords(text):\n",
"    \"\"\"Return the lower-cased words of the file at path ``text``.\n",
"\n",
"    The punctuation characters listed below are deleted first, then the\n",
"    remaining text is split on whitespace.\n",
"    \"\"\"\n",
"    punctuation = '''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\n",
"    # The context manager closes the handle that a bare open() call leaked.\n",
"    with open(text) as corpus:\n",
"        content = corpus.read().lower()\n",
"    # Every entry is a single character, so one translate() pass deletes them all.\n",
"    return content.translate(str.maketrans('', '', punctuation)).split()\n",
" \n",
"# Count how often each word occurs; a word that was never seen reports 1.\n",
"def train(features):\n",
"    \"\"\"Build a word-frequency table from the iterable ``features``.\n",
"\n",
"    Returns a ``defaultdict`` mapping each word to its number of occurrences.\n",
"    Indexing a word that is absent yields 1 instead of raising ``KeyError``,\n",
"    which keeps the prior probability of unseen words above zero.\n",
"    \"\"\"\n",
"    model = collections.defaultdict(lambda: 1)\n",
"    # Counter tallies the occurrences; update() copies them in first-seen order.\n",
"    model.update(collections.Counter(features))\n",
"    return model"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Corpus scanned for candidate corrections: the same file that known() reads.\n",
"# The earlier placeholder path 'a' does not exist and raised FileNotFoundError.\n",
"_CORPUS_PATH = 'E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'\n",
"\n",
"\n",
"def _words_at_distance(word, distance, corpus_path):\n",
"    \"\"\"Return the distinct corpus words exactly ``distance`` edits away from ``word``.\"\"\"\n",
"    # The keys produced by train() are already unique, so no extra\n",
"    # de-duplication scan over the result list is needed.\n",
"    vocabulary = train(readWords(corpus_path))\n",
"    return [candidate for candidate in vocabulary if ed(word, candidate) == distance]\n",
"\n",
"\n",
"# All corpus words at edit distance 1 from the given word.\n",
"def edits1(word, corpus_path=_CORPUS_PATH):\n",
"    \"\"\"Return the corpus words whose edit distance to ``word`` is exactly 1.\n",
"\n",
"    ``corpus_path`` is the corpus file to scan; it defaults to ``_CORPUS_PATH``.\n",
"    The words are returned in the order they first appear in the corpus.\n",
"    \"\"\"\n",
"    return _words_at_distance(word, 1, corpus_path)\n",
"\n",
"\n",
"# All corpus words at edit distance 2 from the given word.\n",
"def edits2(word, corpus_path=_CORPUS_PATH):\n",
"    \"\"\"Return the corpus words whose edit distance to ``word`` is exactly 2.\n",
"\n",
"    ``corpus_path`` is the corpus file to scan; it defaults to ``_CORPUS_PATH``.\n",
"    The words are returned in the order they first appear in the corpus.\n",
"    \"\"\"\n",
"    return _words_at_distance(word, 2, corpus_path)\n",
"\n",
"\n",
"def ed(word1, word2):\n",
"    \"\"\"Return the Levenshtein edit distance between ``word1`` and ``word2``.\n",
"\n",
"    Insertions, deletions and substitutions each cost 1.\n",
"    \"\"\"\n",
"    # previous[col] is the distance between the consumed prefix of word1\n",
"    # and word2[:col]; only two rows of the table are kept at a time.\n",
"    previous = list(range(len(word2) + 1))\n",
"    for row, char1 in enumerate(word1, start=1):\n",
"        current = [row]\n",
"        for col, char2 in enumerate(word2, start=1):\n",
"            substitution = previous[col - 1] + (char1 != char2)\n",
"            current.append(min(previous[col] + 1, current[col - 1] + 1, substitution))\n",
"        previous = current\n",
"    return previous[-1]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the words that occur in the corpus.\n",
"def known(words):\n",
"    \"\"\"Return the words of ``words`` that appear in the training corpus.\n",
"\n",
"    ``words`` may be a whitespace-separated string or an iterable of words.\n",
"    The result is a new list that keeps the input order.\n",
"    \"\"\"\n",
"    candidates = words.split() if isinstance(words, str) else list(words)\n",
"    # Read the corpus once; the old loop re-read the whole file for every word.\n",
"    vocabulary = set(readWords('E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'))\n",
"    # Filtering into a new list avoids removing items from the list being\n",
"    # iterated, which made the old loop skip the word after each removal.\n",
"    return [candidate for candidate in candidates if candidate in vocabulary]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Takes one word: nothing happens when the corpus contains it, otherwise the\n",
"# candidate corrections found by edit distance are printed.\n",
"def correct(word):\n",
"    \"\"\"Print the corpus words at edit distance 1 and then 2 when ``word`` is unknown.\n",
"\n",
"    Nothing is printed for a word that occurs in the corpus. The function\n",
"    returns ``None`` in both cases.\n",
"    \"\"\"\n",
"    if known(word):\n",
"        return None\n",
"    # Each candidate list is computed and printed before the next one is computed.\n",
"    for find_candidates in (edits1, edits2):\n",
"        print(find_candidates(word))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'a'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-18-a678357b20c2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorrect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"het\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorrect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"annd\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-17-1e542d42bc49>\u001b[0m in \u001b[0;36mcorrect\u001b[1;34m(word)\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0medits1\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0medits2\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-13-9b2d397f8757>\u001b[0m in \u001b[0;36medits1\u001b[1;34m(word)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 编辑距离为1的所有单词\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0medits1\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mlist\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreadWords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0medits1_words\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ml\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-11-ddb581f2d976>\u001b[0m in \u001b[0;36mreadWords\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 提取语料库中的所有单词并且转化为小写\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreadWords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mfile\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mch\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;34m'''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mfile\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'a'"
]
}
],
"source": [
"# Try the corrector on two misspelled words.\n",
"for misspelling in ('het', 'annd'):\n",
"    print(correct(misspelling))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import collections #可以使用.defaultdict给字典变量设置一个默认值"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Read the training corpus and return all of its words, lower-cased.\n",
"def readWords(path='E:\\\\git_project\\\\course-info\\\\课件\\\\homework\\\\homework2\\\\bayes_train_text.txt'):\n",
"    \"\"\"Return the lower-cased words of the corpus file at ``path``.\n",
"\n",
"    ``path`` is optional and defaults to the course training corpus, so\n",
"    existing ``readWords()`` calls keep working. The punctuation characters\n",
"    listed below are deleted first, then the text is split on whitespace.\n",
"    \"\"\"\n",
"    punctuation = '''`~!@#$%^&*()_+-={}|[]\\\\:\"?>”<;'“—‘’.…/,'''\n",
"    # The context manager closes the handle that a bare open() call leaked.\n",
"    with open(path) as corpus:\n",
"        content = corpus.read().lower()\n",
"    # Every entry is a single character, so one translate() pass deletes them all.\n",
"    return content.translate(str.maketrans('', '', punctuation)).split()\n",
" \n",
"# Count how often each word occurs; a word that was never seen reports 1.\n",
"def train(features):\n",
"    \"\"\"Build a word-frequency table from the iterable ``features``.\n",
"\n",
"    Returns a ``defaultdict`` mapping each word to its number of occurrences.\n",
"    Indexing a word that is absent yields 1 instead of raising ``KeyError``,\n",
"    which keeps the prior probability of unseen words above zero.\n",
"    \"\"\"\n",
"    model = collections.defaultdict(lambda: 1)\n",
"    # Counter tallies the occurrences; update() copies them in first-seen order.\n",
"    model.update(collections.Counter(features))\n",
"    return model"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def _words_at_distance(word, distance):\n",
"    \"\"\"Return the distinct corpus words exactly ``distance`` edits away from ``word``.\"\"\"\n",
"    # The keys produced by train() are already unique, so no extra\n",
"    # de-duplication scan over the result list is needed.\n",
"    vocabulary = train(readWords())\n",
"    return [candidate for candidate in vocabulary if ed(word, candidate) == distance]\n",
"\n",
"\n",
"# All corpus words at edit distance 1 from the given word.\n",
"def edits1(word):\n",
"    \"\"\"Return the corpus words whose edit distance to ``word`` is exactly 1.\n",
"\n",
"    The words are returned in the order they first appear in the corpus.\n",
"    \"\"\"\n",
"    return _words_at_distance(word, 1)\n",
"\n",
"\n",
"# All corpus words at edit distance 2 from the given word.\n",
"def edits2(word):\n",
"    \"\"\"Return the corpus words whose edit distance to ``word`` is exactly 2.\n",
"\n",
"    The words are returned in the order they first appear in the corpus.\n",
"    \"\"\"\n",
"    return _words_at_distance(word, 2)\n",
"\n",
"\n",
"def ed(word1, word2):\n",
"    \"\"\"Return the Levenshtein edit distance between ``word1`` and ``word2``.\n",
"\n",
"    Insertions, deletions and substitutions each cost 1.\n",
"    \"\"\"\n",
"    # previous[col] is the distance between the consumed prefix of word1\n",
"    # and word2[:col]; only two rows of the table are kept at a time.\n",
"    previous = list(range(len(word2) + 1))\n",
"    for row, char1 in enumerate(word1, start=1):\n",
"        current = [row]\n",
"        for col, char2 in enumerate(word2, start=1):\n",
"            substitution = previous[col - 1] + (char1 != char2)\n",
"            current.append(min(previous[col] + 1, current[col - 1] + 1, substitution))\n",
"        previous = current\n",
"    return previous[-1]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the words that occur in the corpus.\n",
"def known(words):\n",
"    \"\"\"Return the words of ``words`` that appear in the training corpus.\n",
"\n",
"    ``words`` may be a whitespace-separated string or an iterable of words.\n",
"    The result is a new list that keeps the input order.\n",
"    \"\"\"\n",
"    candidates = words.split() if isinstance(words, str) else list(words)\n",
"    # Read the corpus once; the old loop re-read the whole file for every word.\n",
"    vocabulary = set(readWords())\n",
"    # Filtering into a new list avoids removing items from the list being\n",
"    # iterated, which made the old loop skip the word after each removal.\n",
"    return [candidate for candidate in candidates if candidate in vocabulary]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Takes one word: returns None when the corpus contains it, otherwise returns\n",
"# the candidate corrections found by edit distance.\n",
"def correct(word):\n",
"    \"\"\"Suggest corrections for ``word`` when it is not in the corpus.\n",
"\n",
"    Returns ``None`` for a known word. Otherwise returns a 2-tuple of\n",
"    messages: the first lists the corpus words at edit distance 1, the\n",
"    second those at edit distance 2.\n",
"    \"\"\"\n",
"    if known(word):\n",
"        return None\n",
"    return (\n",
"        '{}编辑距离为1的单词为:{}'.format(word, edits1(word)),\n",
"        # This message used to say distance 1 although it lists the edits2() result.\n",
"        '{}编辑距离为2的单词为:{}'.format(word, edits2(word)),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(\"het编辑距离为1的单词为:['get', 'set', 'her', 'he', 'yet', 'hot', 'wet', 'let', 'hat', 'hes', 'met', 'hed', 'jet', 'hit', 'net', 'pet', 'bet', 'heh', 'heat', 'aet', 'hey', 'et', 'cet', 'hew', 'hem', 'hut', 'hen']\", \"het编辑距离为1的单词为:['the', 'be', 'when', 'not', 'it', 'at', 'how', 'out', 'she', 'him', 'his', 'sex', 'that', 'felt', 'but', 'has', 'they', 'had', 'led', 'me', 'see', 'lit', 'head', 'chest', 'new', 'then', 'put', 'few', 'left', 'help', 'hear', 'sheet', 'we', 'what', 'e', 't', 'eg', 'ha', 'sent', 'here', 'yes', 'bit', 'sit', 'best', 'feet', 'seat', 'men', 'hum', 'them', 'next', 'st', 'chat', 'sat', 'lent', 'went', 'neat', 'shot', 'eat', 'meet', 'ten', 'hurt', 'hell', 'heart', 'red', 'key', 'got', 'nest', 'rest', 'nee', 'held', 'wit', 'gets', 'shut', 'west', 'beg', 'beat', 'vex', 'hunt', 'heel', 'hint', 'bed', 'kept', 'cest', 'test', 'apt', 'de', 'ned', 'per', 'cent', 'fit', 'pen', 'ft', 'es', 'rat', 'tea', 'rent', 'bent', 'belt', 'tut', 'herd', 'leg', 'cut', 'jot', 'wee', 'text', 'sea', 'lee', 'act', 'web', 'art', 'h', 'den', 'heed', 'lot', 'kent', 'hate', 'heap', 'debt', 'fat', 'hats', 'eh', 'gem', 'host', 'lets', 'shed', 'jem', 'zest', 'cat', 'pets', 'pit', 'en', 'fee', 'lest', 'dew', 'etc', 'pew', 'nut', 'oct', 'hay', 'bee', 'theft', 'mat', 'hid', 'jest', 'feat', 'hers', 'fed', 'hart', 'hw', 'leo', 'poet', 'pot', 'hath', 'hj', 'whit', 'hemp', 'el', 'rev', 'ed', 'mete', 'meat', 'eb', 'ae', 'keg', 'le', 'veto', 'ge', 'jt', 'ye', 'hero', 'sect', 'wept', 'wheat', 'ghent', 'se', 'kit', 'em', 'fe', 'hoes', 'er', 'pe', 'heal', 'bt', 'mt', 'ew', 'hp', 'huts', 'hh', 'hk', 'ev', 'thee', 'bets', 'hr', 'rt', 'vest', 'wt', 'hon', 'des', 'ea', 'heir', '1st', 'ee', 'ex', 'sept', 'dec', 'rep', 'dem', 're', 'hb', '1e8', '1e', '1e1', '1e2', '1e7', '1e9', '1e3', '1e4', '1e5', '1e6', 'ut', 'oe', 'rete', 'hue', 'sets', 'diet', 'gut', 'pea', 'ham', 'hata', 'cret', 'jets', 'hip', 'hens', 'heath', 'pes', 'ext', 'wen', 'med', 'fete', 'beto', 'cher', 'shes', 'est', 'hm', 'zat', 'ze', 
'je', 've', 'zen', 'fret', 'lea', 'dat', 'ce', 'sen', 'ho', 'peg', 'cheat', 'len', 'jew', 'halt', 'vent', 'mot', 'welt', 'wat', 'hi', 'hilt', 'les', 'feu', 'tent', 'herb', 'hop', 'vot', 'melt', 'hethe', 'chef', 'ken', 'herr', 'tit', 'een', 'oer', 'sot', 'chit', 'hast', 'ne', 'cot', 'wed', 'pat', 'gee', 'hewn', 'rut', '6st', 'fez', 'ces', 'ses', 'heah', 'whew', 'nen', 'peu', 'der', 'ist', 'ney', 'hita', 'deft', 'tt', 'rec', 'sew', 'ont', 'ihe', 'hehe', 'pelt', 'hits', 'eut', 'vert', 'ke', 'vat', 'opt', 'shit', 'yep', 'bat', 'dot', 'ant', 'chew', 'hug', 'pest', 'def', 'ety', 'ben', 'yer', 'del', 'shew', 'git', 'oft', 'yea', 'feb', 'te', 'helm', 'gen', 'sez', 'dey', 'ted', 'aft', 'hetty', 'wert', 'meg', 'holt', 'eer', 'nets', 'rot', 'beth', 'http']\")\n",
"(\"annd编辑距离为1的单词为:['and', 'anne', 'ann', 'anna']\", \"annd编辑距离为1的单词为:['any', 'find', 'band', 'mind', 'an', 'hand', 'end', 'send', 'nod', 'wind', 'kind', 'fund', 'acid', 'amid', 'sand', 'ned', 'fond', 'aid', 'add', 'land', 'bond', '2nd', 'waned', '22nd', 'inn', 'tend', 'aunt', 'bind', 'aged', 'bend', 'annal', 'anew', 'annes', 'manned', 'fanned', 'canned', 'amend', 'lend', 'inns', 'ana', 'arid', 'hanna', 'ind', 'anus', 'acne', 'ad', 'unna', 'fand', 'anal', 'anel', 'anode', 'ante', 'und', 'enns', 'ants', 'awed', 'hind', 'pond', 'nn', 'mann', 'manna', 'mend', 'kann', 'manand', 'onand', 'annex', 'cond', 'iand', 'annoy', 'ant', 'aint', 'fanny', 'andy', 'nd', 'anon', 'wand']\")\n"
]
}
],
"source": [
"# Try the corrector on two misspelled words.\n",
"for misspelling in ('het', 'annd'):\n",
"    print(correct(misspelling))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment