Commit 7c638758 by 20200519016

modify project3

parent 0a92a1fa
{
{
......@@ -129,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
......@@ -158,13 +158,13 @@
" self.en_word_dict, self.en_total_words, self.en_index_dict = self.build_dict(self.train_en)\n",
" self.cn_word_dict, self.cn_total_words, self.cn_index_dict = self.build_dict(self.train_cn)\n",
"\n",
"# # id化\n",
"# self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, self.en_word_dict, self.cn_word_dict)\n",
"# self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, self.en_word_dict, self.cn_word_dict)\n",
" # id化\n",
" self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, self.en_word_dict, self.cn_word_dict)\n",
" self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, self.en_word_dict, self.cn_word_dict)\n",
"\n",
"# # 划分batch + padding + mask\n",
"# self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)\n",
"# self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)\n",
" # 划分batch + padding + mask\n",
" self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)\n",
" self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)\n",
"\n",
" def load_data(self, path):\n",
" \"\"\"\n",
......@@ -285,7 +285,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
......
{
{
......@@ -129,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
......@@ -158,13 +158,13 @@
" self.en_word_dict, self.en_total_words, self.en_index_dict = self.build_dict(self.train_en)\n",
" self.cn_word_dict, self.cn_total_words, self.cn_index_dict = self.build_dict(self.train_cn)\n",
"\n",
"# # id化\n",
"# self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, self.en_word_dict, self.cn_word_dict)\n",
"# self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, self.en_word_dict, self.cn_word_dict)\n",
" # id化\n",
" self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, self.en_word_dict, self.cn_word_dict)\n",
" self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, self.en_word_dict, self.cn_word_dict)\n",
"\n",
"# # 划分batch + padding + mask\n",
"# self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)\n",
"# self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)\n",
" # 划分batch + padding + mask\n",
" self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)\n",
" self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)\n",
"\n",
" def load_data(self, path):\n",
" \"\"\"\n",
......@@ -219,8 +219,8 @@
" length = len(en)\n",
" \n",
" # TODO: 将翻译前(英文)数据和翻译后(中文)数据都转换为id表示的形式\n",
" out_en_ids = [[en_dict.get(w, 0) for w in sent] for sent in en]\n",
" out_cn_ids = [[cn_dict.get(w, 0) for w in sent] for sent in cn]\n",
" out_en_ids = [[en_dict.get(w) for w in sent] for sent in en]\n",
" out_cn_ids = [[cn_dict.get(w) for w in sent] for sent in cn]\n",
"\n",
"\n",
" # 构建一个按照句子长度排序的函数\n",
......@@ -285,14 +285,27 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 31,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# test load data\n",
"prepare = PrepareData('nmt/en-cn/train_mini.txt','nmt/en-cn/train_mini.txt')\n",
"# [en,cn] = prepare.load_data('nmt/en-cn/train_mini.txt')\n",
"# print(cn)"
"# print(cn)\n",
"d = {'a':3,'b':5}\n",
"d.get('a')"
]
},
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment