Commit aa5a6c20 by 20200203098

first project

parents 4ec16fce 68595987
++ "b/\350\257\276\344\273\266/0411Skip-gram\346\272\220\347\240\201\350\256\262\350\247\243/.gitkeep"
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 11:53:12 2020
@author: youngwells
"""
# 加载转化后的文件
import json
import codecs
from collections import defaultdict
from gensim.test.utils import datapath,get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# Build a "word -> most similar words" table from pre-trained GloVe vectors
# and export it twice: as a plain-text file and as a JSON file.

# Directory that holds the GloVe vectors and receives both output files.
BASE_DIR = "/Users/youngwells/Downloads/course-info/课件/Project1-master"
GLOVE_PATH = BASE_DIR + "/glove.6B.200d.txt"
TXT_PATH = BASE_DIR + "/related_words.txt"
JSON_PATH = BASE_DIR + "/related_words.json"

# Convert the GloVe text file into word2vec text format, then load it.
# NOTE(review): datapath() is handed an absolute path here; confirm the
# wrapper is actually needed rather than passing GLOVE_PATH directly.
glove_file = datapath(GLOVE_PATH)
tmp_file = get_tmpfile('word2vec.txt')
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

# Collect every word in the model's vocabulary.
# gensim 4.x exposes the vocabulary as `key_to_index`; older releases use
# `vocab`. Support both so the script runs on either version.
vocabulary = getattr(model, 'key_to_index', None)
if vocabulary is None:
    vocabulary = model.vocab
word_list = list(vocabulary)

# For each word keep only the neighbour words (dropping the similarity
# scores). most_similar() is called with its default result count.
similar_list = [
    [neighbour for neighbour, _score in model.most_similar(word)]
    for word in word_list
]

# Turn the two parallel lists into a dictionary.
# Each value is appended to a list, so every key maps to a one-element list
# that wraps the neighbour list; this shape is kept so existing readers of
# the output files keep working.
d = defaultdict(list)
for key, value in zip(word_list, similar_list):
    d[key].append(value)

########################## save as txt ####################################
# One line per word: the key, a space, then the string form of its value.
# The context manager closes the file even if a write fails.
with codecs.open(TXT_PATH, 'w', 'utf-8-sig') as txt_file:
    for k, v in d.items():
        txt_file.write(str(k) + ' ' + str(v) + '\n')
print('the file has been wrote')

########################## save as json ###################################
# Dump the same dictionary as JSON.
with open(JSON_PATH, 'w', encoding='utf-8') as file_object:
    json.dump(d, file_object)
print('json_file has been wrote')
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment