Commit 605d661f by 20200519027

Project 1 补交

parents
import numpy as np
import heapq
# Load the GloVe embeddings into a dict: key = word, value = its vector.
# The `with` block guarantees the file is closed once loading finishes.
emb = {}
with open("glove.6B.200d.txt", "r", encoding="utf-8") as glovefile:
    for line in glovefile:
        # Each record is "<word> <v1> <v2> ..."; split off the word only.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            # Blank or vector-less lines cannot be unpacked; skip them.
            continue
        word, vec = parts
        emb[word] = np.fromstring(vec, 'float', sep=' ')

# Vocabulary as a list, in the order the words were inserted into `emb`.
related = list(emb)
# Compute the similarity between two words from their vectors.
def cos_sim(x, y):
    """Return the cosine similarity between the vectors ``x`` and ``y``.

    Args:
        x: NumPy vector.
        y: NumPy vector of the same length as ``x``.

    Returns:
        ``x . y / (|x| * |y|)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise be NaN,
        which cannot be ordered against other similarities).
    """
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        return 0.0
    return x.dot(y) / denom
# For every vocabulary word, find its 10 most similar words and write one
# line per word to related.txt: "<word> <neighbour1> ... <neighbour10>".
# NOTE: every word is compared with every other word, so the running time
# grows quadratically with the vocabulary size.
total = len(related)
last_decile = 0  # last progress step (in tens of percent) already reported
for i, target in enumerate(related):
    target_vec = emb[target]
    # Highest cosine similarity means most related, so take the largest
    # scores; results come back ordered from most to least similar.
    related_words = heapq.nlargest(
        10,
        ((cos_sim(target_vec, vec2), word2)
         for word2, vec2 in emb.items() if word2 != target),
    )
    # Replace the bare word with its finished output line.
    related[i] = ' '.join([target] + [word for _, word in related_words])

    # Report progress once per completed 10% rather than on every iteration.
    decile = (i + 1) * 10 // total
    if decile != last_decile:
        last_decile = decile
        print("Finish %d %%" % (decile * 10))

# The embeddings were read as UTF-8, so write the words back the same way.
with open('related.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % line for line in related)
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment