related.py 983 Bytes
Newer Older
20200519027 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45

import numpy as np
import heapq

# Load the GloVe embeddings into a dict: key = word, value = its vector.
emb = {}
# The context manager guarantees the file is closed once it has been read.
with open("glove.6B.200d.txt", "r", encoding="utf-8") as glovefile:
    for line in glovefile:
        # Skip blank lines, which would otherwise break the two-value unpack.
        if not line.strip():
            continue
        # The first whitespace-separated token is the word; the remainder of
        # the line is parsed as space-separated floats.
        word, vec = line.split(maxsplit=1)
        vec = np.fromstring(vec, dtype=float, sep=' ')
        emb[word] = vec

# One entry per word in `emb`, in the dict's iteration order. Each entry
# starts out as the bare word.
related = list(emb)


# Compute the similarity between two words

def cos_sim(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Args:
        x: First vector; must provide ``dot`` (e.g. a NumPy array).
        y: Second vector, with a shape compatible with ``x.dot``.

    Returns:
        ``x . y / (|x| * |y|)``, or ``0.0`` when either vector has zero
        norm, where the ratio is undefined.
    """
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        # 0/0 would produce NaN (plus a RuntimeWarning), and NaN cannot be
        # ordered meaningfully when similarities are ranked.
        return 0.0
    return x.dot(y) / denom



# For every word, find the TOP_K most similar *other* words by cosine
# similarity and append them to that word's entry in `related`, so each entry
# becomes "word neighbour1 ... neighbourK" (most similar first).
# NOTE: this compares every word against every other word, so the running
# time grows quadratically with the vocabulary size.
TOP_K = 10
total = len(related)
# Progress is reported each time another 10% of the words has been processed.
next_report = 10

for i, word1 in enumerate(related):
    vec1 = emb[word1]
    # nlargest keeps only the TOP_K highest (similarity, word) pairs, i.e. the
    # closest neighbours, without materialising a heap of all pairs.
    related_words = heapq.nlargest(
        TOP_K,
        (
            (cos_sim(vec1, vec2), word2)
            for word2, vec2 in emb.items()
            if word2 != word1
        ),
    )
    related[i] = ' '.join([word1] + [word2 for _, word2 in related_words])

    percent = (i + 1) * 100 // total
    if percent >= next_report:
        # "%%" is a literal percent sign in %-formatting.
        print("Finish %d %%" % percent)
        # Advance to the next multiple of 10 above the current percentage.
        next_report = percent - percent % 10 + 10


# Write one line per entry of `related`.
# The encoding is set explicitly: the vocabulary was read as UTF-8, and the
# platform's default encoding may be unable to represent some of its tokens.
with open('related.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % entry for entry in related)