Commit 90fbfa9a by 20200519052

添加了代码文件

parent 1e077ad7
import heapq
import numpy as np
# Solve the Top-K problem with a heap-based priority queue.
def topk(inputs, k):
    """Return the indices of the ``k`` largest values in ``inputs``.

    Each value is ranked together with its own position, so the returned
    indices always refer to the selected values. (Keeping values and indices
    in two separate heaps lets them drift apart, because each heap evicts its
    own minimum.)

    Args:
        inputs: Iterable of mutually comparable values.
        k: Maximum number of indices to return.

    Returns:
        A list of at most ``k`` indices into ``inputs``, ordered from the
        largest value to the smallest. Equal values keep their original
        relative order (earlier index first). An empty list is returned
        when ``k <= 0`` or ``inputs`` is empty.
    """
    if k <= 0:
        return []
    # nlargest is documented as equivalent to
    # sorted(iterable, key=key, reverse=True)[:k], which is stable for ties.
    best = heapq.nlargest(k, enumerate(inputs), key=lambda pair: pair[1])
    return [index for index, _ in best]
# Rank the vocabulary by GloVe-vector cosine similarity to one word and keep
# the top-k most similar entries.
def get_top_glove_results(vec, emb, vocab, k=10):
    """Return the ``k`` vocabulary entries most cosine-similar to ``vec``.

    Args:
        vec: 1-D query vector.
        emb: 2-D matrix used as ``np.dot(vec, emb)``, i.e. one candidate
            vector per column (the script in this file passes ``emb.T``).
        vocab: NumPy array that is indexed with the list of selected column
            indices; presumably ``vocab[i]`` names column ``i`` of ``emb``
            (verify against the caller).
        k: Number of entries to return. Defaults to 10.

    Returns:
        ``vocab[top_idxs]``, where ``top_idxs`` is the index list produced
        by ``topk`` for the similarity scores.
    """
    dots = np.dot(vec, emb)
    # Cosine similarity needs each column's own norm (axis=0). The norm of
    # the whole matrix is a single constant and would reduce the score to a
    # scaled dot product that favours long vectors.
    denom = np.linalg.norm(vec) * np.linalg.norm(emb, axis=0)
    # Zero-length vectors have no direction; score them 0 instead of
    # producing NaN/inf, which would break comparison-based ranking.
    cos = np.divide(
        dots,
        denom,
        out=np.zeros(dots.shape, dtype=float),
        where=denom != 0,
    )
    top_idxs = topk(list(cos), k)  # indices of the highest-similarity columns
    return vocab[top_idxs]
# Load the word -> id mapping and the embedding matrix.
with open('./data/word2id.txt', 'r') as mapping_file:
    # NOTE(review): eval() executes whatever the file contains. If the file
    # is a plain dict literal, ast.literal_eval would be a safer parser.
    word2id = eval(mapping_file.read())
emb = np.loadtxt('./data/embedding.txt', delimiter=',')
vocab = np.array(list(word2id.keys()))

# For every word, look up its embedding row (the id is used as a 1-based row
# number) and collect its most similar vocabulary entries.
# NOTE(review): vocab follows dict order while rows are selected by id - 1;
# the two only agree if word2id lists ids 1..N in order. Confirm.
res = {
    word: list(get_top_glove_results(emb[word_id - 1, :], emb.T, vocab))
    for word, word_id in word2id.items()
}

# Save the results.
with open('related_words.txt', 'w') as out_file:
    out_file.write(str(res))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment