Project: 20200913032 / Project1
Commit 6da2b1a1, authored Nov 15, 2020 by 20200913032
Parent: 9fd4f892
Showing 1 changed file with 236 additions and 0 deletions

related.py  +236  -0  (new file, mode 100644)
import json
import numpy as np
import jieba
import string
import timeit
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
import os
# Custom print helper (parameter renamed from `str` to `label` so it no longer
# shadows the built-in)
def print_format(label, a):
    print(label + '\n{0}\n'.format(a))
# cut_list: tokenize every sentence in a list on whitespace, optionally
# stripping '?' first
def cut_list(list_input, isSpecialHandle=True):
    list_new = []
    for sentence in list_input:
        if isSpecialHandle:
            list_new.append(sentence.replace('?', '').split())
        else:
            list_new.append(sentence.split())
    return list_new

# handle_one_sentence: strip '?' and split a single sentence on spaces
def handle_one_sentence(sentence):
    return sentence.replace('?', '').split(' ')
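# Quick check of the tokenizer helpers (illustrative example, not in the
# original file): both strip the question mark before splitting.
assert cut_list(["What is AI?"]) == [['What', 'is', 'AI']]
assert handle_one_sentence("What is AI?") == ['What', 'is', 'AI']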
# Despite the name, this keeps the k LARGEST values: heapq is a min-heap, so
# once the heap holds k items, heappushpop evicts the smallest element.
def get_least_numbers_big_data(alist, k):
    max_heap = []
    length = len(alist)
    if not alist or k <= 0 or k > length:
        return
    k -= 1
    for ele in alist:
        if len(max_heap) <= k:
            heapq.heappush(max_heap, ele)
        else:
            heapq.heappushpop(max_heap, ele)
    # return list(map(lambda x: x, max_heap))
    return max_heap
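# Sanity check (illustrative, not part of the original pipeline): the three
# largest values of the input survive in the heap.
assert sorted(get_least_numbers_big_data([5, 1, 9, 3, 7], 3)) == [5, 7, 9]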
# ============================== Part 1: processing the training data: reading the file and preprocessing =======================
# Read the corpus: load the SQuAD JSON file (train-v2.0.json here) and collect
# the questions and answers into lists.
def read_corpus():
    # parse the JSON data
    # Tip 1: the "answers" list may be empty, in which case the
    # "plausible_answers" node should be used instead
    qlist = []
    alist = []
    filename = 'train-v2.0.json'
    with open(filename, 'r') as load_f:
        load_dict = json.load(load_f)
        data_list = load_dict['data']
        # len_data = len(data_list)
        # print_format("len_data", len_data)
        for data in data_list:
            paragraphs = data["paragraphs"]
            for paragraph in paragraphs:
                qas = paragraph["qas"]
                for qa in qas:
                    if "answers" in qa:
                        if len(qa["answers"]) > 0 and qa["answers"][0]["text"] is not None:
                            qlist.append(qa["question"])
                            alist.append(qa["answers"][0]["text"])
    assert len(qlist) == len(alist)  # make sure the lengths match
    return qlist, np.array(alist)
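# For reference, the SQuAD v2.0 JSON walked above is shaped roughly like this
# (field names from the public SQuAD format; a sketch, not part of the original):
# {"data": [{"title": "...",
#            "paragraphs": [{"context": "...",
#                            "qas": [{"question": "...", "id": "...",
#                                     "answers": [{"text": "...", "answer_start": 0}],
#                                     "is_impossible": false}]}]}]}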
qlist, alist = read_corpus()
word_total = [word for words_list in cut_list(qlist) for word in words_list]
word_total_unique = list(set(word_total))
# print_format("len(word_total)", len(word_total))

# count word frequencies
dict_word_count = {l: 0 for l in word_total_unique}
for value in word_total:
    dict_word_count[value] += 1
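# Equivalent, more idiomatic frequency count (alternative sketch, not the
# author's code):
# from collections import Counter
# dict_word_count = dict(Counter(word_total))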
def text_preparation(qlist):
    """
    - 1. Stop-word filtering (search online for an "english stop words list",
         which many pages provide, or use NLTK's built-in list)
    - 2. Convert to lower case: a basic normalization step
    - 3. Remove useless symbols: e.g. runs of exclamation marks (!!!) or strange tokens
    - 4. Remove very low-frequency words, e.g. words appearing fewer than 10 or
         20 times (think about how to choose the threshold)
    - 5. Number handling: after tokenization some tokens are plain numbers such
         as 44 or 415; treat them all as a single token, "#number"
    - 6. Lemmatization: do not use stemming here, because stemming can produce
         tokens that are not valid words.
    """
    stopwords = {line.rstrip().lower(): None for line in open('stopwords.txt')}
    low_freg_words = {value[0]: None for value in dict_word_count.items() if value[1] < 3}
    start = timeit.default_timer()
    qlist_new = []
    remove_punct_map = {c: None for c in string.punctuation}
    for sentence in qlist:
        sentence_new = ''
        words_list = handle_one_sentence(sentence)
        for word in words_list:
            # drop low-frequency words
            if word in low_freg_words:
                continue
            # strip all punctuation
            word = ''.join(c for c in word if c not in remove_punct_map)
            # stop-word filtering
            if word.lower() in stopwords:
                # print(word)
                continue
            # number handling
            if word.isdigit():
                word = "#number"
            # lowercase the word
            word = word.lower()
            sentence_new += word + " "
        qlist_new.append(sentence_new)
    # qlist = qlist_new
    qlist = [q for q in qlist_new if q.rstrip() != ""]
    stop = timeit.default_timer()
    print('Text preprocessing time: ', stop - start)
    return qlist
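# Example of the effect (illustrative; the exact output depends on the contents
# of stopwords.txt and on corpus frequencies):
# "When did Beyonce start becoming popular?" -> "beyonce start becoming popular "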
qlist = text_preparation(qlist)
cut_table = cut_list(qlist, isSpecialHandle=False)
word_total = [word for words_list in cut_table for word in words_list]
word_total_unique = list(set(word_total))
print_format("len(word_total_unique)", len(word_total_unique))
# ===================================== GloVe approach: begin ============================================
embeddings_index = {}
glovefile = open("glove.6B.200d.txt", "r", encoding="utf-8")
for line in glovefile:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float16')
    embeddings_index[word] = coefs
glovefile.close()
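# Each line of glove.6B.200d.txt is "<token>" followed by 200 space-separated
# floats (the published GloVe text format); float16 trades precision for memory.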
embedding_dim = 200

def get_embedding_matrix_glove(word):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        return embedding_vector[:embedding_dim]
    return np.zeros(embedding_dim)
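# Note (observation, not in the original): out-of-vocabulary words fall back to
# a zero vector, so their cosine similarity against every other word is 0 and
# the "related words" chosen for them below are effectively arbitrary ties.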
word2id, id2word = {}, {}
emd = []
for word in word_total_unique:
    if word not in word2id:
        word2id[word] = len(word2id)
        id2word[len(id2word)] = word
        emd.append(get_embedding_matrix_glove(word))
emd = np.asarray(emd)
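# Shape sanity check (illustrative, not in the original): one embedding row per
# unique word, each of length embedding_dim.
assert emd.shape == (len(word2id), embedding_dim)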
dict_related = {word: [] for word in word_total_unique}
emd_csr_matrix = scipy.sparse.csr_matrix(emd)
# convert once up front (hoisted out of the loop below, where the original
# re-converted on every iteration) so fancy indexing with top_idxs works
word_total_unique = np.array(word_total_unique)
test_count = 0
for key in dict_related.keys():
    # if test_count >= 100:
    #     break
    # test_count += 1
    word_index = word2id[key]
    # v_k = emd[index]
    # result = list(cosine_similarity(scipy.sparse.csr_matrix(v_k), emd_csr_matrix)[0])
    result = list(cosine_similarity(emd_csr_matrix[word_index], emd_csr_matrix)[0])
    top_values = sorted(get_least_numbers_big_data(result, 10), reverse=True)
    top_idxs = []
    len_result = len(result)
    dict_visited = {}
    # map each top similarity value back to its column index, skipping the word
    # itself and any index already taken (ties share the same value)
    for value in top_values:
        for i in range(len_result):
            if value == result[i] and i not in dict_visited and word_index != i:
                top_idxs.append(i)
                dict_visited[i] = True
    top_idxs = top_idxs[:10]
    dict_related[key] = list(word_total_unique[top_idxs])
# print("dict_related", dict_related)
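# Equivalent, simpler top-k selection with NumPy (alternative sketch, not the
# author's method; 'sims' and 'word_index' refer to one loop iteration above):
# sims = cosine_similarity(emd_csr_matrix[word_index], emd_csr_matrix)[0]
# order = np.argsort(-sims)
# top_idxs = [i for i in order if i != word_index][:10]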
file_store_path = 'related_words.txt'
if os.path.exists(file_store_path):
    os.remove(file_store_path)
# file = open(file_store_path, 'w')
# file.writelines('你好,\n hello。')
# file.close()
with open(file_store_path, mode='w', encoding='utf-8') as file:
    # file.writelines('你好,\n hello。')
    for item in dict_related.items():
        r_l = " ".join(word for word in item[1])
        output = '{0},{1}'.format(item[0], r_l)
        file.write(output + "\n")
# (the redundant file.close() is gone: the with-block already closes the file)
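# Reading the output back (illustrative sketch, not part of the original file):
# each line of related_words.txt is "<word>,<related-1> ... <related-10>".
related_check = {}
with open(file_store_path, encoding='utf-8') as f:
    for line in f:
        w, _, rest = line.rstrip('\n').partition(',')
        related_check[w] = rest.split()
assert len(related_check) == len(dict_related)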