20200913032 / Project1 / Commits / 6da2b1a1

Commit 6da2b1a1, authored 4 years ago by 20200913032 (parent 9fd4f892)

Showing 1 changed file with 236 additions and 0 deletions:
related.py  +236  -0  (new file, mode 100644)
import json
import numpy as np
import jieba
import string
import timeit
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
import os


# Custom print helper: prints a label, then the value on its own line.
def print_format(str, a):
    print(str + '\n{0}\n'.format(a))
# cut_list: whitespace-tokenize every sentence in list_input,
# optionally stripping '?' first.
def cut_list(list_input, isSpecialHandle=True):
    list_new = []
    for sentence in list_input:
        if isSpecialHandle:
            list_new.append(sentence.replace('?', '').split())
        else:
            list_new.append(sentence.split())
    return list_new
# handle_one_sentence
def handle_one_sentence(sentence):
    return sentence.replace('?', '').split(' ')
def get_least_numbers_big_data(alist, k):
    # Despite the variable name, heapq maintains a min-heap, so heappushpop
    # always evicts the current smallest element and the heap ends up holding
    # the k largest values of alist.
    max_heap = []
    length = len(alist)
    if not alist or k <= 0 or k > length:
        return
    k -= 1
    for ele in alist:
        if len(max_heap) <= k:
            heapq.heappush(max_heap, ele)
        else:
            heapq.heappushpop(max_heap, ele)
    # return list(map(lambda x:x, max_heap))
    return max_heap
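
# Illustrative sanity check (an added sketch, not part of the original
# script): the helper keeps the k LARGEST elements of the input list, so
# sorting its output in reverse order yields the top-k values.
assert sorted(get_least_numbers_big_data([5, 1, 9, 3, 7], 3), reverse=True) == [9, 7, 5]
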
# ============================== Part 1: processing the training data: reading the file and preprocessing =======================
# Text loading: read the data from the JSON file (train-v2.0.json here) and
# store the questions and answers in lists.
def read_corpus():
    # Parse the JSON data.
    # Tip 1: the "answers" list may be empty; in that case the
    # "plausible_answers" node should be used instead.
    qlist = []
    alist = []
    filename = 'train-v2.0.json'
    with open(filename, 'r') as load_f:
        load_dict = json.load(load_f)
        data_list = load_dict['data']
        # len_data = len(data_list)
        # print_format("len_data", len_data)
        for data in data_list:
            paragraphs = data["paragraphs"]
            for paragraph in paragraphs:
                qas = paragraph["qas"]
                for qa in qas:
                    if "answers" in qa:
                        if len(qa["answers"]) > 0 and qa["answers"][0]["text"] is not None:
                            qlist.append(qa["question"])
                            alist.append(qa["answers"][0]["text"])
    assert len(qlist) == len(alist)  # make sure the two lists have the same length
    return qlist, np.array(alist)
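
# For reference, a sketch of the input layout read_corpus walks (the standard
# SQuAD v2.0 nesting; only the fields accessed above are shown, everything
# else is omitted):
#
#     {"data": [{"paragraphs": [{"qas": [{"question": "...",
#                                         "answers": [{"text": "..."}],
#                                         "plausible_answers": [...]}]}]}]}
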
qlist, alist = read_corpus()

word_total = [word for words_list in cut_list(qlist) for word in words_list]
word_total_unique = list(set(word_total))
# print_format("len(word_total)", len(word_total))

# Count word frequencies.
dict_word_count = {l: 0 for l in word_total_unique}
for value in word_total:
    dict_word_count[value] += 1
def text_preparation(qlist):
    """
    - 1. Stop-word filtering (search for an "english stop words list" online;
         many pages provide one, or simply use the list that ships with NLTK).
    - 2. Convert to lower case: a basic normalisation step.
    - 3. Remove useless symbols, e.g. runs of exclamation marks (!!!) or other odd tokens.
    - 4. Remove very low-frequency words, e.g. words occurring fewer than 10, 20, ... times
         (think about how to choose the threshold).
    - 5. Number handling: after tokenisation some tokens are just numbers such as 44 or 415;
         treat every such number as one and the same token, defined here as "#number".
    - 6. Lemmatization: do not use stemming here, because stemming may produce
         results that are not valid words.
    """
    stopwords = {line.rstrip().lower(): None for line in open('stopwords.txt')}
    low_freg_words = {value[0]: None for value in dict_word_count.items() if value[1] < 3}
    start = timeit.default_timer()
    qlist_new = []
    remove_punct_map = {c: None for c in string.punctuation}
    for sentence in qlist:
        sentence_new = ''
        words_list = handle_one_sentence(sentence)
        for word in words_list:
            # Filter out low-frequency words.
            if word in low_freg_words:
                continue
            # Strip all punctuation characters.
            word = ''.join(c for c in word if c not in remove_punct_map)
            # Stop-word filtering.
            if word.lower() in stopwords:
                # print(word)
                continue
            # Number handling.
            if word.isdigit():
                word = word.replace(word, "#number")
            # Lower-case the word.
            word = word.lower()
            sentence_new += word + " "
        qlist_new.append(sentence_new)
    # qlist = qlist_new
    qlist = [q for q in qlist_new if q.rstrip() != ""]
    stop = timeit.default_timer()
    print('Text preprocessing time: ', stop - start)
    return qlist


qlist = text_preparation(qlist)
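
# Illustrative example of the preprocessing above (an added note, assuming
# "how", "did" and "in" are listed in stopwords.txt and none of the remaining
# tokens fall below the frequency threshold):
#
#     "How did Tesla die in 1943?"  ->  "tesla die #number "
#
# (note the trailing space left by the word-by-word concatenation).
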
cut_table = cut_list(qlist, isSpecialHandle=False)
word_total = [word for words_list in cut_table for word in words_list]
word_total_unique = list(set(word_total))
print_format("len(word_total_unique)", len(word_total_unique))
# ===================================== GloVe approach: start ============================================
embeddings_index = {}
glovefile = open("glove.6B.200d.txt", "r", encoding="utf-8")
for line in glovefile:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float16')
    embeddings_index[word] = coefs
glovefile.close()

embedding_dim = 200
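
# For reference (an added note): each line of glove.6B.200d.txt is expected
# to be a token followed by its 200 vector components, whitespace-separated:
#
#     <token> v1 v2 ... v200
#
# which is exactly what the split/asarray pair above relies on.
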
def get_embedding_matrix_glove(word):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        return embedding_vector[:embedding_dim]
    return np.zeros(embedding_dim)
word2id, id2word = {}, {}
emd = []
for word in word_total_unique:
    if word not in word2id:
        word2id[word] = len(word2id)
        id2word[len(id2word)] = word
        emd.append(get_embedding_matrix_glove(word))
emd = np.asarray(emd)
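
# Sanity check (an added illustration): row i of emd is the GloVe vector of
# id2word[i], so the matrix shape must match the vocabulary size and the
# chosen embedding dimension.
assert emd.shape == (len(word2id), embedding_dim)
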
dict_related = {word: [] for word in word_total_unique}
emd_csr_matrix = scipy.sparse.csr_matrix(emd)
test_count = 0
for key in dict_related.keys():
    # if test_count >= 100:
    #     break
    # test_count += 1
    word_index = word2id[key]
    # v_k = emd[index]
    # result = list(cosine_similarity(scipy.sparse.csr_matrix(v_k), emd_csr_matrix)[0])
    result = list(cosine_similarity(emd_csr_matrix[word_index], emd_csr_matrix)[0])
    top_values = sorted(get_least_numbers_big_data(result, 10), reverse=True)
    top_idxs = []
    len_result = len(result)
    dict_visited = {}
    for value in top_values:
        for i in range(len_result):
            if value == result[i] and i not in dict_visited and word_index != i:
                top_idxs.append(i)
                dict_visited[i] = True
    top_idxs = top_idxs[:10]
    word_total_unique = np.array(word_total_unique)
    dict_related[key] = list(word_total_unique[top_idxs])
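
# Note (an alternative sketch, not the original approach): the nested search
# above maps the top similarity values back to their indices; the same top-10
# neighbour indices could be obtained more directly with numpy, e.g.
#
#     sims = cosine_similarity(emd_csr_matrix[word_index], emd_csr_matrix)[0]
#     top_idxs = [i for i in np.argsort(-sims) if i != word_index][:10]
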
# print("dict_related", dict_related)
file_store_path = 'related_words.txt'
if os.path.exists(file_store_path):
    os.remove(file_store_path)
# file = open(file_store_path, 'w')
# file.writelines('你好,\n hello。')
# file.close()
with open(file_store_path, mode='w', encoding='utf-8') as file:
    # file.writelines('你好,\n hello。')
    for item in dict_related.items():
        r_l = " ".join(word for word in item[1])
        output = '{0},{1}'.format(item[0], r_l)
        file.write(output + "\n")
file.close()
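
# Each line of related_words.txt therefore has the form
#
#     <word>,<related_1> <related_2> ... <related_10>
#
# i.e. the query word, a comma, then its ten nearest GloVe neighbours
# separated by single spaces.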