Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
project
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
20200913012
project
Commits
4a0ee0c4
Commit
4a0ee0c4
authored
Nov 06, 2020
by
zbh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tfidf search
parent
dd84525e
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
315 additions
and
67 deletions
+315
-67
Project1/starter_code.ipynb
+315
-67
No files found.
Project1/starter_code.ipynb
View file @
4a0ee0c4
...
...
@@ -101,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count":
3
,
"execution_count":
139
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -110,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count":
9
,
"execution_count":
152
,
"metadata": {},
"outputs": [
{
...
...
@@ -119,7 +119,7 @@
"(86821, 86821)"
]
},
"execution_count":
9
,
"execution_count":
152
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -164,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 1
0
,
"execution_count": 1
53
,
"metadata": {},
"outputs": [
{
...
...
@@ -174,7 +174,7 @@
" ['England'])"
]
},
"execution_count": 1
0
,
"execution_count": 1
53
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -197,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 1
1
,
"execution_count": 1
54
,
"metadata": {},
"outputs": [
{
...
...
@@ -226,7 +226,7 @@
},
{
"cell_type": "code",
"execution_count": 1
2
,
"execution_count": 1
55
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -240,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 1
3
,
"execution_count": 1
56
,
"metadata": {},
"outputs": [
{
...
...
@@ -276,7 +276,7 @@
},
{
"cell_type": "code",
"execution_count": 1
4
,
"execution_count": 1
57
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -303,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 1
6
,
"execution_count": 1
58
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -339,34 +339,45 @@
" chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]\n",
" return unicodedata.normalize('NFC', ''.join(chars))\n",
"\n",
"for i in range(len(qlist)):\n",
"def qlist_preprocess(qlist):\n",
" for i in range(len(qlist)):\n",
"\n",
" ques = re.sub('\\d+', '#number', qlist[i]) # 数字变成统一字符\n",
" ques = re.sub(\"\\'.\", '', ques) # 's, 'm 等过滤\n",
" ques = ques.replace('-', ' ')\n",
" \n",
" # ques = word_tokenize(re.sub('[!.?,]+', '', ques)) # 去除标点, 分词\n",
" ques = re.sub('[!.?,\\\"]+', '', ques).split() # 去除标点, 分词\n",
" \n",
" ques = [i.lower() for i in ques] # 转小写\n",
" ques = re.sub('\\d+', '#number', qlist[i]) # 数字变成统一字符\n",
" ques = re.sub(\"\\'.\", '', ques) # 's, 'm 等过滤\n",
" ques = ques.replace('-', ' ')\n",
"\n",
" # lemmatization\n",
" tagged_sent = pos_tag(ques)\n",
" ques = [wnl.lemmatize(tag[0], pos=get_wordnet_pos(tag[1])) for tag in tagged_sent]\n",
" # ques = word_tokenize(re.sub('[!.?,]+', '', ques)) # 去除标点, 分词\n",
" ques = re.sub('[!.?,\\\"]+', '', ques).split() # 去除标点, 分词\n",
"\n",
" ques = [i for i in ques if i not in stop_words] # 去停用词\n",
" \n",
" # deaccent\n",
" ques = [strip_accents(i) for i in ques]\n",
" ques = [i.lower() for i in ques] # 转小写\n",
"\n",
" # lemmatization\n",
" tagged_sent = pos_tag(ques)\n",
" ques = [wnl.lemmatize(tag[0], pos=get_wordnet_pos(tag[1])) for tag in tagged_sent]\n",
"\n",
"
qlist[i] = ques
\n",
"
ques = [i for i in ques if i not in stop_words] # 去停用词
\n",
"\n",
"# qlist = # 更新后的问题列表"
" # deaccent\n",
" ques = [strip_accents(i) for i in ques]\n",
"\n",
" qlist[i] = ques\n",
" return qlist\n",
"\n",
" # qlist = # 更新后的问题列表"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"qlist = qlist_preprocess(qlist)"
]
},
{
"cell_type": "code",
"execution_count":
17
,
"execution_count":
202
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -397,20 +408,32 @@
},
{
"cell_type": "code",
"execution_count":
19
,
"execution_count":
211
,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 467600 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 211,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TODO \n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer() # 定义一个tf-idf的vectorizer\n",
"
X =
vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
"X_tfidf = X # 结果存放在X矩阵里"
"vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
"
#
X_tfidf = X # 结果存放在X矩阵里"
]
},
{
"cell_type": "code",
"execution_count": 2
0
,
"execution_count": 2
12
,
"metadata": {},
"outputs": [
{
...
...
@@ -419,7 +442,7 @@
"(86821, 30608)"
]
},
"execution_count": 2
0
,
"execution_count": 2
12
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -429,6 +452,26 @@
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.9943043238027367"
]
},
"execution_count": 215,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sum(X_tfidf[0].A)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
...
...
@@ -570,7 +613,9 @@
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
...
...
@@ -704,88 +749,290 @@
},
{
"cell_type": "code",
"execution_count": 136,
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# TODO 基于BERT的句子向量计算\n",
"\n",
"X_bert = results # 每一个句子的向量结果存放在X_bert矩阵里。行数为句子的总个数,列数为一个句子embedding大小。 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 第三部分: 相似度匹配以及搜索\n",
"在这部分里,我们需要把用户每一个输入跟知识库里的每一个问题做一个相似度计算,从而得出最相似的问题。但对于这个问题,时间复杂度其实很高,所以我们需要结合倒排表来获取相似度最高的问题,从而获得答案。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.1 tf-idf + 余弦相似度\n",
"我们可以直接基于计算出来的``tf-idf``向量,计算用户最新问题与库中存储的问题之间的相似度,从而选择相似度最高的问题的答案。这个方法的复杂度为``O(N)``, ``N``是库中问题的个数。"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
"query = \"'beyonce', 'start', 'become, popular?\"\n",
"q_list = qlist_preprocess([query])"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['eyonce', 'tart', 'ecome', 'popular']]"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q_list"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"ques_tfidf = vectorizer.transform([' '.join(i) for i in q_list])"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"
(86821, 768
)"
"
array([[0., 0., 0., ..., 0., 0., 0.]]
)"
]
},
"execution_count":
13
6,
"execution_count":
20
6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"
results.shape
"
"
ques_tfidf.A
"
]
},
{
"cell_type": "code",
"execution_count":
13
7,
"execution_count":
20
7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.3757278 , -0.36750692, 0.44403127, ..., -0.29200172,\n",
" 0.22539857, -0.57711858],\n",
" [ 0.08952965, -0.33501336, 0.476886 , ..., -0.01217865,\n",
" -0.0762298 , -0.18508096],\n",
" [ 0.25395563, -0.22788545, 0.83195764, ..., -0.30400619,\n",
" 0.0784178 , -0.79656309],\n",
"array([[0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" ...,\n",
" [-0.02081348, -0.13692667, -0.27600896, ..., -0.26923919,\n",
" 0.11144835, -0.08454084],\n",
" [-0.47825322, 0.26795244, 0.15068491, ..., -0.79847699,\n",
" 0.37375435, -0.31773022],\n",
" [-0.43065107, 0.05200777, -0.23492971, ..., -0.10056096,\n",
" 0.03602708, 0.12411974]])"
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count":
13
7,
"execution_count":
20
7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"
results
"
"
X_tfidf.A
"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<86821x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 467600 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_tfidf"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"# TODO 基于BERT的句子向量计算\n",
"\n",
"X_bert = # 每一个句子的向量结果存放在X_bert矩阵里。行数为句子的总个数,列数为一个句子embedding大小。 "
"arr = ques_tfidf.A\n",
"brr = X_tfidf.A\n",
"def vector_matrix(arr, brr):\n",
" return arr.dot(brr.T) / (np.sqrt(np.sum(arr*arr)) * np.sqrt(np.sum(brr*brr, axis=1)))"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/zbh/anaconda3/envs/py37/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide\n",
" after removing the cwd from sys.path.\n"
]
},
{
"data": {
"text/plain": [
"array([[nan, nan, nan, ..., nan, nan, nan]])"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### 第三部分: 相似度匹配以及搜索\n",
"在这部分里,我们需要把用户每一个输入跟知识库里的每一个问题做一个相似度计算,从而得出最相似的问题。但对于这个问题,时间复杂度其实很高,所以我们需要结合倒排表来获取相似度最高的问题,从而获得答案。"
"vector_matrix(arr, brr)"
]
},
{
"cell_type": "code",
"execution_count":
null
,
"execution_count":
192
,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arr.A"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<1x86821 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 0 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ques_tfidf.dot(X_tfidf.T)"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<1x30608 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 0 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[84, 84, 36, 54]])"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#### 3.1 tf-idf + 余弦相似度\n",
"我们可以直接基于计算出来的``tf-idf``向量,计算用户最新问题与库中存储的问题之间的相似度,从而选择相似度最高的问题的答案。这个方法的复杂度为``O(N)``, ``N``是库中问题的个数。"
"a = np.array([[1, 2, 1, 2, 3, 5, 6, 2]])\n",
"a.dot(b.T)"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 2., 1., 2., 3., 5., 6., 2.],\n",
" [1., 2., 1., 2., 3., 5., 6., 2.],\n",
" [1., 2., 1., 2., 3., 5., 6., 2.],\n",
" [1., 2., 1., 2., 3., 5., 6., 2.]])"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ones((4,1)).dot(a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer() # 定义一个tf-idf的vectorizer\n",
"X = vectorizer.fit_transform([' '.join(i) for i in qlist]) \n",
"X_tfidf = X # 结果存放在X矩阵里"
]
},
{
...
...
@@ -803,6 +1050,7 @@
" 3. 找出相似度最高的top5问题的答案\n",
" \"\"\"\n",
" \n",
" \n",
" top_idxs = [] # top_idxs存放相似度最高的(存在qlist里的)问题的下标 \n",
" # hint: 请使用 priority queue来找出top results. 思考为什么可以这么做? \n",
" \n",
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment