Retrieve.ipynb 70.4 KB

搭建倒排表

倒排表的作用是让搜索更加快速,是搜索引擎中常用的技术。根据课程中所讲的方法,你需要完成这部分的代码。

In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle
from gensim.models import KeyedVectors  # 词向量用来比较俩俩之间相似度
In [3]:
# Load the data: read data/question_answer_pares.pkl (generated in preprocessor.ipynb)
# and keep the unpickled object in the variable QApares.
with open('data/question_answer_pares.pkl','rb') as f:
    QApares = pickle.load(f)
In [4]:
# Preview the first rows of QApares.
QApares.head()
question answer question_after_preprocessing
0 买二份有没有少点呀 亲亲真的不好意思我们已经是优惠价了呢小本生意请亲谅解 [买, 二份, 有没有, 少点]
1 那就等你们处理喽 好的亲退了 [处理]
2 那我不喜欢 颜色的话一般茶刀茶针和二合一的话都是红木檀和黑木檀哦 [喜欢]
3 不是免运费 本店茶具订单满99包邮除宁夏青海内蒙古海南新疆西藏满39包邮 [免, 运费]
4 好吃吗 好吃的 [好吃]

TODO1 构造一个倒排表,不需要考虑单词的相似度

In [6]:
# Build an inverted index; see the lab manual for background on inverted indexes.
# For fast lookup the index should live in a hash table. Python dicts are hash
# tables internally, so the index is stored directly in a dict.
# Note: similarity between words is deliberately ignored at this stage.
inverted_list = {}
for index, sentence in enumerate(QApares.question_after_preprocessing):
    ### Code to complete
    for word in sentence:
        # Fetch the posting set for this word (creating it on first sight),
        # then record the position of the current question.
        inverted_list.setdefault(word, set()).add(index)
    ### End of code to complete
In [24]:
# Show the posting set stored for the word "发货".
inverted_list["发货"]
Out [24]:
{5,
 65541,
 32776,
 17,
 18,
 65554,
 29,
 65566,
 32800,
 32803,
 98339,
 32810,
 98346,
 32818,
 55,
 98366,
 64,
 65604,
 65611,
 32850,
 98387,
 98398,
 65631,
 102,
 65639,
 65640,
 65646,
 98415,
 98416,
 118,
 122,
 65659,
 125,
 32894,
 133,
 65669,
 65670,
 65671,
 142,
 65679,
 32912,
 98451,
 150,
 151,
 32929,
 65708,
 98484,
 98489,
 187,
 32957,
 200,
 32973,
 65742,
 98518,
 65755,
 220,
 223,
 65764,
 65783,
 33017,
 65786,
 254,
 65790,
 261,
 65798,
 65810,
 275,
 98586,
 65833,
 33068,
 65838,
 33073,
 65843,
 65844,
 310,
 65852,
 318,
 65862,
 65863,
 344,
 65883,
 65885,
 350,
 33120,
 364,
 98668,
 65903,
 33140,
 98678,
 65913,
 33149,
 33158,
 33163,
 33168,
 401,
 98709,
 98715,
 33189,
 33191,
 65960,
 33193,
 98740,
 33210,
 98751,
 98754,
 33220,
 453,
 98763,
 461,
 469,
 33244,
 98794,
 495,
 98803,
 33273,
 33276,
 66046,
 33290,
 530,
 33300,
 66069,
 66077,
 542,
 543,
 66081,
 66091,
 556,
 66093,
 66094,
 98860,
 66097,
 98877,
 66111,
 33346,
 579,
 33347,
 588,
 66152,
 66153,
 98932,
 66166,
 639,
 640,
 642,
 98948,
 66185,
 651,
 33422,
 98962,
 33427,
 98970,
 66207,
 673,
 33441,
 33446,
 66221,
 687,
 66224,
 33459,
 694,
 33464,
 33466,
 33468,
 99005,
 66238,
 99007,
 710,
 99017,
 718,
 99044,
 743,
 33513,
 749,
 33523,
 759,
 66299,
 99068,
 33535,
 769,
 33539,
 66307,
 66309,
 99084,
 66321,
 99090,
 66323,
 99098,
 66334,
 800,
 66337,
 33576,
 99120,
 99122,
 33592,
 33593,
 33600,
 33604,
 99140,
 841,
 845,
 99149,
 66389,
 854,
 33624,
 33625,
 858,
 66395,
 99163,
 33629,
 99173,
 33638,
 874,
 66411,
 878,
 33647,
 66415,
 66417,
 33650,
 66418,
 99182,
 891,
 33662,
 66436,
 901,
 99208,
 66441,
 33674,
 33675,
 33682,
 916,
 66452,
 33691,
 33692,
 66461,
 66463,
 33697,
 99234,
 931,
 938,
 944,
 66480,
 33716,
 99254,
 959,
 33728,
 99267,
 33736,
 33741,
 978,
 33750,
 992,
 33765,
 99301,
 1000,
 1005,
 99309,
 33780,
 99318,
 1017,
 33787,
 66557,
 1024,
 1027,
 1034,
 1036,
 66577,
 66580,
 99351,
 99360,
 1057,
 33825,
 99363,
 33831,
 66600,
 33837,
 66607,
 99376,
 33841,
 99380,
 66613,
 33849,
 66622,
 1087,
 1088,
 99391,
 1095,
 1098,
 66637,
 1102,
 99405,
 99410,
 99413,
 66646,
 99419,
 66652,
 1118,
 99431,
 1129,
 66670,
 33912,
 33914,
 66685,
 33927,
 99467,
 99470,
 99472,
 66705,
 66708,
 66709,
 99484,
 99485,
 99487,
 66721,
 33954,
 1187,
 99495,
 66728,
 66732,
 66734,
 99510,
 99517,
 66751,
 1223,
 33992,
 99528,
 99537,
 66770,
 99539,
 1237,
 1242,
 66779,
 66780,
 1247,
 99555,
 34024,
 1265,
 34034,
 1270,
 34042,
 1277,
 66814,
 99583,
 34064,
 1297,
 66834,
 66844,
 1312,
 66849,
 1326,
 66866,
 34099,
 99637,
 1334,
 66871,
 66877,
 1342,
 1345,
 99651,
 66887,
 1352,
 1354,
 66895,
 99671,
 66909,
 99681,
 1392,
 66929,
 34162,
 1401,
 1402,
 34169,
 99709,
 1407,
 66944,
 66945,
 1412,
 1415,
 34184,
 1420,
 99729,
 1426,
 34196,
 66971,
 1436,
 66974,
 1443,
 34217,
 66986,
 1452,
 66993,
 1459,
 34227,
 66996,
 34258,
 99797,
 99803,
 1508,
 99814,
 1512,
 1515,
 67053,
 99821,
 99824,
 99825,
 34303,
 67071,
 34307,
 67077,
 67079,
 99854,
 67092,
 67094,
 99862,
 1565,
 34333,
 1567,
 34338,
 1579,
 1585,
 67127,
 99897,
 1603,
 1604,
 67147,
 67152,
 34388,
 67156,
 99924,
 1623,
 34395,
 99931,
 1635,
 99946,
 67180,
 99949,
 99954,
 34419,
 99957,
 67197,
 34434,
 99970,
 34440,
 1673,
 1675,
 1676,
 34443,
 67212,
 67216,
 34457,
 1691,
 1692,
 1697,
 34473,
 67241,
 34480,
 67250,
 67269,
 1743,
 67284,
 34528,
 1775,
 34553,
 67323,
 1798,
 34572,
 1805,
 34576,
 67344,
 67350,
 1820,
 1821,
 34590,
 1825,
 34606,
 1839,
 67375,
 67379,
 1852,
 1861,
 1866,
 67402,
 1874,
 67416,
 67431,
 1904,
 67441,
 34687,
 1920,
 34689,
 34701,
 34708,
 34710,
 67492,
 67493,
 1970,
 34744,
 34753,
 67531,
 2004,
 2008,
 2009,
 2023,
 67560,
 34793,
 67562,
 34795,
 2037,
 34807,
 2046,
 34819,
 2052,
 34827,
 2068,
 67613,
 34848,
 67626,
 67628,
 34864,
 2099,
 67636,
 34869,
 67641,
 2110,
 67649,
 2117,
 67655,
 67666,
 67669,
 67680,
 34914,
 67682,
 2151,
 34926,
 2160,
 2161,
 34929,
 67699,
 2170,
 2178,
 34949,
 67718,
 2187,
 2193,
 67730,
 67735,
 2216,
 2218,
 2230,
 34998,
 2235,
 67771,
 2244,
 2247,
 67791,
 2263,
 67800,
 2266,
 35037,
 67810,
 67816,
 35049,
 35055,
 67824,
 67825,
 35059,
 35063,
 2296,
 35067,
 2323,
 2326,
 2330,
 67867,
 2335,
 67886,
 2365,
 2370,
 2372,
 2377,
 67915,
 2380,
 2392,
 67928,
 67934,
 35174,
 35180,
 35188,
 67958,
 35195,
 67968,
 35202,
 2441,
 35211,
 67981,
 35215,
 67984,
 35219,
 35220,
 35228,
 2465,
 2483,
 2484,
 35256,
 68025,
 2510,
 35280,
 35281,
 68066,
 2532,
 35310,
 68084,
 2553,
 68089,
 68097,
 2566,
 35351,
 35358,
 2594,
 2607,
 35378,
 2612,
 68151,
 35385,
 2620,
 35394,
 35395,
 68163,
 68164,
 68169,
 35415,
 2653,
 68199,
 68200,
 68209,
 2677,
 68215,
 35458,
 35459,
 2692,
 35466,
 35472,
 2705,
 35473,
 35474,
 68240,
 2710,
 2712,
 35481,
 2715,
 2721,
 68257,
 68264,
 68265,
 35503,
 2744,
 68290,
 2761,
 68306,
 35539,
 35547,
 35549,
 2786,
 35557,
 35559,
 68329,
 68332,
 68334,
 68337,
 35570,
 2804,
 68343,
 35587,
 2824,
 35603,
 2838,
 68375,
 35613,
 2853,
 35622,
 35634,
 2868,
 35636,
 68408,
 68419,
 35654,
 2887,
 68440,
 68445,
 2916,
 35689,
 68457,
 35697,
 35729,
 2968,
 68506,
 35740,
 68512,
 2981,
 35758,
 2991,
 35759,
 2996,
 3000,
 68537,
 3009,
 68549,
 68556,
 35792,
 35793,
 68562,
 68568,
 35804,
 35809,
 35810,
 68578,
 3050,
 3053,
 68590,
 3061,
 3067,
 35835,
 35843,
 3076,
 68613,
 35846,
 3081,
 35855,
 68629,
 3094,
 35863,
 35867,
 35888,
 35899,
 35915,
 68683,
 68685,
 3151,
 68687,
 68690,
 68697,
 35932,
 68705,
 3171,
 68708,
 68711,
 68721,
 3187,
 35974,
 68744,
 3210,
 3211,
 35983,
 68761,
 36002,
 3236,
 68774,
 3240,
 68790,
 3256,
 68792,
 3265,
 3269,
 3271,
 68809,
 3275,
 36052,
 3285,
 3286,
 68823,
 3288,
 3291,
 3293,
 3303,
 68855,
 68865,
 3333,
 36107,
 3356,
 3364,
 3375,
 68914,
 68930,
 68948,
 3414,
 36192,
 36197,
 36201,
 3434,
 36211,
 36214,
 3455,
 36226,
 69014,
 69015,
 3482,
 3483,
 69022,
 3496,
 3500,
 36273,
 36277,
 3526,
 3527,
 36294,
 36300,
 3536,
 3539,
 36308,
 3542,
 3543,
 69082,
 36320,
 36321,
 36326,
 3560,
 3572,
 36340,
 36341,
 36346,
 69124,
 3590,
 36370,
 3610,
 3613,
 69158,
 3627,
 3636,
 36404,
 36416,
 36419,
 3652,
 3655,
 3657,
 3669,
 3673,
 36444,
 36446,
 3680,
 69220,
 69221,
 69231,
 36469,
 69244,
 3712,
 3713,
 36481,
 69251,
 36488,
 69264,
 36501,
 36509,
 36510,
 3744,
 69282,
 69287,
 69294,
 36527,
 36528,
 3764,
 3785,
 3791,
 69338,
 36572,
 3807,
 69344,
 3815,
 69365,
 36598,
 3832,
 3837,
 36619,
 69405,
 3882,
 3883,
 36650,
 69426,
 36687,
 3928,
 3931,
 69474,
 36707,
 69477,
 36710,
 3945,
 36723,
 3960,
 69512,
 3980,
 69516,
 69518,
 69519,
 69523,
 36756,
 36762,
 36765,
 36771,
 36778,
 4026,
 69563,
 69569,
 69578,
 69596,
 36829,
 4069,
 36838,
 69607,
 36841,
 4074,
 36845,
 69617,
 4093,
 36871,
 36874,
 36878,
 36880,
 36888,
 36890,
 4124,
 69662,
 4127,
 36898,
 4133,
 36916,
 36917,
 36924,
 69692,
 69695,
 4161,
 4162,
 4165,
 69707,
 4175,
 36948,
 4182,
 36958,
 69732,
 36974,
 36978,
 69748,
 4220,
 36995,
 69766,
 36999,
 69767,
 69770,
 4238,
 37007,
 69783,
 4250,
 69800,
 69802,
 4267,
 37036,
 37037,
 37039,
 4278,
 37049,
 4282,
 4287,
 4290,
 4291,
 37064,
 4301,
 37072,
 37073,
 69847,
 4312,
 4321,
 4323,
 4324,
 4332,
 4336,
 69874,
 69880,
 37114,
 37120,
 4354,
 4360,
 69898,
 37131,
 69903,
 37150,
 4387,
 69923,
 69928,
 37166,
 69934,
 69944,
 37177,
 4425,
 37193,
 4429,
 37216,
 4463,
 37234,
 37235,
 70002,
 4492,
 4493,
 4500,
 70040,
 ...}
In [11]:
# Number of distinct words (keys) in the inverted index.
len(inverted_list)
Out [11]:
3832
In [7]:
# data/retrieve/sgns.zhihu.word is a pre-trained Chinese word-vector file downloaded from
# https://github.com/Embedding/Chinese-Word-Vectors
# Load it with KeyedVectors.load_word2vec_format().
model = KeyedVectors.load_word2vec_format('data/retrieve/sgns.zhihu.word')
In [8]:
def get_similar_by_word(word,topk):
    '''
        Ask the loaded word-vector model for the topk words most similar to `word`.

        Returns:
            word_list: the words only, in the order the model returns them;
                       element 0 of each result entry is kept, the rest is dropped.
                       For topk=5 the shape is [word1, word2, word3, word4, word5].
    '''
    # A distinct loop name avoids shadowing the `word` parameter.
    return [entry[0] for entry in model.similar_by_word(word,topk)]
In [9]:
# Example: the 5 words most similar to "今天".
get_similar_by_word("今天",5)
Out [9]:
['昨天', '现在', '今天下午', '明天', '今日']

TODO2 构造一个新的倒排表,考虑单词之间的语义相似度

In [25]:
# Build a second inverted index, inverted_list_new, that also takes the
# semantic similarity between words into account.
# For every word of the old index, the new value is the union of
#   inverted_list[word] and inverted_list[w] for each of the word's
#   NUM_SIMILAR_WORDS nearest neighbours w that occur in the old index,
# i.e. the positions of all questions that contain the word itself or one of
# its most similar words.
NUM_SIMILAR_WORDS = 4
inverted_list_new = {}
OOV_count = 0
for word in tqdm(inverted_list):
    ### Code to complete
    # Always start from a copy of the word's own postings, so a word that is
    # missing from the embedding vocabulary stays searchable by exact match
    # instead of disappearing from the new index.
    postings = set(inverted_list[word])
    try:
        similar_words = get_similar_by_word(word, NUM_SIMILAR_WORDS)
    except KeyError:
        # The vector model raises KeyError for a word outside its vocabulary;
        # only that case is counted as OOV, any other error propagates.
        OOV_count += 1
        similar_words = []
    for t_word in similar_words:
        # Neighbours that never occur in the corpus have no postings to add.
        if t_word in inverted_list:
            postings |= inverted_list[t_word]
    inverted_list_new[word] = postings
    ### End of code to complete
print("OOV_count:",OOV_count)
    
Out [25]:
100%|██████████| 3832/3832 [00:44<00:00, 85.74it/s] 
In [26]:
inverted_list_new["发货"]
Out [26]:
{81920,
 16386,
 5,
 65541,
 81927,
 32776,
 81930,
 81935,
 17,
 18,
 65554,
 16401,
 81947,
 98331,
 29,
 65566,
 32800,
 81953,
 32803,
 98339,
 81959,
 32810,
 98346,
 49194,
 32818,
 16435,
 55,
 49209,
 98366,
 64,
 49219,
 65604,
 81988,
 16458,
 65611,
 81995,
 81998,
 16463,
 16464,
 49233,
 32850,
 98387,
 49234,
 82004,
 86,
 81999,
 98386,
 16475,
 32859,
 49245,
 98398,
 65631,
 82015,
 65630,
 102,
 65639,
 65640,
 49259,
 65646,
 98415,
 98416,
 49263,
 16495,
 49267,
 16500,
 82035,
 118,
 16503,
 65650,
 65656,
 122,
 65659,
 49275,
 125,
 32894,
 65660,
 133,
 65669,
 65670,
 65671,
 32902,
 139,
 49293,
 142,
 65679,
 32912,
 98451,
 150,
 151,
 32929,
 49318,
 49320,
 65708,
 82092,
 82093,
 65711,
 98484,
 98489,
 49337,
 187,
 49340,
 32957,
 200,
 16588,
 32973,
 65742,
 16589,
 98518,
 16598,
 49366,
 82137,
 65755,
 220,
 223,
 65764,
 82149,
 82155,
 16621,
 49396,
 65783,
 33017,
 65786,
 254,
 65790,
 82176,
 261,
 65798,
 65806,
 65810,
 275,
 279,
 98586,
 82208,
 49442,
 65833,
 33068,
 82220,
 65838,
 82221,
 82224,
 33073,
 65843,
 65844,
 16691,
 310,
 16696,
 82234,
 65852,
 49468,
 318,
 49471,
 49472,
 16706,
 49475,
 65862,
 65863,
 16712,
 82248,
 65866,
 49483,
 49493,
 16726,
 344,
 16730,
 65883,
 82268,
 65885,
 350,
 33120,
 16745,
 364,
 98668,
 65903,
 33140,
 98678,
 65913,
 33149,
 49535,
 33158,
 49544,
 82313,
 33163,
 16783,
 33168,
 401,
 82322,
 98709,
 49558,
 98715,
 16796,
 49565,
 82333,
 82334,
 33189,
 33191,
 65960,
 33193,
 49579,
 16812,
 98740,
 65973,
 16822,
 82359,
 16825,
 33210,
 16827,
 82365,
 98751,
 16832,
 98754,
 33220,
 453,
 49604,
 98763,
 461,
 469,
 49623,
 16856,
 33244,
 49629,
 16867,
 16869,
 98794,
 82410,
 82412,
 495,
 16882,
 98803,
 49651,
 49656,
 33273,
 16889,
 49658,
 33276,
 82426,
 66046,
 507,
 16895,
 49669,
 82437,
 33290,
 49674,
 16911,
 530,
 33300,
 66069,
 16918,
 16922,
 66077,
 542,
 543,
 82463,
 66081,
 16932,
 66091,
 556,
 66093,
 66094,
 98860,
 82475,
 66097,
 49709,
 16942,
 49715,
 49716,
 66106,
 98877,
 66111,
 49729,
 33346,
 579,
 33347,
 82499,
 49735,
 16967,
 49739,
 588,
 16983,
 82522,
 16991,
 82528,
 49761,
 49762,
 66151,
 66152,
 66153,
 17003,
 49777,
 98932,
 17012,
 66166,
 17020,
 17021,
 639,
 640,
 49793,
 642,
 17026,
 98948,
 17027,
 49796,
 82563,
 17030,
 66185,
 82566,
 651,
 33422,
 82576,
 98962,
 33427,
 17043,
 49812,
 82584,
 98970,
 17050,
 17052,
 66207,
 82592,
 673,
 33441,
 17057,
 17061,
 33446,
 49831,
 66221,
 82605,
 687,
 66224,
 49841,
 33453,
 33459,
 17076,
 66228,
 694,
 33464,
 33466,
 33468,
 99005,
 66238,
 99007,
 99006,
 82628,
 710,
 82630,
 82632,
 99017,
 711,
 718,
 82639,
 49872,
 82645,
 82652,
 49889,
 82658,
 99044,
 743,
 17127,
 33513,
 49897,
 17132,
 749,
 49901,
 49903,
 82674,
 33523,
 17142,
 759,
 17144,
 33527,
 66299,
 99068,
 33535,
 17152,
 769,
 33539,
 66307,
 66309,
 82694,
 17162,
 99084,
 49932,
 17167,
 49935,
 66321,
 99090,
 66323,
 783,
 82713,
 99098,
 66333,
 66334,
 82719,
 800,
 66337,
 17185,
 49954,
 17190,
 33576,
 49962,
 17196,
 99120,
 99122,
 33587,
 49972,
 49974,
 17207,
 33592,
 33593,
 33600,
 33604,
 99140,
 82757,
 841,
 49994,
 845,
 99149,
 17235,
 82771,
 66389,
 854,
 82775,
 33624,
 33625,
 858,
 66395,
 99163,
 33629,
 17245,
 82782,
 17249,
 17251,
 82787,
 99173,
 33638,
 17255,
 874,
 66411,
 82794,
 17259,
 878,
 33647,
 66415,
 66417,
 33650,
 66418,
 99182,
 82798,
 33652,
 891,
 17275,
 82812,
 33662,
 82813,
 17283,
 66436,
 901,
 99208,
 66441,
 33674,
 33675,
 17291,
 82829,
 33682,
 916,
 66452,
 17304,
 33688,
 33691,
 33692,
 66461,
 66463,
 82847,
 33697,
 99234,
 931,
 82848,
 17317,
 938,
 17324,
 82863,
 944,
 66480,
 33716,
 99254,
 82878,
 959,
 33728,
 99267,
 33736,
 33741,
 17359,
 978,
 17364,
 33750,
 82905,
 17371,
 992,
 17378,
 33765,
 99301,
 1000,
 50152,
 50155,
 1005,
 99309,
 50158,
 82928,
 66542,
 33780,
 99318,
 1017,
 33787,
 66557,
 1024,
 17408,
 17409,
 1027,
 50179,
 82949,
 82950,
 1034,
 50187,
 1036,
 50191,
 66577,
 66580,
 99351,
 82969,
 82970,
 17437,
 99360,
 1057,
 33825,
 99363,
 82977,
 82979,
 33831,
 66600,
 17447,
 17448,
 33837,
 50221,
 66607,
 99376,
 33841,
 82993,
 99380,
 66613,
 82996,
 82998,
 33849,
 83003,
 66622,
 1087,
 1088,
 99391,
 17473,
 83011,
 50245,
 1095,
 1098,
 66637,
 1102,
 99405,
 50256,
 99410,
 66644,
 99413,
 66646,
 99419,
 66652,
 83037,
 1118,
 83040,
 99431,
 66663,
 1129,
 17516,
 83052,
 66670,
 33900,
 50290,
 17524,
 33912,
 83065,
 33914,
 66685,
 83071,
 17537,
 50309,
 33927,
 50314,
 99467,
 99470,
 50318,
 99472,
 66705,
 83088,
 66708,
 66709,
 50327,
 99484,
 99485,
 83101,
 99487,
 66721,
 33954,
 1187,
 83105,
 50337,
 50341,
 99495,
 66728,
 50345,
 50346,
 17578,
 66732,
 83116,
 66734,
 50351,
 50356,
 99510,
 17591,
 50363,
 99517,
 66751,
 17599,
 1223,
 33992,
 99528,
 17607,
 50376,
 50379,
 17616,
 99537,
 66770,
 99539,
 1237,
 1242,
 66779,
 66780,
 17628,
 1247,
 17631,
 83168,
 50402,
 99555,
 50405,
 34024,
 50409,
 50411,
 1265,
 34034,
 83186,
 17651,
 50419,
 1270,
 34042,
 99580,
 1277,
 66814,
 99583,
 83200,
 17671,
 50441,
 17677,
 34064,
 1297,
 66834,
 50448,
 50449,
 50456,
 66844,
 83229,
 83231,
 1312,
 66849,
 50464,
 17698,
 17699,
 83236,
 17702,
 1326,
 99632,
 17713,
 66866,
 34099,
 83252,
 99637,
 1334,
 66871,
 83258,
 66877,
 1342,
 50493,
 17728,
 1345,
 99651,
 66887,
 1352,
 50503,
 1354,
 17737,
 50508,
 66895,
 99671,
 66909,
 34143,
 99681,
 83300,
 17770,
 1392,
 66929,
 34162,
 17777,
 1401,
 1402,
 34169,
 50554,
 99709,
 50557,
 1407,
 66944,
 66945,
 17788,
 1412,
 1415,
 34184,
 1420,
 99729,
 1426,
 34196,
 83351,
 66971,
 1436,
 83357,
 66974,
 17821,
 1443,
 83363,
 50598,
 34217,
 66986,
 83369,
 1452,
 66993,
 50609,
 1459,
 34227,
 66996,
 83385,
 50618,
 50626,
 50627,
 83406,
 83407,
 34258,
 50643,
 83412,
 99797,
 50647,
 99803,
 50652,
 50654,
 1508,
 50661,
 99814,
 1512,
 1515,
 50667,
 67053,
 99821,
 17901,
 99824,
 99825,
 83438,
 50672,
 83441,
 50677,
 17910,
 83446,
 83447,
 83448,
 34292,
 17916,
 17918,
 34303,
 67071,
 34307,
 83460,
 67077,
 1540,
 67079,
 34310,
 99854,
 83473,
 67092,
 67094,
 99862,
 67100,
 1565,
 34333,
 1567,
 34338,
 99877,
 1579,
 83502,
 83504,
 1585,
 67127,
 99897,
 50753,
 83521,
 1603,
 1604,
 17991,
 17992,
 50760,
 83527,
 67147,
 17996,
 67152,
 34388,
 67156,
 99924,
 1623,
 34395,
 99931,
 18016,
 1635,
 99946,
 67180,
 99949,
 99954,
 34419,
 83571,
 99957,
 83573,
 83572,
 18043,
 50811,
 67197,
 34434,
 99970,
 18053,
 83590,
 34440,
 1673,
 50826,
 1675,
 1676,
 34443,
 67212,
 67216,
 18065,
 18066,
 18068,
 18069,
 50838,
 50839,
 34457,
 50841,
 1691,
 1692,
 83615,
 1697,
 50855,
 34473,
 67241,
 50861,
 34480,
 67250,
 18104,
 18108,
 83645,
 18112,
 18116,
 67269,
 1743,
 83667,
 67284,
 50904,
 83674,
 50910,
 34528,
 34529,
 18146,
 50917,
 83688,
 50923,
 1775,
 18160,
 18168,
 34553,
 67322,
 67323,
 18171,
 50939,
 67327,
 50944,
 50947,
 1798,
 83718,
 34572,
 1805,
 83724,
 34576,
 67344,
 83730,
 67350,
 1820,
 1821,
 34590,
 50972,
 1825,
 50986,
 50988,
 34606,
 1839,
 67375,
 50990,
 50993,
 67379,
 83767,
 51000,
 1852,
 34620,
 83774,
 18239,
 1861,
 18245,
 1866,
 67402,
 1874,
 83795,
 83799,
 67416,
 18270,
 51039,
 83807,
 83810,
 51043,
 51046,
 67431,
 51047,
 1904,
 67441,
 18289,
 34687,
 1920,
 34689,
 51071,
 51074,
 1919,
 83846,
 18314,
 51084,
 34701,
 18319,
 34708,
 34710,
 18326,
 51095,
 67492,
 67493,
 51108,
 1959,
 34725,
 18348,
 67500,
 83887,
 1970,
 83895,
 34744,
 51128,
 83898,
 83902,
 34753,
 83906,
 51140,
 18373,
 67531,
 18384,
 2004,
 51159,
 2008,
 2009,
 18397,
 18401,
 83940,
 2023,
 67560,
 34793,
 67562,
 34795,
 51175,
 83951,
 51184,
 83953,
 51185,
 ...}
In [27]:
# Persist the new inverted index to data/retrieve/invertedList.pkl.
with open('data/retrieve/invertedList.pkl','wb') as f:
    pickle.dump(inverted_list_new,f)

以下为测试,完成上述过程之后,可以运行以下的代码来测试准确性。

In [28]:
#这一格的内容是从preprocessor.ipynb中粘贴而来,包含了数据预处理的几个关键函数
import emoji
import re
import jieba
def clean(content):
    '''
        Run emoji.demojize on the text, then delete every match of '<.*>'.

        The pattern is greedy and '.' does not cross newlines, so within one
        line everything from the first '<' to the last '>' is removed.
    '''
    demojized = emoji.demojize(content)
    return re.sub('<.*>','',demojized)
# Tokenise a sentence. In preprocessor.ipynb the data was already segmented, so
# this step was skipped there, but it is indispensable for a brand-new query.
def question_cut(content):
    '''Segment content with jieba and return the tokens as a list.'''
    segments = jieba.cut(content)
    return list(segments)
def strip(wordList):
    '''Trim surrounding whitespace from every token and drop tokens that end up empty.'''
    # Strip each token once, lazily, then filter on the stripped value.
    trimmed = (word.strip() for word in wordList)
    return [word for word in trimmed if word != '']
# Read the stop-word list. Although the file is named .json, it is read as plain
# text and split on "\n", giving one list entry per line.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm it matches the encoding of the file.
with open("data/stopWord.json","r") as f:
    stopWords = f.read().split("\n")
def rm_stop_word(wordList):
    '''Return the tokens of wordList that are not in stopWords, order preserved.'''
    kept = []
    for word in wordList:
        if word in stopWords:
            continue
        kept.append(word)
    return kept
In [29]:
# Load the inverted index from data/retrieve/invertedList.pkl into the variable invertedList.
with open('data/retrieve/invertedList.pkl','rb') as f:
    invertedList = pickle.load(f)
In [30]:
def get_retrieve_result(sentence):
    '''
        Fast candidate retrieval for a query sentence using the inverted index.

        The sentence is passed through clean, question_cut, strip and
        rm_stop_word; the result is the union of invertedList[word] over every
        remaining word that is a key of invertedList, i.e. the indices of the
        questions that contain one of the query words or a word close in meaning.
        An empty set is returned when none of the words is in the index.
    '''
    words = rm_stop_word(strip(question_cut(clean(sentence))))
    # Collect the posting sets of the known words, then merge them into a new set.
    postings = [invertedList[word] for word in words if word in invertedList]
    return set().union(*postings)
In [31]:
get_retrieve_result('什么时候发货')  # candidate document IDs returned via the inverted index
Out [31]:
{81920,
 16386,
 65541,
 5,
 81927,
 32776,
 81930,
 81935,
 17,
 18,
 65554,
 16401,
 98331,
 81947,
 29,
 65566,
 32800,
 81953,
 32803,
 98339,
 81959,
 32810,
 98346,
 49194,
 32818,
 16435,
 55,
 49209,
 98366,
 64,
 49219,
 65604,
 81988,
 16458,
 65611,
 81995,
 81998,
 16463,
 16464,
 49233,
 32850,
 98387,
 98386,
 49234,
 86,
 81999,
 82004,
 32859,
 16475,
 49245,
 98398,
 65631,
 65630,
 82015,
 102,
 65639,
 65640,
 49259,
 65646,
 98415,
 98416,
 49263,
 65650,
 16495,
 49267,
 16500,
 118,
 82035,
 65656,
 16503,
 122,
 65659,
 65660,
 125,
 32894,
 49275,
 133,
 65669,
 65670,
 65671,
 32902,
 139,
 49293,
 142,
 65679,
 32912,
 98451,
 150,
 151,
 32929,
 49318,
 49320,
 65708,
 82092,
 82093,
 65711,
 98484,
 98489,
 49337,
 187,
 49340,
 32957,
 200,
 16588,
 32973,
 65742,
 16589,
 98518,
 16598,
 49366,
 82137,
 65755,
 220,
 223,
 65764,
 82149,
 82155,
 16621,
 49396,
 65783,
 33017,
 65786,
 254,
 65790,
 82176,
 261,
 65798,
 65806,
 65810,
 275,
 279,
 98586,
 82208,
 49442,
 65833,
 33068,
 82220,
 65838,
 82221,
 82224,
 33073,
 65843,
 65844,
 16691,
 310,
 16696,
 82234,
 65852,
 49468,
 318,
 49471,
 49472,
 16706,
 49475,
 65862,
 65863,
 16712,
 82248,
 65866,
 49483,
 49493,
 16726,
 344,
 16730,
 65883,
 82268,
 65885,
 350,
 33120,
 16745,
 364,
 98668,
 65903,
 33140,
 98678,
 65913,
 33149,
 49535,
 33158,
 49544,
 82313,
 33163,
 16783,
 33168,
 401,
 82322,
 98709,
 49558,
 98715,
 16796,
 49565,
 82333,
 82334,
 33189,
 33191,
 65960,
 33193,
 49579,
 16812,
 98740,
 65973,
 16822,
 82359,
 16825,
 33210,
 16827,
 82365,
 98751,
 16832,
 98754,
 33220,
 453,
 49604,
 98763,
 461,
 469,
 49623,
 16856,
 33244,
 49629,
 16867,
 16869,
 98794,
 82410,
 82412,
 495,
 16882,
 98803,
 49651,
 49656,
 33273,
 16889,
 507,
 33276,
 82426,
 66046,
 49658,
 16895,
 49669,
 82437,
 33290,
 49674,
 16911,
 530,
 33300,
 66069,
 16918,
 16922,
 66077,
 542,
 543,
 82463,
 66081,
 16932,
 66091,
 556,
 66093,
 66094,
 98860,
 33324,
 66097,
 82475,
 49709,
 16942,
 49715,
 49716,
 66106,
 98877,
 66111,
 49729,
 33346,
 579,
 33347,
 82499,
 49735,
 16967,
 49739,
 588,
 16983,
 82522,
 16991,
 82528,
 49761,
 49762,
 66151,
 66152,
 66153,
 17003,
 49777,
 98932,
 17012,
 66166,
 17020,
 17021,
 639,
 640,
 49793,
 642,
 17026,
 98948,
 17027,
 82563,
 49796,
 17030,
 66185,
 82566,
 651,
 33422,
 82576,
 98962,
 33427,
 17043,
 49812,
 82584,
 98970,
 17050,
 17052,
 66207,
 82592,
 673,
 33441,
 17057,
 17061,
 33446,
 49831,
 66221,
 33453,
 687,
 66224,
 82605,
 49841,
 33459,
 66228,
 17076,
 694,
 33464,
 33466,
 33468,
 99005,
 66238,
 99007,
 99006,
 82628,
 710,
 711,
 82630,
 99017,
 82632,
 718,
 82639,
 49872,
 82645,
 82652,
 49889,
 82658,
 99044,
 743,
 17127,
 33513,
 49897,
 17132,
 749,
 49901,
 49903,
 82674,
 33523,
 17142,
 759,
 33527,
 17144,
 66299,
 99068,
 33535,
 17152,
 769,
 33539,
 66307,
 66309,
 82694,
 17162,
 99084,
 49932,
 783,
 17167,
 66321,
 99090,
 66323,
 49935,
 82713,
 99098,
 66333,
 66334,
 82719,
 800,
 66337,
 17185,
 49954,
 17190,
 33576,
 49962,
 17196,
 99120,
 99122,
 33587,
 49972,
 49974,
 17207,
 33592,
 33593,
 33600,
 33604,
 99140,
 82757,
 841,
 49994,
 845,
 99149,
 17235,
 82771,
 66389,
 854,
 82775,
 33624,
 33625,
 858,
 66395,
 99163,
 33629,
 17245,
 82782,
 17249,
 17251,
 82787,
 99173,
 33638,
 17255,
 874,
 66411,
 82794,
 17259,
 878,
 33647,
 66415,
 66417,
 33650,
 66418,
 99182,
 33652,
 82798,
 891,
 17275,
 82812,
 33662,
 82813,
 17283,
 66436,
 901,
 99208,
 66441,
 33674,
 33675,
 17291,
 82829,
 33682,
 916,
 66452,
 33688,
 17304,
 33691,
 33692,
 66461,
 66463,
 82847,
 33697,
 99234,
 931,
 82848,
 17317,
 938,
 17324,
 82863,
 944,
 66480,
 33716,
 99254,
 82878,
 959,
 33728,
 99267,
 33736,
 33741,
 17359,
 978,
 17364,
 33750,
 82905,
 17371,
 992,
 17378,
 33765,
 99301,
 1000,
 50152,
 50155,
 1005,
 99309,
 66542,
 50158,
 82928,
 33780,
 99318,
 1017,
 33787,
 66557,
 1024,
 17408,
 17409,
 1027,
 50179,
 82949,
 82950,
 1034,
 50187,
 1036,
 50191,
 66577,
 66580,
 99351,
 82969,
 82970,
 17437,
 99360,
 1057,
 33825,
 99363,
 82977,
 82979,
 33831,
 66600,
 17447,
 17448,
 33837,
 50221,
 66607,
 99376,
 33841,
 82993,
 99380,
 66613,
 82996,
 82998,
 33849,
 83003,
 66622,
 1087,
 1088,
 99391,
 17473,
 83011,
 50245,
 1095,
 1098,
 66637,
 1102,
 99405,
 50256,
 99410,
 66644,
 99413,
 66646,
 99419,
 66652,
 83037,
 1118,
 83040,
 99431,
 66663,
 1129,
 33900,
 17516,
 66670,
 83052,
 50290,
 17524,
 33912,
 83065,
 33914,
 66685,
 83071,
 17537,
 50309,
 33927,
 50314,
 99467,
 99470,
 50318,
 99472,
 66705,
 83088,
 66708,
 66709,
 50327,
 99484,
 99485,
 83101,
 99487,
 66721,
 33954,
 1187,
 83105,
 50337,
 50341,
 99495,
 66728,
 50345,
 50346,
 17578,
 66732,
 83116,
 66734,
 50351,
 50356,
 99510,
 17591,
 50363,
 99517,
 66751,
 17599,
 1223,
 33992,
 99528,
 17607,
 50376,
 50379,
 17616,
 99537,
 66770,
 99539,
 1237,
 1242,
 66779,
 66780,
 17628,
 1247,
 17631,
 83168,
 50402,
 99555,
 50405,
 34024,
 50409,
 50411,
 1265,
 34034,
 83186,
 17651,
 50419,
 1270,
 34042,
 99580,
 1277,
 66814,
 99583,
 83200,
 17671,
 50441,
 17677,
 34064,
 1297,
 66834,
 50448,
 50449,
 50456,
 66844,
 83229,
 83231,
 1312,
 66849,
 50464,
 17698,
 17699,
 83236,
 17702,
 1326,
 99632,
 17713,
 66866,
 34099,
 83252,
 99637,
 1334,
 66871,
 83258,
 66877,
 1342,
 50493,
 17728,
 1345,
 99651,
 66887,
 1352,
 50503,
 1354,
 17737,
 50508,
 66895,
 99671,
 66909,
 34143,
 99681,
 83300,
 17770,
 1392,
 66929,
 34162,
 17777,
 1401,
 1402,
 34169,
 50554,
 99709,
 17788,
 1407,
 66944,
 66945,
 50557,
 1412,
 1415,
 34184,
 1420,
 99729,
 1426,
 34196,
 83351,
 66971,
 1436,
 83357,
 66974,
 17821,
 1443,
 83363,
 50598,
 34217,
 66986,
 83369,
 1452,
 66993,
 50609,
 1459,
 34227,
 66996,
 83385,
 50618,
 50626,
 50627,
 83406,
 83407,
 34258,
 50643,
 83412,
 99797,
 50647,
 99803,
 50652,
 50654,
 1508,
 50661,
 99814,
 1512,
 1515,
 50667,
 67053,
 99821,
 17901,
 99824,
 99825,
 83438,
 50672,
 34292,
 83441,
 50677,
 17910,
 83446,
 83447,
 83448,
 17916,
 17918,
 34303,
 67071,
 34307,
 1540,
 67077,
 34310,
 67079,
 83460,
 99854,
 83473,
 67092,
 67094,
 99862,
 67100,
 1565,
 34333,
 1567,
 34338,
 99877,
 1579,
 83502,
 83504,
 1585,
 67127,
 99897,
 50753,
 83521,
 1603,
 1604,
 17991,
 17992,
 50760,
 83527,
 67147,
 17996,
 67152,
 34388,
 67156,
 99924,
 1623,
 34395,
 99931,
 18016,
 1635,
 99946,
 67180,
 99949,
 99954,
 34419,
 83571,
 99957,
 83572,
 83573,
 18043,
 50811,
 67197,
 34434,
 99970,
 18053,
 83590,
 34440,
 1673,
 50826,
 1675,
 1676,
 34443,
 67212,
 67216,
 18065,
 18066,
 18068,
 18069,
 50838,
 50839,
 34457,
 50841,
 1691,
 1692,
 83615,
 1697,
 50855,
 34473,
 67241,
 50861,
 34480,
 67250,
 18104,
 18108,
 83645,
 18112,
 18116,
 67269,
 1743,
 83667,
 67284,
 50904,
 83674,
 50910,
 34528,
 34529,
 18146,
 50917,
 83688,
 50923,
 1775,
 18160,
 18168,
 34553,
 67322,
 67323,
 18171,
 50939,
 67327,
 50944,
 50947,
 1798,
 83718,
 34572,
 1805,
 83724,
 34576,
 67344,
 83730,
 67350,
 1820,
 1821,
 34590,
 50972,
 1825,
 50986,
 50988,
 34606,
 1839,
 67375,
 50990,
 50993,
 67379,
 83767,
 51000,
 1852,
 34620,
 83774,
 18239,
 1861,
 18245,
 1866,
 67402,
 1874,
 83795,
 83799,
 67416,
 18270,
 51039,
 83807,
 83810,
 51043,
 51046,
 67431,
 51047,
 1904,
 67441,
 18289,
 34687,
 1920,
 34689,
 1919,
 51071,
 51074,
 83846,
 18314,
 51084,
 34701,
 18319,
 34708,
 34710,
 18326,
 51095,
 67492,
 67493,
 34725,
 1959,
 51108,
 67500,
 18348,
 83887,
 1970,
 83895,
 34744,
 51128,
 83898,
 83902,
 34753,
 83906,
 51140,
 18373,
 67531,
 18384,
 2004,
 51159,
 2008,
 2009,
 18397,
 18401,
 83940,
 2023,
 67560,
 34793,
 67562,
 34795,
 51175,
 83951,
 51184,
 83953,
 ...}