gen_valid_business_id.py 1.15 KB
Newer Older
20200203048 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
#encoding: utf-8
import pandas as pd
import json
from tqdm import tqdm

# Input: review dump that gen_valid_business_id reads line by line, parsing
# each non-blank line as a JSON object with a "business_id" key.
review_path = "data/review.json"
# Output: text file that receives one "business_id<TAB>review_count" entry
# per line for every business that passes the review-count threshold.
valid_business_id_path = "data/valid_business_id.txt"
def gen_valid_business_id(review_path,count_citerion=100):
    """Count reviews per business and keep the businesses with enough reviews.

    The file at ``review_path`` is read as UTF-8 text holding one JSON object
    per line; blank lines are skipped. Every object must contain a
    ``"business_id"`` key, whose value is used as the counting key.

    Args:
        review_path: Path of the JSON-lines review file to scan.
        count_citerion: Minimum number of reviews (inclusive) a business
            needs in order to be kept. The historical spelling is preserved
            because callers pass this argument by keyword.

    Returns:
        A list of strings, each the business id and its review count joined
        by a single tab character, in order of first appearance of the
        business id in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``"business_id"`` key.
    """
    count_business_id = dict()
    with open(review_path,"r",encoding="utf-8") as f:
        # Iterate the file handle directly instead of f.readlines() so that
        # only one line is held in memory at a time; the progress bar then
        # shows a running line count rather than a percentage.
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            business_id = json.loads(line)["business_id"]
            count_business_id[business_id] = count_business_id.get(business_id, 0) + 1
    # Arguments follow the order of the placeholders: path first, count second.
    print("total count of business id in {}: {}".format(review_path,len(count_business_id)))
    return [
        "{}\t{}".format(str(key),str(value))
        for key,value in count_business_id.items()
        if value >= count_citerion
    ]

if __name__ == "__main__":
    # Run the scan and overwrite the output file only when this file is
    # executed as a script, so that importing the module to reuse
    # gen_valid_business_id does not trigger a full pass over the review file.
    valid_business = gen_valid_business_id(review_path,count_citerion=100)
    with open(valid_business_id_path,'w',encoding='utf-8') as f:
        # Entries are newline-separated; no trailing newline is written.
        f.write("\n".join(valid_business))