# Commit 8e8f131f by 20200203063
#
# Upload New File
#
# parent 4d476bb4
#encoding: utf-8
import pandas as pd
import json
from tqdm import tqdm
# Input file: parsed one line at a time as JSON by gen_valid_business_id.
review_path = "data/review.json"
# Output file: receives the tab-separated "business_id<TAB>count" lines
# returned by gen_valid_business_id, joined with newlines.
valid_business_id_path = "data/valid_business_id.txt"
def gen_valid_business_id(review_path, count_citerion=100):
    """Return the businesses that have at least ``count_citerion`` reviews.

    The file at ``review_path`` is read as UTF-8, one JSON object per line.
    Blank lines are skipped; every other line must decode to an object with
    a ``business_id`` key. A one-line summary with the number of distinct
    business ids is printed to stdout.

    Args:
        review_path: Path of the line-delimited JSON review file to scan.
        count_citerion: Minimum number of reviews (inclusive) a business
            needs in order to be included in the result.

    Returns:
        A list of strings, one per qualifying business, each holding the
        business id and its review count separated by a single tab. Entries
        follow the order in which each business first appears in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a decoded record has no ``business_id`` key.
    """
    review_counts = {}
    with open(review_path, "r", encoding="utf-8") as f:
        # Iterate the handle lazily instead of calling readlines(): review
        # dumps can be very large and only the running counts are needed.
        # (tqdm therefore reports a running line count rather than a
        # percentage, since the total is not known up front.)
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            business_id = json.loads(line)["business_id"]
            review_counts[business_id] = review_counts.get(business_id, 0) + 1

    # Path first, count second, matching the order of the placeholders.
    print("total count of business id in {}: {}".format(review_path, len(review_counts)))

    return [
        "{}\t{}".format(business_id, count)
        for business_id, count in review_counts.items()
        if count >= count_citerion
    ]
# Collect every business with at least 100 reviews, then persist the
# resulting "business_id<TAB>count" lines, newline-separated, as UTF-8.
valid_business = gen_valid_business_id(review_path, count_citerion=100)
with open(valid_business_id_path, mode="w", encoding="utf-8") as f:
    f.write(
        "\n".join(valid_business)
    )
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment