Commit b4649acc by 20210828028

DuEE-fin

parent 15907570
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""duee finance data predict post-process"""
import os
import sys
import json
import argparse
from utils import read_by_lines, write_by_lines, extract_result
# the event type whose "环节" (phase) role comes from the enum classifier
# rather than the role sequence-labeling model
enum_event_type = "公司上市"
enum_role = "环节"

def event_normalization(doc):
    """Deduplicate arguments within each event, then drop events whose
    arguments are already covered by a kept event of the same type."""
    # remove duplicate (role, argument) pairs inside each event
    for event in doc.get("event_list", []):
        argument_list = []
        argument_set = set()
        for arg in event["arguments"]:
            arg_str = "{}-{}".format(arg["role"], arg["argument"])
            if arg_str not in argument_set:
                argument_list.append(arg)
                argument_set.add(arg_str)
        event["arguments"] = argument_list

    # keep events with more arguments first, so duplicates get dropped
    event_list = sorted(
        doc.get("event_list", []),
        key=lambda x: len(x["arguments"]),
        reverse=True)
    new_event_list = []
    for event in event_list:
        event_type = event["event_type"]
        event_argument_set = set()
        for arg in event["arguments"]:
            event_argument_set.add("{}-{}".format(arg["role"], arg["argument"]))
        flag = True
        for new_event in new_event_list:
            if event_type != new_event["event_type"]:
                continue
            new_event_argument_set = set()
            for arg in new_event["arguments"]:
                new_event_argument_set.add(
                    "{}-{}".format(arg["role"], arg["argument"]))
            # drop the candidate if it contains every argument of an
            # already-kept event of the same type
            if len(event_argument_set & new_event_argument_set) == len(
                    new_event_argument_set):
                flag = False
        if flag:
            new_event_list.append(event)
    doc["event_list"] = new_event_list
    return doc
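
# A minimal sketch of the merge behavior on a toy document (hypothetical
# values, not from the dataset): duplicate arguments inside an event are
# removed, and an event whose arguments are already covered by a kept event
# of the same type is dropped.
#
#   doc = {"event_list": [
#       {"event_type": "质押", "arguments": [
#           {"role": "质押方", "argument": "A公司"},
#           {"role": "质押方", "argument": "A公司"}]},
#       {"event_type": "质押", "arguments": [
#           {"role": "质押方", "argument": "A公司"}]}]}
#   event_normalization(doc)
#   # doc["event_list"] now contains a single 质押 event with one argument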

def predict_data_process(trigger_file, role_file, enum_file, schema_file,
                         save_path):
    """Merge trigger/role/enum predictions into the submission format."""
    pred_ret = []
    trigger_data = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    enum_data = read_by_lines(enum_file)
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_data), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("enum predict {} load from {}".format(len(enum_data), enum_file))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
    for s in schema_data:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # role predictions are keyed by (id, sent_id)
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append("".join(r["text"]))
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_role_mapping[_id] = role_ret

    # process the enum role predictions
    for d in enum_data:
        d_json = json.loads(d)
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        label = d_json["pred"]["label"]
        sent_enum_mapping[_id] = label

    # process trigger predictions: build one event per predicted event type
    for d in trigger_data:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[_id].items():
                if role_type not in role_list:
                    continue
                for arg in ags:
                    arguments.append({"role": role_type, "argument": arg})
            # special handling: the "环节" role comes from the enum classifier
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[_id]
                })
            event = {
                "event_type": event_type,
                "arguments": arguments,
                "text": d_json["text"]
            }
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    # merge sentence-level predictions into document-level results
    doc_pred = {}
    for d in pred_ret:
        if d["id"] not in doc_pred:
            doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
        doc_pred[d["id"]]["event_list"].extend(d["event_list"])

    # unify all prediction results and save them
    doc_pred = [
        json.dumps(event_normalization(r), ensure_ascii=False)
        for r in doc_pred.values()
    ]
    print("submit data {} save to {}".format(len(doc_pred), save_path))
    write_by_lines(save_path, doc_pred)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Post-process DuEE-Fin predictions into submission format")
    parser.add_argument(
        "--trigger_file", help="trigger model predict data path", required=True)
    parser.add_argument(
        "--role_file", help="role model predict data path", required=True)
    parser.add_argument(
        "--enum_file", help="enum model predict data path", required=True)
    parser.add_argument("--schema_file", help="schema file path", required=True)
    parser.add_argument("--save_path", help="save file path", required=True)
    args = parser.parse_args()
    predict_data_process(args.trigger_file, args.role_file, args.enum_file,
                         args.schema_file, args.save_path)
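
# Example invocation (paths mirror the defaults in the run script below):
#   python duee_fin_postprocess.py \
#       --trigger_file ./ckpt/DuEE-Fin/trigger/test_pred.json \
#       --role_file ./ckpt/DuEE-Fin/role/test_pred.json \
#       --enum_file ./ckpt/DuEE-Fin/enum/test_pred.json \
#       --schema_file ./conf/DuEE-Fin/event_schema.json \
#       --save_path ./submit/test_duee_fin.json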

# ==== run_classifier.sh ====
data_dir=${1}
conf_path=${2}
ckpt_dir=${3}
predict_data=${4}
learning_rate=${5}
is_train=${6}
max_seq_len=${7}
batch_size=${8}
epoch=${9}
pred_save_path=${10}
if [ "$is_train" = True ]; then
    unset CUDA_VISIBLE_DEVICES
    python -m paddle.distributed.launch --gpus "0" classifier.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train True \
        --do_predict False \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 1 \
        --valid_step 5 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
else
    export CUDA_VISIBLE_DEVICES=0
    python classifier.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train False \
        --do_predict True \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 1 \
        --valid_step 1 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
fi
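
# Example invocation with the 10 positional arguments, mirroring how the main
# pipeline script below calls this one (values taken from its defaults):
#   sh run_classifier.sh ./data/DuEE-Fin/enum ./conf/DuEE-Fin/enum_tag.dict \
#       ./ckpt/DuEE-Fin/enum ./data/DuEE-Fin/sentence/test.json 5e-5 True \
#       300 16 20 ./ckpt/DuEE-Fin/enum/test_pred.json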

# ==== run_duee_fin.sh (main pipeline script) ====
dataset_name=DuEE-Fin
data_dir=./data/${dataset_name}
conf_dir=./conf/${dataset_name}
ckpt_dir=./ckpt/${dataset_name}
submit_data_path=./submit/test_duee_fin.json
pred_data=${data_dir}/sentence/test.json  # change this path to predict on other data
learning_rate=5e-5
max_seq_len=300
batch_size=16
epoch=20
echo -e "check and create directory"
dir_list=(./ckpt ${ckpt_dir} ./submit)
for item in ${dir_list[*]}
do
    if [ ! -d ${item} ]; then
        mkdir ${item}
        echo "create dir * ${item} *"
    else
        echo "dir ${item} exists"
    fi
done
process_name=${1}
run_sequence_labeling_model(){
    model=${1}
    is_train=${2}
    pred_save_path=${ckpt_dir}/${model}/test_pred.json
    sh run_sequence_labeling.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}

run_classifier_model(){
    model=${1}
    is_train=${2}
    pred_save_path=${ckpt_dir}/${model}/test_pred.json
    sh run_classifier.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}
if [ ${process_name} == data_prepare ]; then
    echo -e "\nstart ${dataset_name} data prepare"
    python duee_fin_data_prepare.py
    echo -e "end ${dataset_name} data prepare"
elif [ ${process_name} == trigger_train ]; then
    echo -e "\nstart ${dataset_name} trigger train"
    run_sequence_labeling_model trigger True
    echo -e "end ${dataset_name} trigger train"
elif [ ${process_name} == trigger_predict ]; then
    echo -e "\nstart ${dataset_name} trigger predict"
    run_sequence_labeling_model trigger False
    echo -e "end ${dataset_name} trigger predict"
elif [ ${process_name} == role_train ]; then
    echo -e "\nstart ${dataset_name} role train"
    run_sequence_labeling_model role True
    echo -e "end ${dataset_name} role train"
elif [ ${process_name} == role_predict ]; then
    echo -e "\nstart ${dataset_name} role predict"
    run_sequence_labeling_model role False
    echo -e "end ${dataset_name} role predict"
elif [ ${process_name} == enum_train ]; then
    echo -e "\nstart ${dataset_name} enum train"
    run_classifier_model enum True
    echo -e "end ${dataset_name} enum train"
elif [ ${process_name} == enum_predict ]; then
    echo -e "\nstart ${dataset_name} enum predict"
    run_classifier_model enum False
    echo -e "end ${dataset_name} enum predict"
elif [ ${process_name} == pred_2_submit ]; then
    echo -e "\nstart ${dataset_name} predict data merge to submit format"
    python duee_fin_postprocess.py --trigger_file ${ckpt_dir}/trigger/test_pred.json --role_file ${ckpt_dir}/role/test_pred.json --enum_file ${ckpt_dir}/enum/test_pred.json --schema_file ${conf_dir}/event_schema.json --save_path ${submit_data_path}
    echo -e "end ${dataset_name} predict data merge"
else
    echo "unknown process name ${process_name}"
fi
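
# Typical end-to-end order, one stage per invocation (matching the branches
# above):
#   data_prepare -> trigger_train -> trigger_predict -> role_train ->
#   role_predict -> enum_train -> enum_predict -> pred_2_submit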

# ==== run_sequence_labeling.sh ====
data_dir=$1
conf_path=$2
ckpt_dir=$3
predict_data=$4
learning_rate=$5
is_train=$6
max_seq_len=$7
batch_size=$8
epoch=${9}
pred_save_path=${10}
if [ "$is_train" = True ]; then
    unset CUDA_VISIBLE_DEVICES
    python -m paddle.distributed.launch --gpus "0" sequence_labeling.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train True \
        --do_predict False \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 10 \
        --valid_step 50 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
else
    export CUDA_VISIBLE_DEVICES=0
    python sequence_labeling.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train False \
        --do_predict True \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 10 \
        --valid_step 50 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
fi
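
# Example invocation with the 10 positional arguments, mirroring how the main
# pipeline script calls this one (values taken from its defaults):
#   sh run_sequence_labeling.sh ./data/DuEE-Fin/trigger \
#       ./conf/DuEE-Fin/trigger_tag.dict ./ckpt/DuEE-Fin/trigger \
#       ./data/DuEE-Fin/sentence/test.json 5e-5 True 300 16 20 \
#       ./ckpt/DuEE-Fin/trigger/test_pred.json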

# ==== utils.py ====
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib

def cal_md5(data):
    """calculate the md5 hex digest of a byte string"""
    # round-trip through utf-8 to drop any invalid byte sequences
    data = data.decode("utf-8", "ignore").encode("utf-8", "ignore")
    return hashlib.md5(data).hexdigest()

def read_by_lines(path):
    """read the data line by line"""
    result = list()
    with open(path, "r", encoding="utf8") as infile:
        for line in infile:
            result.append(line.strip())
    return result


def write_by_lines(path, data):
    """write the data, one item per line"""
    with open(path, "w", encoding="utf8") as outfile:
        for d in data:
            outfile.write(d + "\n")

def text_to_sents(text):
    """split text into sentences on Chinese end-of-sentence punctuation"""
    delimiter_symbols = [u"。", u"?", u"!"]
    paragraphs = text.split("\n")
    ret = []
    for para in paragraphs:
        if para == u"":
            continue
        sents = [u""]
        for s in para:
            sents[-1] += s
            if s in delimiter_symbols:
                sents.append(u"")
        if sents[-1] == u"":
            sents = sents[:-1]
        ret.extend(sents)
    return ret
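
# A quick sketch of the splitting behavior on a toy string:
#   text_to_sents("今天天气不错。你好!明天见")
#   -> ["今天天气不错。", "你好!", "明天见"]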

def load_dict(dict_path):
    """load the tag-to-id dict (each line: id<TAB>tag)"""
    vocab = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            value, key = line.strip('\n').split('\t')
            vocab[key] = int(value)
    return vocab

def extract_result(text, labels):
    """decode BIO labels into spans of {start, text, type}"""
    ret, is_start, cur_type = [], False, None
    if len(text) != len(labels):
        # some characters (e.g. Korean) can make labels longer than the text
        labels = labels[:len(text)]
    for i, label in enumerate(labels):
        if label != u"O":
            _type = label[2:]
            if label.startswith(u"B-"):
                is_start = True
                cur_type = _type
                ret.append({"start": i, "text": [text[i]], "type": _type})
            elif _type != cur_type:
                """
                # to discard spans that lack a B- prefix, use instead:
                cur_type = None
                is_start = False
                """
                # treat a type change without B- as the start of a new span
                cur_type = _type
                is_start = True
                ret.append({"start": i, "text": [text[i]], "type": _type})
            elif is_start:
                ret[-1]["text"].append(text[i])
            else:
                cur_type = None
                is_start = False
        else:
            cur_type = None
            is_start = False
    return ret
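
# A quick sketch of the BIO decoding on toy inputs (hypothetical role type):
#   extract_result("张三质押", ["B-质押方", "I-质押方", "O", "O"])
#   -> [{"start": 0, "text": ["张", "三"], "type": "质押方"}]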

if __name__ == "__main__":
    s = "xxdedewd"
    print(cal_md5(s.encode("utf-8")))