Commit b4649acc by 20210828028

DuEE-fin

parent 15907570
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""duee finance data predict post-process"""
import os
import sys
import json
import argparse
from utils import read_by_lines, write_by_lines, extract_result
# the event type whose "环节" (phase) role comes from the enum classifier
# rather than the role sequence-labeling model
enum_event_type = "公司上市"
enum_role = "环节"

def event_normalization(doc):
    """Deduplicate arguments within each event, then drop events whose
    arguments are already covered by a kept event of the same type."""
    # remove duplicate (role, argument) pairs inside each event
    for event in doc.get("event_list", []):
        argument_list = []
        argument_set = set()
        for arg in event["arguments"]:
            arg_str = "{}-{}".format(arg["role"], arg["argument"])
            if arg_str not in argument_set:
                argument_list.append(arg)
                argument_set.add(arg_str)
        event["arguments"] = argument_list

    # keep events with more arguments first, so duplicates get dropped
    event_list = sorted(
        doc.get("event_list", []),
        key=lambda x: len(x["arguments"]),
        reverse=True)
    new_event_list = []
    for event in event_list:
        event_type = event["event_type"]
        event_argument_set = set()
        for arg in event["arguments"]:
            event_argument_set.add("{}-{}".format(arg["role"], arg["argument"]))
        flag = True
        for new_event in new_event_list:
            if event_type != new_event["event_type"]:
                continue
            new_event_argument_set = set()
            for arg in new_event["arguments"]:
                new_event_argument_set.add(
                    "{}-{}".format(arg["role"], arg["argument"]))
            # drop the candidate if it contains every argument of an
            # already-kept event of the same type
            if len(event_argument_set & new_event_argument_set) == len(
                    new_event_argument_set):
                flag = False
        if flag:
            new_event_list.append(event)
    doc["event_list"] = new_event_list
    return doc
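
# A minimal sketch of the merge behavior on a toy document (hypothetical
# values, not from the dataset): duplicate arguments inside an event are
# removed, and an event whose arguments are already covered by a kept event
# of the same type is dropped.
#
#   doc = {"event_list": [
#       {"event_type": "质押", "arguments": [
#           {"role": "质押方", "argument": "A公司"},
#           {"role": "质押方", "argument": "A公司"}]},
#       {"event_type": "质押", "arguments": [
#           {"role": "质押方", "argument": "A公司"}]}]}
#   event_normalization(doc)
#   # doc["event_list"] now contains a single 质押 event with one argument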

def predict_data_process(trigger_file, role_file, enum_file, schema_file,
                         save_path):
    """Merge trigger/role/enum predictions into the submission format."""
    pred_ret = []
    trigger_data = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    enum_data = read_by_lines(enum_file)
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_data), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("enum predict {} load from {}".format(len(enum_data), enum_file))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
    for s in schema_data:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # role predictions are keyed by (id, sent_id)
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append("".join(r["text"]))
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_role_mapping[_id] = role_ret

    # process the enum role predictions
    for d in enum_data:
        d_json = json.loads(d)
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        label = d_json["pred"]["label"]
        sent_enum_mapping[_id] = label

    # process trigger predictions: build one event per predicted event type
    for d in trigger_data:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[_id].items():
                if role_type not in role_list:
                    continue
                for arg in ags:
                    arguments.append({"role": role_type, "argument": arg})
            # special handling: the "环节" role comes from the enum classifier
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[_id]
                })
            event = {
                "event_type": event_type,
                "arguments": arguments,
                "text": d_json["text"]
            }
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    # merge sentence-level predictions into document-level results
    doc_pred = {}
    for d in pred_ret:
        if d["id"] not in doc_pred:
            doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
        doc_pred[d["id"]]["event_list"].extend(d["event_list"])

    # unify all prediction results and save them
    doc_pred = [
        json.dumps(event_normalization(r), ensure_ascii=False)
        for r in doc_pred.values()
    ]
    print("submit data {} save to {}".format(len(doc_pred), save_path))
    write_by_lines(save_path, doc_pred)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Post-process DuEE-Fin predictions into submission format")
    parser.add_argument(
        "--trigger_file", help="trigger model predict data path", required=True)
    parser.add_argument(
        "--role_file", help="role model predict data path", required=True)
    parser.add_argument(
        "--enum_file", help="enum model predict data path", required=True)
    parser.add_argument("--schema_file", help="schema file path", required=True)
    parser.add_argument("--save_path", help="save file path", required=True)
    args = parser.parse_args()
    predict_data_process(args.trigger_file, args.role_file, args.enum_file,
                         args.schema_file, args.save_path)
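
# Example invocation (paths mirror the defaults in the run script below):
#   python duee_fin_postprocess.py \
#       --trigger_file ./ckpt/DuEE-Fin/trigger/test_pred.json \
#       --role_file ./ckpt/DuEE-Fin/role/test_pred.json \
#       --enum_file ./ckpt/DuEE-Fin/enum/test_pred.json \
#       --schema_file ./conf/DuEE-Fin/event_schema.json \
#       --save_path ./submit/test_duee_fin.json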

# ==== run_classifier.sh ====
data_dir=${1}
conf_path=${2}
ckpt_dir=${3}
predict_data=${4}
learning_rate=${5}
is_train=${6}
max_seq_len=${7}
batch_size=${8}
epoch=${9}
pred_save_path=${10}
if [ "$is_train" = True ]; then
    unset CUDA_VISIBLE_DEVICES
    python -m paddle.distributed.launch --gpus "0" classifier.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train True \
        --do_predict False \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 1 \
        --valid_step 5 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
else
    export CUDA_VISIBLE_DEVICES=0
    python classifier.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train False \
        --do_predict True \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 1 \
        --valid_step 1 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
fi
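
# Example invocation with the 10 positional arguments, mirroring how the main
# pipeline script below calls this one (values taken from its defaults):
#   sh run_classifier.sh ./data/DuEE-Fin/enum ./conf/DuEE-Fin/enum_tag.dict \
#       ./ckpt/DuEE-Fin/enum ./data/DuEE-Fin/sentence/test.json 5e-5 True \
#       300 16 20 ./ckpt/DuEE-Fin/enum/test_pred.json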

# ==== run_duee_fin.sh (main pipeline script) ====
dataset_name=DuEE-Fin
data_dir=./data/${dataset_name}
conf_dir=./conf/${dataset_name}
ckpt_dir=./ckpt/${dataset_name}
submit_data_path=./submit/test_duee_fin.json
pred_data=${data_dir}/sentence/test.json  # change this path to predict on other data
learning_rate=5e-5
max_seq_len=300
batch_size=16
epoch=20
echo -e "check and create directory"
dir_list=(./ckpt ${ckpt_dir} ./submit)
for item in ${dir_list[*]}
do
    if [ ! -d ${item} ]; then
        mkdir ${item}
        echo "create dir * ${item} *"
    else
        echo "dir ${item} exists"
    fi
done
process_name=${1}
run_sequence_labeling_model(){
    model=${1}
    is_train=${2}
    pred_save_path=${ckpt_dir}/${model}/test_pred.json
    sh run_sequence_labeling.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}

run_classifier_model(){
    model=${1}
    is_train=${2}
    pred_save_path=${ckpt_dir}/${model}/test_pred.json
    sh run_classifier.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}
if [ ${process_name} == data_prepare ]; then
    echo -e "\nstart ${dataset_name} data prepare"
    python duee_fin_data_prepare.py
    echo -e "end ${dataset_name} data prepare"
elif [ ${process_name} == trigger_train ]; then
    echo -e "\nstart ${dataset_name} trigger train"
    run_sequence_labeling_model trigger True
    echo -e "end ${dataset_name} trigger train"
elif [ ${process_name} == trigger_predict ]; then
    echo -e "\nstart ${dataset_name} trigger predict"
    run_sequence_labeling_model trigger False
    echo -e "end ${dataset_name} trigger predict"
elif [ ${process_name} == role_train ]; then
    echo -e "\nstart ${dataset_name} role train"
    run_sequence_labeling_model role True
    echo -e "end ${dataset_name} role train"
elif [ ${process_name} == role_predict ]; then
    echo -e "\nstart ${dataset_name} role predict"
    run_sequence_labeling_model role False
    echo -e "end ${dataset_name} role predict"
elif [ ${process_name} == enum_train ]; then
    echo -e "\nstart ${dataset_name} enum train"
    run_classifier_model enum True
    echo -e "end ${dataset_name} enum train"
elif [ ${process_name} == enum_predict ]; then
    echo -e "\nstart ${dataset_name} enum predict"
    run_classifier_model enum False
    echo -e "end ${dataset_name} enum predict"
elif [ ${process_name} == pred_2_submit ]; then
    echo -e "\nstart ${dataset_name} predict data merge to submit format"
    python duee_fin_postprocess.py --trigger_file ${ckpt_dir}/trigger/test_pred.json --role_file ${ckpt_dir}/role/test_pred.json --enum_file ${ckpt_dir}/enum/test_pred.json --schema_file ${conf_dir}/event_schema.json --save_path ${submit_data_path}
    echo -e "end ${dataset_name} predict data merge"
else
    echo "unknown process name ${process_name}"
fi
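
# Typical end-to-end order, one stage per invocation (matching the branches
# above):
#   data_prepare -> trigger_train -> trigger_predict -> role_train ->
#   role_predict -> enum_train -> enum_predict -> pred_2_submit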

# ==== run_sequence_labeling.sh ====
data_dir=$1
conf_path=$2
ckpt_dir=$3
predict_data=$4
learning_rate=$5
is_train=$6
max_seq_len=$7
batch_size=$8
epoch=${9}
pred_save_path=${10}
if [ "$is_train" = True ]; then
    unset CUDA_VISIBLE_DEVICES
    python -m paddle.distributed.launch --gpus "0" sequence_labeling.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train True \
        --do_predict False \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 10 \
        --valid_step 50 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
else
    export CUDA_VISIBLE_DEVICES=0
    python sequence_labeling.py \
        --num_epoch ${epoch} \
        --learning_rate ${learning_rate} \
        --tag_path ${conf_path} \
        --train_data ${data_dir}/train.tsv \
        --dev_data ${data_dir}/dev.tsv \
        --test_data ${data_dir}/test.tsv \
        --predict_data ${predict_data} \
        --do_train False \
        --do_predict True \
        --max_seq_len ${max_seq_len} \
        --batch_size ${batch_size} \
        --skip_step 10 \
        --valid_step 50 \
        --checkpoints ${ckpt_dir} \
        --init_ckpt ${ckpt_dir}/best.pdparams \
        --predict_save_path ${pred_save_path} \
        --device gpu
fi
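
# Example invocation with the 10 positional arguments, mirroring how the main
# pipeline script calls this one (values taken from its defaults):
#   sh run_sequence_labeling.sh ./data/DuEE-Fin/trigger \
#       ./conf/DuEE-Fin/trigger_tag.dict ./ckpt/DuEE-Fin/trigger \
#       ./data/DuEE-Fin/sentence/test.json 5e-5 True 300 16 20 \
#       ./ckpt/DuEE-Fin/trigger/test_pred.json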

# ==== utils.py ====
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib

def cal_md5(data):
    """calculate the md5 hex digest of a byte string"""
    # round-trip through utf-8 to drop any invalid byte sequences
    data = data.decode("utf-8", "ignore").encode("utf-8", "ignore")
    return hashlib.md5(data).hexdigest()

def read_by_lines(path):
    """read the data line by line"""
    result = list()
    with open(path, "r", encoding="utf8") as infile:
        for line in infile:
            result.append(line.strip())
    return result


def write_by_lines(path, data):
    """write the data, one item per line"""
    with open(path, "w", encoding="utf8") as outfile:
        for d in data:
            outfile.write(d + "\n")

def text_to_sents(text):
    """split text into sentences on Chinese end-of-sentence punctuation"""
    delimiter_symbols = [u"。", u"?", u"!"]
    paragraphs = text.split("\n")
    ret = []
    for para in paragraphs:
        if para == u"":
            continue
        sents = [u""]
        for s in para:
            sents[-1] += s
            if s in delimiter_symbols:
                sents.append(u"")
        if sents[-1] == u"":
            sents = sents[:-1]
        ret.extend(sents)
    return ret
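
# A quick sketch of the splitting behavior on a toy string:
#   text_to_sents("今天天气不错。你好!明天见")
#   -> ["今天天气不错。", "你好!", "明天见"]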

def load_dict(dict_path):
    """load the tag-to-id dict (each line: id<TAB>tag)"""
    vocab = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            value, key = line.strip('\n').split('\t')
            vocab[key] = int(value)
    return vocab

def extract_result(text, labels):
    """decode BIO labels into spans of {start, text, type}"""
    ret, is_start, cur_type = [], False, None
    if len(text) != len(labels):
        # some characters (e.g. Korean) can make labels longer than the text
        labels = labels[:len(text)]
    for i, label in enumerate(labels):
        if label != u"O":
            _type = label[2:]
            if label.startswith(u"B-"):
                is_start = True
                cur_type = _type
                ret.append({"start": i, "text": [text[i]], "type": _type})
            elif _type != cur_type:
                """
                # to discard spans that lack a B- prefix, use instead:
                cur_type = None
                is_start = False
                """
                # treat a type change without B- as the start of a new span
                cur_type = _type
                is_start = True
                ret.append({"start": i, "text": [text[i]], "type": _type})
            elif is_start:
                ret[-1]["text"].append(text[i])
            else:
                cur_type = None
                is_start = False
        else:
            cur_type = None
            is_start = False
    return ret
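
# A quick sketch of the BIO decoding on toy inputs (hypothetical role type):
#   extract_result("张三质押", ["B-质押方", "I-质押方", "O", "O"])
#   -> [{"start": 0, "text": ["张", "三"], "type": "质押方"}]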

if __name__ == "__main__":
    s = "xxdedewd"
    print(cal_md5(s.encode("utf-8")))