Commit b4649acc by 20210828028

DuEE-fin

parent 15907570
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Text classification (used for the DuEE-fin enumerable role)
"""
import ast
import os
import csv
import json
import warnings
import random
import argparse
import traceback
from functools import partial
from collections import namedtuple
import numpy as np
import paddle
import paddle.nn.functional as F
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from utils import read_by_lines, write_by_lines, load_dict
# warnings.filterwarnings('ignore')
"""
For All pre-trained model(English and Chinese),
Please refer to https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst.
"""
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--tag_path", type=str, default=None, help="tag set path")
parser.add_argument("--train_data", type=str, default=None, help="train data")
parser.add_argument("--dev_data", type=str, default=None, help="dev data")
parser.add_argument("--test_data", type=str, default=None, help="test data")
parser.add_argument("--predict_data", type=str, default=None, help="predict data")
parser.add_argument("--do_train", type=ast.literal_eval, default=True, help="do train")
parser.add_argument("--do_predict", type=ast.literal_eval, default=True, help="do predict")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--valid_step", type=int, default=100, help="validation step")
parser.add_argument("--skip_step", type=int, default=20, help="skip step")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--checkpoints", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--init_ckpt", type=str, default=None, help="already pretraining model checkpoint")
parser.add_argument("--predict_save_path", type=str, default=None, help="predict data save path")
parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable.
def set_seed(random_seed):
"""sets random seed"""
random.seed(random_seed)
np.random.seed(random_seed)
paddle.seed(random_seed)
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
"""
Given a dataset, it evals model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
criterion(obj:`paddle.nn.Layer`): It can compute the loss.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
metric.reset()
losses = []
for batch in data_loader:
input_ids, token_type_ids, labels = batch
logits = model(input_ids, token_type_ids)
loss = criterion(logits, labels)
losses.append(loss.numpy())
correct = metric.compute(logits, labels)
metric.update(correct)
accuracy = metric.accumulate()
metric.reset()
model.train()
return float(np.mean(losses)), accuracy
def convert_example(example, tokenizer, label_map=None, max_seq_len=512, is_test=False):
"""convert_example"""
has_text_b = False
if isinstance(example, dict):
has_text_b = "text_b" in example.keys()
else:
has_text_b = "text_b" in example._fields
text_b = None
if has_text_b:
text_b = example.text_b
tokenized_input = tokenizer(
text=example.text_a,
text_pair=text_b,
max_seq_len=max_seq_len)
input_ids = tokenized_input['input_ids']
token_type_ids = tokenized_input['token_type_ids']
if is_test:
return input_ids, token_type_ids
else:
label = np.array([label_map[example.label]], dtype="int64")
return input_ids, token_type_ids, label
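# Illustrative behaviour (values are hypothetical): for Example(text_a="某公司发布公告", label="公司上市")
# and label_map={"公司上市": 0, ...}, this returns (input_ids, token_type_ids, np.array([0])).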
class DuEventExtraction(paddle.io.Dataset):
"""Du"""
def __init__(self, data_path, tag_path):
self.label_vocab = load_dict(tag_path)
self.examples = self._read_tsv(data_path)
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
headers = next(reader)
text_indices = [
index for index, h in enumerate(headers) if h != "label"]
Example = namedtuple('Example', headers)
examples = []
for line in reader:
for index, text in enumerate(line):
if index in text_indices:
line[index] = text
try:
example = Example(*line)
except Exception as e:
traceback.print_exc()
raise Exception(e)
examples.append(example)
return examples
def __len__(self):
return len(self.examples)
def __getitem__(self, index):
return self.examples[index]
def data_2_examples(datas):
"""data_2_examples"""
has_text_b, examples = False, []
if isinstance(datas[0], list):
Example = namedtuple('Example', ["text_a", "text_b"])
has_text_b = True
else:
Example = namedtuple('Example', ["text_a"])
for item in datas:
if has_text_b:
example = Example(text_a=item[0], text_b=item[1])
else:
example = Example(text_a=item)
examples.append(example)
return examples
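# e.g. data_2_examples(["文本A"]) -> [Example(text_a="文本A")]
#      data_2_examples([["文本A", "文本B"]]) -> [Example(text_a="文本A", text_b="文本B")]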
def do_train():
paddle.set_device(args.device)
world_size = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
if world_size > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
label_map = load_dict(args.tag_path)
id2label = {val: key for key, val in label_map.items()}
model = ErnieForSequenceClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))
model = paddle.DataParallel(model)
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
print("============start train==========")
train_ds = DuEventExtraction(args.train_data, args.tag_path)
dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
test_ds = DuEventExtraction(args.test_data, args.tag_path)
trans_func = partial(
convert_example, tokenizer=tokenizer, label_map=label_map, max_seq_len=args.max_seq_len)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),
Stack(dtype="int64") # label
): fn(list(map(trans_func, samples)))
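# batchify_fn maps each raw sample through trans_func, then pads input_ids and
# token_type_ids to the longest sequence in the batch with the [PAD] id and stacks the labels.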
batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True)
train_loader = paddle.io.DataLoader(
dataset=train_ds,
batch_sampler=batch_sampler,
collate_fn=batchify_fn)
dev_loader = paddle.io.DataLoader(
dataset=dev_ds,
batch_size=args.batch_size,
collate_fn=batchify_fn)
test_loader = paddle.io.DataLoader(
dataset=test_ds,
batch_size=args.batch_size,
collate_fn=batchify_fn)
num_training_steps = len(train_loader) * args.num_epoch
metric = paddle.metric.Accuracy()
criterion = paddle.nn.loss.CrossEntropyLoss()
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
learning_rate=args.learning_rate,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params)
step, best_performance = 0, 0.0
model.train()
for epoch in range(args.num_epoch):
for idx, (input_ids, token_type_ids, labels) in enumerate(train_loader):
logits = model(input_ids, token_type_ids)
loss = criterion(logits, labels)
probs = F.softmax(logits, axis=1)
correct = metric.compute(probs, labels)
metric.update(correct)
acc = metric.accumulate()
loss.backward()
optimizer.step()
optimizer.clear_grad()
loss_item = loss.numpy().item()
if step > 0 and step % args.skip_step == 0 and rank == 0:
print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) ' \
f'- loss: {loss_item:.6f} acc {acc:.5f}')
if step > 0 and step % args.valid_step == 0 and rank == 0:
loss_dev, acc_dev = evaluate(model, criterion, metric, dev_loader)
print(f'dev step: {step} - loss: {loss_dev:.6f} accuracy: {acc_dev:.5f}, ' \
f'current best {best_performance:.5f}')
if acc_dev > best_performance:
best_performance = acc_dev
print(f'==============================================save best model ' \
f'best performance {best_performance:.5f}')
paddle.save(model.state_dict(), '{}/best.pdparams'.format(args.checkpoints))
step += 1
# save the final model
if rank == 0:
paddle.save(model.state_dict(), '{}/final.pdparams'.format(args.checkpoints))
def do_predict():
set_seed(args.seed)
paddle.set_device(args.device)
label_map = load_dict(args.tag_path)
id2label = {val: key for key, val in label_map.items()}
model = ErnieForSequenceClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))
model = paddle.DataParallel(model)
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
print("============start predict==========")
if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
else:
state_dict = paddle.load(args.init_ckpt)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.init_ckpt)
# load data from predict file
sentences = read_by_lines(args.predict_data) # origin data format
sentences = [json.loads(sent) for sent in sentences]
encoded_inputs_list = []
for sent in sentences:
sent = sent["text"]
input_sent = [sent] # only text_a
if "text_b" in sent:
input_sent = [[sent, sent["text_b"]]] # add text_b
example = data_2_examples(input_sent)[0]
input_ids, token_type_ids = convert_example(example, tokenizer,
max_seq_len=args.max_seq_len, is_test=True)
encoded_inputs_list.append((input_ids, token_type_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
): fn(samples)
# Separate the data into batches.
batch_encoded_inputs = [encoded_inputs_list[i: i + args.batch_size]
for i in range(0, len(encoded_inputs_list), args.batch_size)]
results = []
model.eval()
for batch in batch_encoded_inputs:
input_ids, token_type_ids = batchify_fn(batch)
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
logits = model(input_ids, token_type_ids)
probs = F.softmax(logits, axis=1)
probs_ids = paddle.argmax(probs, -1).numpy()
probs = probs.numpy()
for prob_one, p_id in zip(probs.tolist(), probs_ids.tolist()):
label_probs = {}
for idx, p in enumerate(prob_one):
label_probs[id2label[idx]] = p
results.append({"probs": label_probs, "label": id2label[p_id]})
assert len(results) == len(sentences)
for sent, ret in zip(sentences, results):
sent["pred"] = ret
sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
write_by_lines(args.predict_save_path, sentences)
print("save data {} to {}".format(len(sentences), args.predict_save_path))
if __name__ == '__main__':
if args.do_train:
do_train()
elif args.do_predict:
do_predict()
\ No newline at end of file
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""duee finance dataset proces"""
import os
import sys
import json
from utils import read_by_lines, write_by_lines, text_to_sents, cal_md5
enum_role = "环节"
def data_process(path, model="trigger", is_predict=False):
"""data_process"""
def label_data(data, start, l, _type):
"""label_data"""
for i in range(start, start + l):
suffix = "B-" if i == start else "I-"
data[i] = "{}{}".format(suffix, _type)
return data
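# e.g. label_data(["O"] * 5, start=1, l=2, _type="质押") -> ["O", "B-质押", "I-质押", "O", "O"]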
sentences = []
output = ["text_a"] if is_predict else ["text_a\tlabel"]
for line in read_by_lines(path):
d_json = json.loads(line)
_id = d_json["id"]
text_a = [
"," if t == " " or t == "\n" or t == "\t" else t
for t in list(d_json["text"].lower())
]
if is_predict:
sentences.append({"text": d_json["text"], "id": _id})
output.append('\002'.join(text_a))
else:
if model == u"trigger":
labels = ["O"] * len(text_a)
if len(d_json.get("event_list", [])) == 0:
continue
for event in d_json.get("event_list", []):
event_type = event["event_type"]
start = event["trigger_start_index"]
trigger = event["trigger"]
labels = label_data(labels, start, len(trigger), event_type)
output.append("{}\t{}".format('\002'.join(text_a), '\002'.join(
labels)))
elif model == u"role":
for event in d_json.get("event_list", []):
labels = ["O"] * len(text_a)
for arg in event["arguments"]:
role_type = arg["role"]
if role_type == enum_role:
continue
argument = arg["argument"]
start = arg["argument_start_index"]
labels = label_data(labels, start,
len(argument), role_type)
output.append("{}\t{}".format('\002'.join(text_a),
'\002'.join(labels)))
return output
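# Each non-header output row is "<chars joined by \002>\t<BIO tags joined by \002>",
# or just the \002-joined characters when is_predict is True.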
def enum_data_process(path, is_predict=False):
"""enum_data_process"""
output = ["text_a"] if is_predict else ["label\ttext_a"]
for line in read_by_lines(path):
d_json = json.loads(line)
text = d_json["text"].lower().replace("\t", " ")
if is_predict:
output.append(text)
continue
if len(d_json.get("event_list", [])) == 0:
continue
label = None
for event in d_json["event_list"]:
if event["event_type"] != "公司上市":
continue
for argument in event["arguments"]:
role_type = argument["role"]
if role_type == enum_role:
label = argument["argument"]
if label:
output.append("{}\t{}".format(label, text))
return output
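# Each non-header output row is "<enum label>\t<lower-cased text>", where the label is the
# argument filling the enum role ("环节") of a "公司上市" event.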
def schema_process(path, model="trigger"):
"""schema_process"""
def label_add(labels, _type):
"""label_add"""
if "B-{}".format(_type) not in labels:
labels.extend(["B-{}".format(_type), "I-{}".format(_type)])
return labels
labels = []
for line in read_by_lines(path):
d_json = json.loads(line.strip())
if model == "trigger":
labels = label_add(labels, d_json["event_type"])
elif model == "role":
for role in d_json["role_list"]:
if role["role"] == enum_role:
continue
labels = label_add(labels, role["role"])
elif model == "enum":
for role in d_json["role_list"]:
if role["role"] == enum_role:
labels = role["enum_items"]
labels.append("O")
tags = []
for index, label in enumerate(labels):
tags.append("{}\t{}".format(index, label))
if model == "enum":
tags = tags[:-1]
return tags
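# The returned tags are "index\tlabel" lines (e.g. "0\tB-质押"); for model == "enum" the
# trailing "O" is dropped because the enum task is handled as sentence-level classification.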
def marked_doc_2_sentence(doc):
"""marked_doc_2_sentence"""
def argument_in_sent(sent, argument_list, trigger):
"""argument_in_sent"""
trigger_start = sent.find(trigger)
if trigger_start < 0:
return trigger_start, [], None
new_arguments, enum_argument = [], None
for argument in argument_list:
word = argument["argument"]
role_type = argument["role"]
if role_type == enum_role:
# special
enum_argument = argument
continue
start = sent.find(word)
if start < 0:
continue
new_arguments.append({
"role": role_type,
"argument": word,
"argument_start_index": start
})
return trigger_start, new_arguments, enum_argument
title = doc["title"]
text = doc["text"]
sents = text_to_sents(text)
exist_sents, sent_mapping_event, sents_order = set(), {}, []
step = 3
batch_sents = [sents[i:i + step] for i in range(0, len(sents), step)]
if len(title) > 0:
batch_sents = [[title]] + batch_sents
for batch in batch_sents:
b_sent = " ".join(batch).replace("\n", " ").replace(
"\r\n", " ").replace("\r", " ").replace("\t", " ")
if b_sent in sent_mapping_event:
continue
sent_id = cal_md5(b_sent.encode("utf-8"))
sent_mapping_event[b_sent] = {
"id": doc["id"],
"sent_id": sent_id,
"text": b_sent
}
sents_order.append(b_sent)
for event in doc.get("event_list", []):
cur_sent, trigger_start, arguments, enum_argument = "", -1, [], None
for sent in sents_order:
tri_start, argus, enum_arg = argument_in_sent(
sent, event["arguments"], event["trigger"])
if tri_start < 0:
continue
if len(argus) > len(arguments):
cur_sent, trigger_start, arguments = sent, tri_start, argus
if enum_arg:
enum_argument = enum_arg
if trigger_start >= 0 and len(arguments) > 0:
# add enum 2 event
if enum_argument:
arguments.append(enum_argument)
if "event_list" not in sent_mapping_event[cur_sent]:
sent_mapping_event[cur_sent]["event_list"] = []
new_event = {
"arguments": arguments,
"event_type": event["event_type"],
"trigger": event["trigger"],
"trigger_start_index": trigger_start
}
sent_mapping_event[cur_sent]["event_list"].append(new_event)
return sent_mapping_event.values()
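# Each returned value has the shape {"id": doc_id, "sent_id": md5_of_text, "text": sentence},
# plus an "event_list" field when at least one event could be aligned to that sentence.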
def docs_data_process(path):
"""docs_data_process"""
lines = read_by_lines(path)
sentences = []
for line in lines:
d_json = json.loads(line)
sentences.extend(marked_doc_2_sentence(d_json))
sentences = [json.dumps(s, ensure_ascii=False) for s in sentences]
return sentences
if __name__ == "__main__":
# schema process
print("\n=================DUEE FINANCE DATASET==============")
conf_dir = "./conf/DuEE-Fin"
schema_path = 'DuEE_DuIE_data/data_DuEE/DuEE-Fin/duee_fin_event_schema.json'  # previously: "{}/event_schema.json".format(conf_dir)
tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir)
tags_role_path = "{}/role_tag.dict".format(conf_dir)
tags_enum_path = "{}/enum_tag.dict".format(conf_dir)
print("\n=================start schema process==============")
print('input path {}'.format(schema_path))
tags_trigger = schema_process(schema_path, "trigger")
write_by_lines(tags_trigger_path, tags_trigger)
print("save trigger tag {} at {}".format(
len(tags_trigger), tags_trigger_path))
tags_role = schema_process(schema_path, "role")
write_by_lines(tags_role_path, tags_role)
print("save trigger tag {} at {}".format(len(tags_role), tags_role_path))
tags_enum = schema_process(schema_path, "enum")
write_by_lines(tags_enum_path, tags_enum)
print("save enum enum tag {} at {}".format(len(tags_enum), tags_enum_path))
print("=================end schema process===============")
# data process
data_dir = "./DuEE_DuIE_data/data_DuEE/DuEE-Fin"
sentence_dir = "{}/sentence".format(data_dir)
trigger_save_dir = "{}/trigger".format(data_dir)
role_save_dir = "{}/role".format(data_dir)
enum_save_dir = "{}/enum".format(data_dir)
print("\n=================start data process==============")
print("\n********** start document process **********")
if not os.path.exists(sentence_dir):
os.makedirs(sentence_dir)
train_sent = docs_data_process("{}/duee_fin_train.json".format(data_dir))
write_by_lines("{}/train.json".format(sentence_dir), train_sent)
dev_sent = docs_data_process("{}/duee_fin_dev.json".format(data_dir))
write_by_lines("{}/dev.json".format(sentence_dir), dev_sent)
test_sent = docs_data_process("{}/duee_fin_test1.json".format(data_dir))
write_by_lines("{}/test.json".format(sentence_dir), test_sent)
print("train {} dev {} test {}".format(
len(train_sent), len(dev_sent), len(test_sent)))
print("********** end document process **********")
print("\n********** start sentence process **********")
print("\n----trigger------for dir {} to {}".format(sentence_dir,
trigger_save_dir))
if not os.path.exists(trigger_save_dir):
os.makedirs(trigger_save_dir)
train_tri = data_process("{}/train.json".format(sentence_dir), "trigger")
write_by_lines("{}/train.tsv".format(trigger_save_dir), train_tri)
dev_tri = data_process("{}/dev.json".format(sentence_dir), "trigger")
write_by_lines("{}/dev.tsv".format(trigger_save_dir), dev_tri)
test_tri = data_process("{}/test.json".format(sentence_dir), "trigger")
write_by_lines("{}/test.tsv".format(trigger_save_dir), test_tri)
print("train {} dev {} test {}".format(
len(train_tri), len(dev_tri), len(test_tri)))
print("\n----role------for dir {} to {}".format(sentence_dir,
role_save_dir))
if not os.path.exists(role_save_dir):
os.makedirs(role_save_dir)
train_role = data_process("{}/train.json".format(sentence_dir), "role")
write_by_lines("{}/train.tsv".format(role_save_dir), train_role)
dev_role = data_process("{}/dev.json".format(sentence_dir), "role")
write_by_lines("{}/dev.tsv".format(role_save_dir), dev_role)
test_role = data_process("{}/test.json".format(sentence_dir), "role")
write_by_lines("{}/test.tsv".format(role_save_dir), test_role)
print("train {} dev {} test {}".format(
len(train_role), len(dev_role), len(test_role)))
print("\n----enum------for dir {} to {}".format(sentence_dir,
enum_save_dir))
if not os.path.exists(enum_save_dir):
os.makedirs(enum_save_dir)
train_enum = enum_data_process("{}/train.json".format(sentence_dir))
write_by_lines("{}/train.tsv".format(enum_save_dir), train_enum)
dev_enum = enum_data_process("{}/dev.json".format(sentence_dir))
write_by_lines("{}/dev.tsv".format(enum_save_dir), dev_enum)
test_enum = enum_data_process("{}/test.json".format(sentence_dir))
write_by_lines("{}/test.tsv".format(enum_save_dir), test_enum)
print("train {} dev {} test {}".format(
len(trian_enum), len(dev_enum), len(test_enum)))
print("********** end sentence process **********")
print("=================end data process==============")
\ No newline at end of file
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""duee finance data predict post-process"""
import os
import sys
import json
import argparse
from utils import read_by_lines, write_by_lines, extract_result
enum_event_type = "公司上市"
enum_role = "环节"
def event_normalization(doc):
"""event_merge"""
for event in doc.get("event_list", []):
argument_list = []
argument_set = set()
for arg in event["arguments"]:
arg_str = "{}-{}".format(arg["role"], arg["argument"])
if arg_str not in argument_set:
argument_list.append(arg)
argument_set.add(arg_str)
event["arguments"] = argument_list
event_list = sorted(
doc.get("event_list", []),
key=lambda x: len(x["arguments"]),
reverse=True)
new_event_list = []
for event in event_list:
event_type = event["event_type"]
event_argument_set = set()
for arg in event["arguments"]:
event_argument_set.add("{}-{}".format(arg["role"], arg["argument"]))
flag = True
for new_event in new_event_list:
if event_type != new_event["event_type"]:
continue
new_event_argument_set = set()
for arg in new_event["arguments"]:
new_event_argument_set.add("{}-{}".format(arg["role"], arg[
"argument"]))
if len(event_argument_set & new_event_argument_set) == len(
new_event_argument_set):
flag = False
if flag:
new_event_list.append(event)
doc["event_list"] = new_event_list
return doc
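# In short: duplicate (role, argument) pairs inside an event are removed first, and an event
# is then dropped when an already-kept event of the same type has an argument set fully
# contained in its own (the list is scanned largest-argument-set first).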
def predict_data_process(trigger_file, role_file, enum_file, schema_file,
save_path):
"""predict_data_process"""
pred_ret = []
trigger_data = read_by_lines(trigger_file)
role_data = read_by_lines(role_file)
enum_data = read_by_lines(enum_file)
schema_data = read_by_lines(schema_file)
print("trigger predict {} load from {}".format(
len(trigger_data), trigger_file))
print("role predict {} load from {}".format(len(role_data), role_file))
print("enum predict {} load from {}".format(len(enum_data), enum_file))
print("schema {} load from {}".format(len(schema_data), schema_file))
schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
for s in schema_data:
d_json = json.loads(s)
schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]
# role predictions are keyed by (id, sent_id)
for d in role_data:
d_json = json.loads(d)
r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
role_ret = {}
for r in r_ret:
role_type = r["type"]
if role_type not in role_ret:
role_ret[role_type] = []
role_ret[role_type].append("".join(r["text"]))
_id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
sent_role_mapping[_id] = role_ret
# process the enum_role data
for d in enum_data:
d_json = json.loads(d)
_id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
label = d_json["pred"]["label"]
sent_enum_mapping[_id] = label
# process trigger data
for d in trigger_data:
d_json = json.loads(d)
t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
pred_event_types = list(set([t["type"] for t in t_ret]))
event_list = []
_id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
for event_type in pred_event_types:
role_list = schema[event_type]
arguments = []
for role_type, ags in sent_role_mapping[_id].items():
if role_type not in role_list:
continue
for arg in ags:
arguments.append({"role": role_type, "argument": arg})
# special handling: attach the enum role ("环节") argument for "公司上市" events
if event_type == enum_event_type:
arguments.append({
"role": enum_role,
"argument": sent_enum_mapping[_id]
})
event = {
"event_type": event_type,
"arguments": arguments,
"text": d_json["text"]
}
event_list.append(event)
pred_ret.append({
"id": d_json["id"],
"sent_id": d_json["sent_id"],
"text": d_json["text"],
"event_list": event_list
})
doc_pred = {}
for d in pred_ret:
if d["id"] not in doc_pred:
doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
doc_pred[d["id"]]["event_list"].extend(d["event_list"])
# unify all prediction results and save them
doc_pred = [
json.dumps(
event_normalization(r), ensure_ascii=False)
for r in doc_pred.values()
]
print("submit data {} save to {}".format(len(doc_pred), save_path))
write_by_lines(save_path, doc_pred)
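# Each submitted line is {"id": doc_id, "event_list": [...]}, where every event carries
# "event_type", "arguments" and the source sentence "text".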
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Official evaluation script for DuEE version 1.0")
parser.add_argument(
"--trigger_file", help="trigger model predict data path", required=True)
parser.add_argument(
"--role_file", help="role model predict data path", required=True)
parser.add_argument(
"--enum_file", help="enum model predict data path", required=True)
parser.add_argument("--schema_file", help="schema file path", required=True)
parser.add_argument("--save_path", help="save file path", required=True)
args = parser.parse_args()
predict_data_process(args.trigger_file, args.role_file, args.enum_file,
args.schema_file, args.save_path)
\ No newline at end of file
data_dir=${1}
conf_path=${2}
ckpt_dir=${3}
predict_data=${4}
learning_rate=${5}
is_train=${6}
max_seq_len=${7}
batch_size=${8}
epoch=${9}
pred_save_path=${10}
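# Example invocation (mirrors run_classifier_model in the driver script; this file is
# assumed to be saved as run_classifier.sh, the name the driver calls):
#   sh run_classifier.sh ./data/DuEE-Fin/enum ./conf/DuEE-Fin/enum_tag.dict \
#       ./ckpt/DuEE-Fin/enum ./data/DuEE-Fin/sentence/test.json 5e-5 True 300 16 20 \
#       ./ckpt/DuEE-Fin/enum/test_pred.json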
if [ "$is_train" = True ]; then
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" classifier.py \
--num_epoch ${epoch} \
--learning_rate ${learning_rate} \
--tag_path ${conf_path} \
--train_data ${data_dir}/train.tsv \
--dev_data ${data_dir}/dev.tsv \
--test_data ${data_dir}/test.tsv \
--predict_data ${predict_data} \
--do_train True \
--do_predict False \
--max_seq_len ${max_seq_len} \
--batch_size ${batch_size} \
--skip_step 1 \
--valid_step 5 \
--checkpoints ${ckpt_dir} \
--init_ckpt ${ckpt_dir}/best.pdparams \
--predict_save_path ${pred_save_path} \
--device gpu
else
export CUDA_VISIBLE_DEVICES=0
python classifier.py \
--num_epoch ${epoch} \
--learning_rate ${learning_rate} \
--tag_path ${conf_path} \
--train_data ${data_dir}/train.tsv \
--dev_data ${data_dir}/dev.tsv \
--test_data ${data_dir}/test.tsv \
--predict_data ${predict_data} \
--do_train False \
--do_predict True \
--max_seq_len ${max_seq_len} \
--batch_size ${batch_size} \
--skip_step 1 \
--valid_step 1 \
--checkpoints ${ckpt_dir} \
--init_ckpt ${ckpt_dir}/best.pdparams \
--predict_save_path ${pred_save_path} \
--device gpu
fi
\ No newline at end of file
dataset_name=DuEE-Fin
data_dir=./data/${dataset_name}
conf_dir=./conf/${dataset_name}
ckpt_dir=./ckpt/${dataset_name}
submit_data_path=./submit/test_duee_fin.json
pred_data=${data_dir}/sentence/test.json # change this path to predict on other data
learning_rate=5e-5
max_seq_len=300
batch_size=16
epoch=20
echo -e "check and create directory"
dir_list=(./ckpt ${ckpt_dir} ./submit)
for item in ${dir_list[*]}
do
if [ ! -d ${item} ]; then
mkdir ${item}
echo "create dir * ${item} *"
else
echo "dir ${item} exist"
fi
done
process_name=${1}
run_sequence_labeling_model(){
model=${1}
is_train=${2}
pred_save_path=${ckpt_dir}/${model}/test_pred.json
sh run_sequence_labeling.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}
run_classifier_model(){
model=${1}
is_train=${2}
pred_save_path=${ckpt_dir}/${model}/test_pred.json
sh run_classifier.sh ${data_dir}/${model} ${conf_dir}/${model}_tag.dict ${ckpt_dir}/${model} ${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
}
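# Typical pipeline, assuming this driver script is saved as run_duee_fin.sh:
#   sh run_duee_fin.sh data_prepare
#   sh run_duee_fin.sh trigger_train && sh run_duee_fin.sh trigger_predict
#   sh run_duee_fin.sh role_train    && sh run_duee_fin.sh role_predict
#   sh run_duee_fin.sh enum_train    && sh run_duee_fin.sh enum_predict
#   sh run_duee_fin.sh pred_2_submit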
if [ ${process_name} == data_prepare ]; then
echo -e "\nstart ${dataset_name} data prepare"
python duee_fin_data_prepare.py
echo -e "end ${dataset_name} data prepare"
elif [ ${process_name} == trigger_train ]; then
echo -e "\nstart ${dataset_name} trigger train"
run_sequence_labeling_model trigger True
echo -e "end ${dataset_name} trigger train"
elif [ ${process_name} == trigger_predict ]; then
echo -e "\nstart ${dataset_name} trigger predict"
run_sequence_labeling_model trigger False
echo -e "end ${dataset_name} trigger predict"
elif [ ${process_name} == role_train ]; then
echo -e "\nstart ${dataset_name} role train"
run_sequence_labeling_model role True
echo -e "end ${dataset_name} role train"
elif [ ${process_name} == role_predict ]; then
echo -e "\nstart ${dataset_name} role predict"
run_sequence_labeling_model role False
echo -e "end ${dataset_name} role predict"
elif [ ${process_name} == enum_train ]; then
echo -e "\nstart ${dataset_name} enum train"
run_classifier_model enum True
echo -e "end ${dataset_name} enum train"
elif [ ${process_name} == enum_predict ]; then
echo -e "\nstart ${dataset_name} enum predict"
run_classifier_model enum False
echo -e "end ${dataset_name} enum predict"
elif [ ${process_name} == pred_2_submit ]; then
echo -e "\nstart ${dataset_name} predict data merge to submit fotmat"
python duee_fin_postprocess.py --trigger_file ${ckpt_dir}/trigger/test_pred.json --role_file ${ckpt_dir}/role/test_pred.json --enum_file ${ckpt_dir}/enum/test_pred.json --schema_file ${conf_dir}/event_schema.json --save_path ${submit_data_path}
echo -e "end ${dataset_name} role predict data merge"
else
echo "no process name ${process_name}"
fi
\ No newline at end of file
data_dir=$1
conf_path=$2
ckpt_dir=$3
predict_data=$4
learning_rate=$5
is_train=$6
max_seq_len=$7
batch_size=$8
epoch=${9}
pred_save_path=${10}
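# Example invocation (mirrors run_sequence_labeling_model in the driver script; this file
# is assumed to be saved as run_sequence_labeling.sh, the name the driver calls):
#   sh run_sequence_labeling.sh ./data/DuEE-Fin/trigger ./conf/DuEE-Fin/trigger_tag.dict \
#       ./ckpt/DuEE-Fin/trigger ./data/DuEE-Fin/sentence/test.json 5e-5 True 300 16 20 \
#       ./ckpt/DuEE-Fin/trigger/test_pred.json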
if [ "$is_train" = True ]; then
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" sequence_labeling.py \
--num_epoch ${epoch} \
--learning_rate ${learning_rate} \
--tag_path ${conf_path} \
--train_data ${data_dir}/train.tsv \
--dev_data ${data_dir}/dev.tsv \
--test_data ${data_dir}/test.tsv \
--predict_data ${predict_data} \
--do_train True \
--do_predict False \
--max_seq_len ${max_seq_len} \
--batch_size ${batch_size} \
--skip_step 10 \
--valid_step 50 \
--checkpoints ${ckpt_dir} \
--init_ckpt ${ckpt_dir}/best.pdparams \
--predict_save_path ${pred_save_path} \
--device gpu
else
export CUDA_VISIBLE_DEVICES=0
python sequence_labeling.py \
--num_epoch ${epoch} \
--learning_rate ${learning_rate} \
--tag_path ${conf_path} \
--train_data ${data_dir}/train.tsv \
--dev_data ${data_dir}/dev.tsv \
--test_data ${data_dir}/test.tsv \
--predict_data ${predict_data} \
--do_train False \
--do_predict True \
--max_seq_len ${max_seq_len} \
--batch_size ${batch_size} \
--skip_step 10 \
--valid_step 50 \
--checkpoints ${ckpt_dir} \
--init_ckpt ${ckpt_dir}/best.pdparams \
--predict_save_path ${pred_save_path} \
--device gpu
fi
\ No newline at end of file
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
sequence labeling
"""
import ast
import os
import json
import warnings
import random
import argparse
from functools import partial
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import ErnieTokenizer, ErnieForTokenClassification, LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator
from utils import read_by_lines, write_by_lines, load_dict
warnings.filterwarnings('ignore')
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--tag_path", type=str, default=None, help="tag set path")
parser.add_argument("--train_data", type=str, default=None, help="train data")
parser.add_argument("--dev_data", type=str, default=None, help="dev data")
parser.add_argument("--test_data", type=str, default=None, help="test data")
parser.add_argument("--predict_data", type=str, default=None, help="predict data")
parser.add_argument("--do_train", type=ast.literal_eval, default=True, help="do train")
parser.add_argument("--do_predict", type=ast.literal_eval, default=True, help="do predict")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--valid_step", type=int, default=100, help="validation step")
parser.add_argument("--skip_step", type=int, default=20, help="skip step")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--checkpoints", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--init_ckpt", type=str, default=None, help="already pretraining model checkpoint")
parser.add_argument("--predict_save_path", type=str, default=None, help="predict data save path")
parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable.
def set_seed(args):
"""sets random seed"""
random.seed(args.seed)
np.random.seed(args.seed)
paddle.seed(args.seed)
@paddle.no_grad()
def evaluate(model, criterion, metric, num_label, data_loader):
"""evaluate"""
model.eval()
metric.reset()
losses = []
for input_ids, seg_ids, seq_lens, labels in data_loader:
logits = model(input_ids, seg_ids)
loss = paddle.mean(criterion(logits.reshape([-1, num_label]), labels.reshape([-1])))
losses.append(loss.numpy())
preds = paddle.argmax(logits, axis=-1)
n_infer, n_label, n_correct = metric.compute(None, seq_lens, preds, labels)
metric.update(n_infer.numpy(), n_label.numpy(), n_correct.numpy())
precision, recall, f1_score = metric.accumulate()
avg_loss = np.mean(losses)
model.train()
return precision, recall, f1_score, avg_loss
def convert_example_to_feature(example, tokenizer, label_vocab=None, max_seq_len=512, no_entity_label="O", ignore_label=-1, is_test=False):
tokens, labels = example
tokenized_input = tokenizer(
tokens,
return_length=True,
is_split_into_words=True,
max_seq_len=max_seq_len)
input_ids = tokenized_input['input_ids']
token_type_ids = tokenized_input['token_type_ids']
seq_len = tokenized_input['seq_len']
if is_test:
return input_ids, token_type_ids, seq_len
elif label_vocab is not None:
labels = labels[:(max_seq_len-2)]
encoded_label = [no_entity_label] + labels + [no_entity_label]
encoded_label = [label_vocab[x] for x in encoded_label]
return input_ids, token_type_ids, seq_len, encoded_label
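# For training, labels are truncated to max_seq_len - 2 and wrapped with the no-entity
# label ("O") so they stay aligned with the [CLS]/[SEP] tokens added by the tokenizer.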
class DuEventExtraction(paddle.io.Dataset):
"""DuEventExtraction"""
def __init__(self, data_path, tag_path):
self.label_vocab = load_dict(tag_path)
self.word_ids = []
self.label_ids = []
with open(data_path, 'r', encoding='utf-8') as fp:
# skip the header line
next(fp)
for line in fp.readlines():
words, labels = line.strip('\n').split('\t')
words = words.split('\002')
labels = labels.split('\002')
self.word_ids.append(words)
self.label_ids.append(labels)
self.label_num = max(self.label_vocab.values()) + 1
def __len__(self):
return len(self.word_ids)
def __getitem__(self, index):
return self.word_ids[index], self.label_ids[index]
def do_train():
paddle.set_device(args.device)
world_size = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
if world_size > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
no_entity_label = "O"
ignore_label = -1
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
label_map = load_dict(args.tag_path)
id2label = {val: key for key, val in label_map.items()}
model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))
model = paddle.DataParallel(model)
print("============start train==========")
train_ds = DuEventExtraction(args.train_data, args.tag_path)
dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
test_ds = DuEventExtraction(args.test_data, args.tag_path)
trans_func = partial(
convert_example_to_feature,
tokenizer=tokenizer,
label_vocab=train_ds.label_vocab,
max_seq_len=args.max_seq_len,
no_entity_label=no_entity_label,
ignore_label=ignore_label,
is_test=False)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # input ids
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # token type ids
Stack(dtype='int64'), # sequence lens
Pad(axis=0, pad_val=ignore_label, dtype='int64') # labels
): fn(list(map(trans_func, samples)))
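# Label sequences are padded with ignore_label (-1), matching
# CrossEntropyLoss(ignore_index=ignore_label) below so padded positions contribute no loss.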
batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True)
train_loader = paddle.io.DataLoader(
dataset=train_ds,
batch_sampler=batch_sampler,
collate_fn=batchify_fn)
dev_loader = paddle.io.DataLoader(
dataset=dev_ds,
batch_size=args.batch_size,
collate_fn=batchify_fn)
test_loader = paddle.io.DataLoader(
dataset=test_ds,
batch_size=args.batch_size,
collate_fn=batchify_fn)
num_training_steps = len(train_loader) * args.num_epoch
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
learning_rate=args.learning_rate,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params)
metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(), suffix=False)
criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
step, best_f1 = 0, 0.0
model.train()
for epoch in range(args.num_epoch):
for idx, (input_ids, token_type_ids, seq_lens, labels) in enumerate(train_loader):
logits = model(input_ids, token_type_ids).reshape(
[-1, train_ds.label_num])
loss = paddle.mean(criterion(logits, labels.reshape([-1])))
loss.backward()
optimizer.step()
optimizer.clear_grad()
loss_item = loss.numpy().item()
if step > 0 and step % args.skip_step == 0 and rank == 0:
print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}')
if step > 0 and step % args.valid_step == 0 and rank == 0:
p, r, f1, avg_loss = evaluate(model, criterion, metric, len(label_map), dev_loader)
print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, ' \
f'f1: {f1:.5f} current best {best_f1:.5f}')
if f1 > best_f1:
best_f1 = f1
print(f'==============================================save best model ' \
f'best performance {best_f1:.5f}')
paddle.save(model.state_dict(), '{}/best.pdparams'.format(args.checkpoints))
step += 1
# save the final model
if rank == 0:
paddle.save(model.state_dict(), '{}/final.pdparams'.format(args.checkpoints))
def do_predict():
paddle.set_device(args.device)
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
label_map = load_dict(args.tag_path)
id2label = {val: key for key, val in label_map.items()}
model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))
no_entity_label = "O"
ignore_label = len(label_map)
print("============start predict==========")
if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
else:
state_dict = paddle.load(args.init_ckpt)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.init_ckpt)
# load data from predict file
sentences = read_by_lines(args.predict_data) # origin data format
sentences = [json.loads(sent) for sent in sentences]
encoded_inputs_list = []
for sent in sentences:
sent = sent["text"].replace(" ", "\002")
input_ids, token_type_ids, seq_len = convert_example_to_feature([list(sent), []], tokenizer,
max_seq_len=args.max_seq_len, is_test=True)
encoded_inputs_list.append((input_ids, token_type_ids, seq_len))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # input_ids
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # token_type_ids
Stack(dtype='int64') # sequence lens
): fn(samples)
# Separate the data into batches.
batch_encoded_inputs = [encoded_inputs_list[i: i + args.batch_size]
for i in range(0, len(encoded_inputs_list), args.batch_size)]
results = []
model.eval()
for batch in batch_encoded_inputs:
input_ids, token_type_ids, seq_lens = batchify_fn(batch)
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
logits = model(input_ids, token_type_ids)
probs = F.softmax(logits, axis=-1)
probs_ids = paddle.argmax(probs, -1).numpy()
probs = probs.numpy()
for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(), seq_lens.tolist()):
prob_one = [p_list[index][pid] for index, pid in enumerate(p_ids[1: seq_len - 1])]
label_one = [id2label[pid] for pid in p_ids[1: seq_len - 1]]
results.append({"probs": prob_one, "labels": label_one})
assert len(results) == len(sentences)
for sent, ret in zip(sentences, results):
sent["pred"] = ret
sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
write_by_lines(args.predict_save_path, sentences)
print("save data {} to {}".format(len(sentences), args.predict_save_path))
if __name__ == '__main__':
if args.do_train:
do_train()
elif args.do_predict:
do_predict()
\ No newline at end of file
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
def cal_md5(str):
"""calculate string md5"""
str = str.decode("utf-8", "ignore").encode("utf-8", "ignore")
return hashlib.md5(str).hexdigest()
def read_by_lines(path):
"""read the data by line"""
result = list()
with open(path, "r", encoding="utf8") as infile:
for line in infile:
result.append(line.strip())
return result
def write_by_lines(path, data):
"""write the data"""
with open(path, "w") as outfile:
[outfile.write(d + "\n") for d in data]
def text_to_sents(text):
"""text_to_sents"""
delimiter_symbols = [u"。", u"?", u"!"]
paragraphs = text.split("\n")
ret = []
for para in paragraphs:
if para == u"":
continue
sents = [u""]
for s in para:
sents[-1] += s
if s in delimiter_symbols:
sents.append(u"")
if sents[-1] == u"":
sents = sents[:-1]
ret.extend(sents)
return ret
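# e.g. text_to_sents("公司公告。股东质押!详见附件") -> ["公司公告。", "股东质押!", "详见附件"]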
def load_dict(dict_path):
"""load_dict"""
vocab = {}
for line in open(dict_path, 'r', encoding='utf-8'):
value, key = line.strip('\n').split('\t')
vocab[key] = int(value)
return vocab
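# Tag dict lines are "index\tlabel" (as written by schema_process); load_dict inverts them
# into {label: index}, e.g. "0\tB-质押" -> {"B-质押": 0}.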
def extract_result(text, labels):
"""extract_result"""
ret, is_start, cur_type = [], False, None
if len(text) != len(labels):
# Korean characters can make the label sequence longer than the text
labels = labels[:len(text)]
for i, label in enumerate(labels):
if label != u"O":
_type = label[2:]
if label.startswith(u"B-"):
is_start = True
cur_type = _type
ret.append({"start": i, "text": [text[i]], "type": _type})
elif _type != cur_type:
"""
# 如果是没有B-开头的,则不要这部分数据
cur_type = None
is_start = False
"""
cur_type = _type
is_start = True
ret.append({"start": i, "text": [text[i]], "type": _type})
elif is_start:
ret[-1]["text"].append(text[i])
else:
cur_type = None
is_start = False
else:
cur_type = None
is_start = False
return ret
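# e.g. extract_result("甲公司质押", ["B-质押人", "I-质押人", "I-质押人", "B-质押", "I-质押"])
#   -> [{"start": 0, "text": ["甲", "公", "司"], "type": "质押人"},
#       {"start": 3, "text": ["质", "押"], "type": "质押"}]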
if __name__ == "__main__":
s = "xxdedewd"
print(cal_md5(s.encode("utf-8")))
\ No newline at end of file