# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for wrapping BertModel."""

import torch

from .modeling import BertConfig
from .modeling import BertForPreTraining, BertForMaskedLM
from .modeling import BertLayerNorm


def get_params_for_weight_decay_optimization(module):
    """Split a module's parameters into two optimizer groups.

    LayerNorm parameters and all bias terms are collected into a group with
    weight decay disabled; every other parameter goes into the weight-decay
    group.
    """
    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, (BertLayerNorm, torch.nn.LayerNorm)):
            # All parameters of LayerNorm modules are exempt from weight decay.
            no_weight_decay_params['params'].extend(
                [p for p in list(module_._parameters.values())
                 if p is not None])
        else:
            # Non-bias parameters get weight decay; biases do not.
            weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n != 'bias'])
            no_weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n == 'bias'])

    return weight_decay_params, no_weight_decay_params


class BertModel(torch.nn.Module):
    """Thin wrapper that builds BertForPreTraining from command-line args."""

    def __init__(self, args):
        super(BertModel, self).__init__()
        if args.pretrained_bert:
            # Load pretrained weights matching the tokenizer model type.
            self.model = BertForPreTraining.from_pretrained(
                args.tokenizer_model_type,
                cache_dir=args.cache_dir,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                layernorm_epsilon=args.layernorm_epsilon)
        else:
            # Build a model from scratch using the provided configuration.
            if args.intermediate_size is None:
                intermediate_size = 4 * args.hidden_size
            else:
                intermediate_size = args.intermediate_size
            self.config = BertConfig(
                args.tokenizer_num_tokens,
                hidden_size=args.hidden_size,
                num_hidden_layers=args.num_layers,
                num_attention_heads=args.num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_dropout_prob=args.hidden_dropout,
                attention_probs_dropout_prob=args.attention_dropout,
                max_position_embeddings=args.max_position_embeddings,
                type_vocab_size=args.tokenizer_num_type_tokens,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                fp32_tokentypes=args.fp32_tokentypes,
                layernorm_epsilon=args.layernorm_epsilon,
                deep_init=args.deep_init)
            self.model = BertForPreTraining(self.config)

    def forward(self, input_tokens, token_type_ids=None,
                attention_mask=None, checkpoint_activations=False):
        return self.model(input_tokens, token_type_ids, attention_mask,
                          checkpoint_activations=checkpoint_activations)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.model.state_dict(destination=destination, prefix=prefix,
                                     keep_vars=keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        return self.model.load_state_dict(state_dict, strict=strict)
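

# Illustrative usage (a sketch, not part of the original module): the two
# dictionaries returned by get_params_for_weight_decay_optimization are
# ordinary torch optimizer parameter groups, so they can be passed straight
# to an optimizer and the 'weight_decay': 0.0 entry will override the
# optimizer-level default for LayerNorm weights and biases. The learning
# rate and decay values below are placeholders, not values taken from this
# repository.
#
#     model = BertModel(args)
#     param_groups = get_params_for_weight_decay_optimization(model)
#     optimizer = torch.optim.Adam(param_groups, lr=1e-4, weight_decay=0.01)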