import logging
import os

from torch.utils.data import Dataset
from transformers import (
    GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
    OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    BertConfig, BertForMaskedLM, BertTokenizer,
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
    DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
    CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
)
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
MODEL_CLASSES = {
"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
"openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
"bert": (BertConfig, BertForMaskedLM, BertTokenizer),
"roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
"camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}
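# Usage sketch (not part of the original snippet): the (config, model,
# tokenizer) triple registered above is usually resolved by model type and
# loaded with from_pretrained(); "gpt2" is used here purely as an example.
config_class, model_class, tokenizer_class = MODEL_CLASSES["gpt2"]
config = config_class.from_pretrained("gpt2")
tokenizer = tokenizer_class.from_pretrained("gpt2")
model = model_class.from_pretrained("gpt2", config=config)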
class TextDataset(Dataset):
def __init__(self, tokenizer, args, file_path="train", block_size=512):
assert os.path.isfile(file_path)
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
from allennlp.modules.elmo import Elmo, batch_to_ids
import models.slot_tagger as slot_tagger
import models.slot_tagger_with_focus as slot_tagger_with_focus
import models.slot_tagger_crf as slot_tagger_with_crf
import models.snt_classifier as snt_classifier
import utils.vocab_reader as vocab_reader
import utils.data_reader_for_elmo as data_reader
import utils.read_wordEmb as read_wordEmb
import utils.util as util
import utils.acc as acc
MODEL_CLASSES = {
'bert': (BertModel, BertTokenizer),
'xlnet': (XLNetModel, XLNetTokenizer),
}
parser = argparse.ArgumentParser()
parser.add_argument('--task_st', required=True, help='slot filling task: slot_tagger | slot_tagger_with_focus | slot_tagger_with_crf')
parser.add_argument('--task_sc', required=True, help='intent detection task: none | 2tails | maxPooling | hiddenCNN | hiddenAttention')
parser.add_argument('--sc_type', default='single_cls_CE', help='single_cls_CE | multi_cls_BCE')
parser.add_argument('--st_weight', type=float, default=0.5, help='loss weight for slot tagging task, ranging from 0 to 1.')
parser.add_argument('--dataset', required=True, help='atis-2 | snips')
parser.add_argument('--dataroot', required=True, help='path to dataset')
parser.add_argument('--save_model', default='model', help='save model to this file')
#parser.add_argument('--mini_word_freq', type=int, default=2, help='mini_word_freq in the training data')
#parser.add_argument('--word_lowercase', action='store_true', help='word lowercase')
parser.add_argument('--bos_eos', action='store_true', help='Whether to add <s> and </s> to the input sentence (default is not)')
parser.add_argument('--save_vocab', default='vocab', help='save vocab to this file')
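# Illustrative parse (not in the original script): concrete values showing how
# the flags above fit together; every value below is made up for the example.
opt = parser.parse_args([
    '--task_st', 'slot_tagger',
    '--task_sc', 'maxPooling',
    '--dataset', 'atis-2',
    '--dataroot', './data/atis-2',
])
print(opt.task_st, opt.task_sc, opt.st_weight, opt.save_model)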
gpt2_tokens_cleaner,
xlnet_tokens_cleaner,
)
from transformers import (
    BertTokenizer, DistilBertTokenizer, GPT2Tokenizer, OpenAIGPTTokenizer,
    RobertaTokenizer, XLNetTokenizer,
    TFBertModel, TFDistilBertModel, TFGPT2Model, TFOpenAIGPTModel,
    TFRobertaModel, TFXLNetModel,
)
model_class_dict = {
"bert": TFBertModel,
"gpt": TFOpenAIGPTModel,
"gpt2": TFGPT2Model,
"xlnet": TFXLNetModel,
# "xlm": TFXLMModel, # Currently doesn't work because of a bug in transformers library https://github.com/huggingface/transformers/issues/2729
"distilbert": TFDistilBertModel,
"roberta": TFRobertaModel,
}
model_tokenizer_dict = {
"bert": BertTokenizer,
"gpt": OpenAIGPTTokenizer,
"gpt2": GPT2Tokenizer,
"xlnet": XLNetTokenizer,
# "xlm": XLMTokenizer,
"distilbert": DistilBertTokenizer,
"roberta": RobertaTokenizer,
}
model_weights_defaults = {
"bert": "bert-base-uncased",
"gpt": "openai-gpt",
"gpt2": "gpt2",
"xlnet": "xlnet-base-cased",
# "xlm": "xlm-mlm-enfr-1024",
"distilbert": "distilbert-base-uncased",
"roberta": "roberta-base",
}
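# Minimal usage sketch (assumption, not part of the original module): the three
# dicts above are meant to be indexed with the same key, e.g. "bert".
name = "bert"
weights = model_weights_defaults[name]                      # "bert-base-uncased"
tokenizer = model_tokenizer_dict[name].from_pretrained(weights)
model = model_class_dict[name].from_pretrained(weights)
token_ids = tokenizer.encode("A short example sentence.")   # list of vocab ids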
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
)
MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
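# Illustrative call (not in the original file): set_seed only touches the
# `seed` and `n_gpu` attributes, so a bare namespace is enough to drive it.
from argparse import Namespace
set_seed(Namespace(seed=42, n_gpu=torch.cuda.device_count()))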
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
def __call__(self, args):
self.args = args
if not os.path.exists(args.file):
os.mkdir(args.file)
if not os.path.exists(args.fields) or args.preprocess:
print("Preprocess the data")
self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
if args.feat == 'char':
self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
fix_len=args.fix_len, tokenize=list)
elif args.feat == 'bert':
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
tokenize=tokenizer.encode)
else:
self.FEAT = Field('tags', bos=bos)
self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
self.REL = Field('rels', bos=bos)
if args.feat in ('char', 'bert'):
self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
HEAD=self.HEAD, DEPREL=self.REL)
else:
self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
HEAD=self.HEAD, DEPREL=self.REL)
train = Corpus.load(args.ftrain, self.fields)
if args.fembed:
embed = Embedding.load(args.fembed, args.unk)
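# Sketch (illustration only, not taken from the parser code above): BertField
# is driven by tokenizer.encode, which maps each word to one or more
# WordPiece ids from the stock uncased vocabulary.
tok = BertTokenizer.from_pretrained('bert-base-uncased')
pieces = tok.encode('unaffable', add_special_tokens=False)  # sub-word ids for a single word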
if num_hidden_layers > tmp_config['num_hidden_layers']:
raise DLPyError('You specified more hidden layers than are available in '
'the base BERT model.')
self._base_model = BertModel.from_pretrained(name,
cache_dir=cache_dir,
num_hidden_layers=num_hidden_layers)
self._config = BertConfig.from_pretrained(name,
cache_dir=cache_dir,
num_hidden_layers=num_hidden_layers).to_dict()
if self._verbose:
print("NOTE: base BERT model loaded.")
# instantiate the BERT tokenizer
self._tokenizer = BertTokenizer.from_pretrained(name)
# load embedding table (token | position | segment)
self._load_embedding_table()
if self._config['max_position_embeddings'] < max_seq_len:
raise DLPyError('The specified maximum sequence length exceeds the maximum position embedding.')
else:
self._max_seq_len = max_seq_len
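# Minimal sketch (assumption): the max_position_embeddings guard above can be
# reproduced straight from the Hugging Face config of any BERT checkpoint.
cfg = BertConfig.from_pretrained('bert-base-uncased').to_dict()
requested_len = 128
assert cfg['max_position_embeddings'] >= requested_len  # 512 for base BERT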
self.train_path = dataset + '/data/train.txt'  # training set
self.dev_path = dataset + '/data/dev.txt'  # validation set
self.test_path = dataset + '/data/test.txt'  # test set
self.class_list = [x.strip() for x in open(
    dataset + '/data/class.txt').readlines()]  # list of class names
self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # where the trained model is saved
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
self.require_improvement = 1000  # stop training early if there is no improvement after 1000 batches
self.num_classes = len(self.class_list)  # number of classes
self.num_epochs = 3  # number of epochs
self.batch_size = 128  # mini-batch size
self.pad_size = 32  # every sentence is padded/truncated to this length
self.learning_rate = 5e-5  # learning rate
self.bert_path = './bert'
self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
self.hidden_size = 768
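# Illustrative helper (not part of the original config class): how `pad_size`
# and the tokenizer above are typically combined into fixed-length inputs.
# Padding with id 0 is an assumption that holds for the standard BERT vocab.
def encode_fixed_length(tokenizer, text, pad_size=32):
    tokens = ['[CLS]'] + tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask = [1] * len(ids)
    if len(ids) < pad_size:                          # pad short sentences
        pad = [0] * (pad_size - len(ids))
        ids, mask = ids + pad, mask + pad
    return ids[:pad_size], mask[:pad_size]           # truncate long ones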
# -------------------
# Convert the weights
# -------------------
logging.info("convert the model")
new_model.bert.load_state_dict(original.bert.state_dict())
new_model.decoder.load_state_dict(original.decoder.state_dict())
new_model.generator.load_state_dict(original.generator.state_dict())
# ----------------------------------
# Make sure the outputs are identical
# ----------------------------------
logging.info("Make sure that the models' outputs are identical")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# prepare the model inputs
encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
# failsafe to make sure the weights reset does not affect the
# loaded weights.
assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0
# forward pass
src = encoder_input_ids
tgt = decoder_input_ids
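# Sketch of the identity check (an assumption about how the comparison is
# done): once the state dicts are copied, an element-wise sweep over the
# parameters confirms that nothing was altered during the transfer.
def weights_match(module_a, module_b, tol=0.0):
    return all(
        torch.max(torch.abs(pa - pb)).item() <= tol
        for (_, pa), (_, pb) in zip(
            module_a.state_dict().items(), module_b.state_dict().items()
        )
    )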
def __init__(self, bert_model, lower, max_src_tokens, max_tgt_tokens):
self.max_src_tokens = max_src_tokens
self.max_tgt_tokens = max_tgt_tokens
self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=lower)
self.sep_token = '[SEP]'
self.cls_token = '[CLS]'
self.pad_token = '[PAD]'
self.tgt_bos = '[unused1] '
self.tgt_eos = ' [unused2]'
self.tgt_sent_split = ' [unused3] '
self.sep_vid = self.tokenizer.vocab[self.sep_token]
self.cls_vid = self.tokenizer.vocab[self.cls_token]
self.pad_vid = self.tokenizer.vocab[self.pad_token]
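# Usage sketch (not part of the original class): with the tokenizer and special
# tokens initialised above, a source text is framed as [CLS] ... [SEP] and
# mapped to vocab ids; cls_vid/sep_vid then mark the sentence boundaries.
def frame_source(tokenizer, text, max_src_tokens, cls_token='[CLS]', sep_token='[SEP]'):
    tokens = [cls_token] + tokenizer.tokenize(text)[:max_src_tokens] + [sep_token]
    return tokenizer.convert_tokens_to_ids(tokens)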