# Input construction for an XLM test fixture (TensorFlow variant; note tf.float32).
# The truncated head of the first expression is restored below.
input_lengths = None
if self.use_input_lengths:
    input_lengths = (
        ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
    )  # small variation of seq_length

token_type_ids = None
if self.use_token_type_ids:
    token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)

sequence_labels = None
token_labels = None
is_impossible_labels = None
if self.use_labels:
    sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
    is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
config = XLMConfig(
    vocab_size=self.vocab_size,
    n_special=self.n_special,
    emb_dim=self.hidden_size,
    n_layers=self.num_hidden_layers,
    n_heads=self.num_attention_heads,
    dropout=self.hidden_dropout_prob,
    attention_dropout=self.attention_probs_dropout_prob,
    gelu_activation=self.gelu_activation,
    sinusoidal_embeddings=self.sinusoidal_embeddings,
    asm=self.asm,
    causal=self.causal,
    n_langs=self.n_langs,
    max_position_embeddings=self.max_position_embeddings,
    initializer_range=self.initializer_range,
    summary_type=self.summary_type,
    use_proj=self.use_proj,
)
# Input construction for the PyTorch variant of the same XLM test fixture;
# identical except the impossibility labels are made float via .float().
input_lengths = None
if self.use_input_lengths:
    input_lengths = (
        ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
    )  # small variation of seq_length

token_type_ids = None
if self.use_token_type_ids:
    token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)

sequence_labels = None
token_labels = None
is_impossible_labels = None
if self.use_labels:
    sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
    is_impossible_labels = ids_tensor([self.batch_size], 2).float()
config = XLMConfig(
    vocab_size=self.vocab_size,
    n_special=self.n_special,
    emb_dim=self.hidden_size,
    n_layers=self.num_hidden_layers,
    n_heads=self.num_attention_heads,
    dropout=self.hidden_dropout_prob,
    attention_dropout=self.attention_probs_dropout_prob,
    gelu_activation=self.gelu_activation,
    sinusoidal_embeddings=self.sinusoidal_embeddings,
    asm=self.asm,
    causal=self.causal,
    n_langs=self.n_langs,
    max_position_embeddings=self.max_position_embeddings,
    initializer_range=self.initializer_range,
    summary_type=self.summary_type,
    use_proj=self.use_proj,
)
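
# For context, a minimal sketch of how such a config is consumed: instantiate
# an XLM model directly from a config object. The hyperparameter values below
# are made up purely for illustration.
from transformers import XLMConfig, XLMModel

config = XLMConfig(vocab_size=30145, emb_dim=1024, n_layers=6, n_heads=8)
model = XLMModel(config)  # randomly initialized weights shaped by the config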
import logging
from typing import List

from torch.utils.data import DataLoader
from tqdm import tqdm, trange
from transformers import (AdamW, BertConfig, BertTokenizer, RobertaConfig,
                          RobertaTokenizer, XLMConfig, XLMTokenizer,
                          XLNetConfig, XLNetTokenizer,
                          get_linear_schedule_with_warmup)

from nlp_architect.models import TrainableModel
from nlp_architect.models.transformers.quantized_bert import QuantizedBertConfig

logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())


def get_models(models: List[str]):
    """Return checkpoint names from ALL_MODELS whose family prefix is in `models`."""
    if models is not None:
        return [m for m in ALL_MODELS if m.split('-')[0] in models]
    return ALL_MODELS
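
# Usage sketch: filter ALL_MODELS down to one family. The checkpoint names in
# the comment are illustrative; get_models(None) falls through to ALL_MODELS.
bert_checkpoints = get_models(['bert'])  # e.g. ['bert-base-uncased', 'bert-large-uncased', ...]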
class TransformerBase(TrainableModel):
    """
    Transformers base model (for working with pytorch-transformers models)
    """
    MODEL_CONFIGURATIONS = {
        'bert': (BertConfig, BertTokenizer),
        'quant_bert': (QuantizedBertConfig, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetTokenizer),
        'xlm': (XLMConfig, XLMTokenizer),
        'roberta': (RobertaConfig, RobertaTokenizer),
    }
import argparse
import logging
import os

from transformers import (BertConfig, BertTokenizer, DistilBertConfig, DistilBertTokenizer,
                          RobertaConfig, RobertaTokenizer, XLMConfig, XLMTokenizer,
                          XLNetConfig, XLNetTokenizer,
                          TFBertForSequenceClassification, TFDistilBertForSequenceClassification,
                          TFRobertaForSequenceClassification, TFXLMForSequenceClassification,
                          TFXLNetForSequenceClassification)
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features

logger = logging.getLogger(__name__)

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
                  for conf in (BertConfig, XLNetConfig, XLMConfig,
                               RobertaConfig, DistilBertConfig)), ())
MODEL_CLASSES = {
    'bert': (BertConfig, TFBertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, TFXLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, TFXLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, TFRobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer),
}
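
# A minimal sketch of how this registry is typically consumed; the checkpoint
# name is an illustrative assumption, not fixed by the snippet above.
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
config = config_class.from_pretrained('bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained('bert-base-uncased')
model = model_class.from_pretrained('bert-base-uncased', config=config)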
def load_and_cache_examples(args, data, task, tokenizer, split):
    if task == 'mnli' and split == 'validation':
        split = 'validation_matched'
    features_output_dir = os.path.join(args.output_dir, 'features')
    cached_features_file = os.path.join(features_output_dir, 'cached_{}_{}_{}_{}.tfrecord'.format(
        split,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
    XLMConfig,
    XLMForSequenceClassification,
    XLMTokenizer,
    XLNetConfig,
    XLNetForSequenceClassification,
    XLNetTokenizer,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
}
import spacy


# Create text corpus suitable for language model training
def create_corpus(text_list, target_path, logger=None):
    # The parser stays enabled so spacy can segment sentences.
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "textcat"])
    with open(target_path, "w") as f:
        # Minimal completion of the truncated body: write one sentence per line.
        for text in text_list:
            for sent in nlp(text).sents:
                f.write(sent.text + "\n")
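
# Usage sketch (the output path and texts are made up for illustration):
create_corpus(["First document. It has two sentences.", "Second document."],
              "corpus.txt")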
import torch
import torch.nn.functional as F
import numpy as np
from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from transformers import XLNetLMHeadModel, XLNetTokenizer
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
from transformers import XLMWithLMHeadModel, XLMTokenizer
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig)), ())
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. """
MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
}
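
# Loading one of these pairs for generation, sketched with the library's
# `generate` API rather than this script's own sampling loop; the 'gpt2'
# checkpoint name is illustrative.
model_class, tokenizer_class = MODEL_CLASSES['gpt2']
tokenizer = tokenizer_class.from_pretrained('gpt2')
model = model_class.from_pretrained('gpt2')
input_ids = tokenizer.encode("In 1991,", return_tensors='pt')
output_ids = model.generate(input_ids, max_length=30)
print(tokenizer.decode(output_ids[0]))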
import transformers


def get_config(name):
    """Map a name to the appropriate transformers.*Config class."""
    name = name.lower()
    # Prefix checks are ordered so that e.g. "distilbert" is tested before "bert".
    if name.startswith("roberta"):
        return transformers.RobertaConfig
    elif name.startswith("distilbert"):
        return transformers.DistilBertConfig
    elif name.startswith("bert"):
        return transformers.BertConfig
    elif name.startswith("xlnet"):
        return transformers.XLNetConfig
    elif name.startswith("gpt2"):
        return transformers.GPT2Config
    elif name.startswith("xlm"):
        return transformers.XLMConfig
    else:
        raise ValueError(f"Unsupported transformers config name: '{name}'")
def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1):
    """
    Initializes a QuestionAnsweringModel model.

    Args:
        model_type: The type of model (bert, xlnet, xlm, distilbert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
    """
    MODEL_CLASSES = {
        'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
        'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
        'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
        'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
    }
    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.model = model_class.from_pretrained(model_name)
    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError("'use_cuda' set to True when cuda is unavailable. "
                             "Make sure CUDA is available or set use_cuda=False.")
    else:
        # Minimal completion of the truncated branch: fall back to CPU.
        self.device = "cpu"
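
# Usage sketch, assuming this __init__ belongs to the QuestionAnsweringModel
# class named in the docstring (the checkpoint name is illustrative):
qa_model = QuestionAnsweringModel('bert', 'bert-base-uncased', use_cuda=False)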
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
    ),
    (),
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertModel, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
}
import random

import numpy as np
import torch


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
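
# Quick sketch of calling set_seed; the Namespace fields mirror the
# argparse-driven style of these scripts, and the values are illustrative.
import argparse

args = argparse.Namespace(seed=42, n_gpu=0)
set_seed(args)  # seeds random, numpy and torch for reproducibility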
def train(args, train_dataset, model, tokenizer, criterion):
    """ Train the model """