How to use the transformers.XLNetTokenizer class in transformers

To help you get started, we’ve selected a few transformers.XLNetTokenizer examples based on popular ways it is used in public projects.

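To get oriented before the project excerpts below, here is a minimal, self-contained sketch of loading the tokenizer and encoding a sentence; the 'xlnet-base-cased' checkpoint is used purely for illustration.

from transformers import XLNetTokenizer

# load the pretrained SentencePiece vocabulary for XLNet
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# encode() tokenizes, maps tokens to ids, and appends XLNet's special
# tokens ('<sep>' and '<cls>' go at the end for this model)
input_ids = tokenizer.encode("Here is some text to encode")
print(tokenizer.convert_ids_to_tokens(input_ids))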

github flairNLP / flair / tests / test_transformer_embeddings.py View on Github external
import torch

import flair
from transformers import XLNetModel, XLNetTokenizer


def test_xlnet_embeddings():
    xlnet_model: str = "xlnet-large-cased"

    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=xlnet_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        # wrap the sentence in XLNet's bos/eos markers ('<s>'/'</s>') before tokenizing
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        # with output_hidden_states=True, the last element of the model's
        # output tuple holds the hidden states of every layer
        hidden_states = model(tokens_tensor)[-1]
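The manual '<s>'/'</s>' wrapping above predates the tokenizer's helper methods; as an alternative sketch (same checkpoint assumed), near-equivalent per-layer hidden states can be obtained by letting encode() insert XLNet's own special tokens:

import torch
from transformers import XLNetModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model = XLNetModel.from_pretrained("xlnet-large-cased", output_hidden_states=True)
model.eval()

# encode() appends XLNet's own special tokens ('<sep>', '<cls>') automatically
tokens_tensor = torch.tensor([tokenizer.encode("Berlin and Munich have a lot of puppeteer to see .")])
with torch.no_grad():
    hidden_states = model(tokens_tensor)[-1]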
github sz128 / slot_filling_and_intent_detection_of_SLU / tests / test_xlnet.py View on Github external
'''
@Time   : 2019-06-16 11:34:23
@Author : su.zhu
@Desc   : 
'''

import torch
from transformers import XLNetModel, XLNetTokenizer

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
last_hidden_states = model(input_ids)[0]

print(model.config)
print(last_hidden_states.size())

# Tokenized input
text_a = "Who was Jim Henson ?"
text_b = "Jim Henson was a puppeteer"
tokens_a = tokenizer.tokenize(text_a)
tokens_b = tokenizer.tokenize(text_b)
# XLNet's special tokens differ from BERT's: use the tokenizer's own
# '<cls>'/'<sep>' markers rather than the BERT-style '[CLS]'/'[SEP]'
# literals, which are not in XLNet's vocabulary
cls_token = tokenizer.cls_token  # '<cls>'
sep_token = tokenizer.sep_token  # '<sep>'
tokens = tokens_a + [sep_token]
segment_ids = [0] * len(tokens)
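Building the pair encoding by hand is easy to get wrong; a shorter sketch of the same step with encode_plus, which applies XLNet's sentence-pair layout (A <sep> B <sep> <cls>) and the token type ids for you:

encoded = tokenizer.encode_plus(text_a, text_b)
print(tokenizer.convert_ids_to_tokens(encoded['input_ids']))
print(encoded['token_type_ids'])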
github huggingface / transformers / templates / adding_a_new_example_script / run_xxx.py View on Github external
import logging
import random

import numpy as np
import torch

from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer,
                          DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
                          XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
                          XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)

# helper utilities that ship alongside this example script (the module name
# is inferred from the utils_squad_evaluate import below)
from utils_squad import (RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)

# The following import is the official SQuAD evaluation script (2.0).
# You can remove it from the dependencies if you are using this script outside of the library
# We've added it here for automated tests (see examples/test_examples.py file)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

logger = logging.getLogger(__name__)

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())

MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
}

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()

def train(args, train_dataset, model, tokenizer):
    """ Train the model """
github ThilinaRajapakse / simpletransformers / simpletransformers / question_answering / question_answering_model.py View on Github external
def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1):
        """
        Initializes a QuestionAnsweringModel model.

        Args:
            model_type: The type of model (bert, xlnet, xlm, distilbert, albert)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        """

        MODEL_CLASSES = {
            'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
            'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
            'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
            'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
        }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.model = model_class.from_pretrained(model_name)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.")
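A quick usage sketch of the class above, assuming simpletransformers is installed; the model name is illustrative, and use_cuda=False keeps it runnable on CPU:

from simpletransformers.question_answering import QuestionAnsweringModel

# 'xlnet' selects the (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer) entry
model = QuestionAnsweringModel('xlnet', 'xlnet-base-cased', use_cuda=False)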
github sz128 / slot_filling_and_intent_detection_of_SLU / scripts / slot_tagging_and_intent_detection_with_transformer.py View on Github external
import argparse

from transformers import BertTokenizer, BertModel, XLNetTokenizer, XLNetModel

from utils.bert_xlnet_inputs import prepare_inputs_for_bert_xlnet

import models.slot_tagger as slot_tagger
import models.slot_tagger_with_focus as slot_tagger_with_focus
import models.slot_tagger_crf as slot_tagger_with_crf
import models.snt_classifier as snt_classifier

import utils.vocab_reader as vocab_reader
import utils.data_reader_for_elmo as data_reader
import utils.read_wordEmb as read_wordEmb
import utils.util as util
import utils.acc as acc

MODEL_CLASSES = {
        'bert': (BertModel, BertTokenizer),
        'xlnet': (XLNetModel, XLNetTokenizer),
        }

parser = argparse.ArgumentParser()
parser.add_argument('--task_st', required=True, help='slot filling task: slot_tagger | slot_tagger_with_focus | slot_tagger_with_crf')
parser.add_argument('--task_sc', required=True, help='intent detection task: none | 2tails | maxPooling | hiddenCNN | hiddenAttention')
parser.add_argument('--sc_type', default='single_cls_CE', help='single_cls_CE | multi_cls_BCE')
parser.add_argument('--st_weight', type=float, default=0.5, help='loss weight for slot tagging task, ranging from 0 to 1.')

parser.add_argument('--dataset', required=True, help='atis-2 | snips')
parser.add_argument('--dataroot', required=True, help='path to dataset')
parser.add_argument('--save_model', default='model', help='save model to this file')
#parser.add_argument('--mini_word_freq', type=int, default=2, help='mini_word_freq in the training data')
#parser.add_argument('--word_lowercase', action='store_true', help='word lowercase')
parser.add_argument('--bos_eos', action='store_true', help='Whether to add <s> and </s> to the input sentence (disabled by default)')
parser.add_argument('--save_vocab', default='vocab', help='save vocab to this file')
parser.add_argument('--noStdout', action='store_true', help='Only log to a file; no stdout')
github huggingface / transformers / examples / run_multiple_choice.py View on Github external
import logging
import random

import numpy as np
import torch

from transformers import (BertConfig, BertForMultipleChoice, BertTokenizer,
                          RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer,
                          XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)

ALL_MODELS = sum(
    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
}


def select_field(features, field):
    return [[choice[field] for choice in feature.choices_features] for feature in features]


def simple_accuracy(preds, labels):
    return (preds == labels).mean()


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
github mgrankin / ru_transformers / run_generation.py View on Github external
import logging

from transformers import (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig,
                          GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          XLNetLMHeadModel, XLNetTokenizer,
                          TransfoXLLMHeadModel, TransfoXLTokenizer)


logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())

MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
}

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing.   """
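A minimal sketch of how the padding text is typically used: prepend it to the user's prompt at encoding time so XLNet has enough context, then generate only past the prompt (the prompt text and checkpoint are illustrative):

import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')

prompt = "The meaning of life is"
# prefixing PADDING_TEXT gives XLNet the long context it needs for short prompts
input_ids = torch.tensor([tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False)])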
github sz128 / slot_filling_and_intent_detection_of_SLU / scripts / slot_tagging_and_intent_detection_with_pure_transformer.py View on Github external
import argparse

from transformers import BertTokenizer, BertModel, XLNetTokenizer, XLNetModel
from transformers import AdamW, get_linear_schedule_with_warmup
from models.optimization import BertAdam
from utils.bert_xlnet_inputs import prepare_inputs_for_bert_xlnet

import models.slot_tagger_and_intent_detector_with_pure_transformer as joint_transformer

import utils.vocab_reader as vocab_reader
import utils.data_reader_for_elmo as data_reader
import utils.read_wordEmb as read_wordEmb
import utils.util as util
import utils.acc as acc

MODEL_CLASSES = {
        'bert': (BertModel, BertTokenizer),
        'xlnet': (XLNetModel, XLNetTokenizer),
        }

parser = argparse.ArgumentParser()
parser.add_argument('--task_st', required=True, help='slot filling task: NN | NN_crf')
parser.add_argument('--task_sc', required=True, help='intent detection task: none | CLS | max | CLS_max')
parser.add_argument('--sc_type', default='single_cls_CE', help='single_cls_CE | multi_cls_BCE')
parser.add_argument('--st_weight', type=float, default=0.5, help='loss weight for slot tagging task, ranging from 0 to 1.')

parser.add_argument('--dataset', required=True, help='atis-2 | snips')
parser.add_argument('--dataroot', required=True, help='path to dataset')
parser.add_argument('--save_model', default='model', help='save model to this file')
#parser.add_argument('--mini_word_freq', type=int, default=2, help='mini_word_freq in the training data')
#parser.add_argument('--word_lowercase', action='store_true', help='word lowercase')
parser.add_argument('--bos_eos', action='store_true', help='Whether to add <s> and </s> to the input sentence (disabled by default)')
parser.add_argument('--save_vocab', default='vocab', help='save vocab to this file')
parser.add_argument('--noStdout', action='store_true', help='Only log to a file; no stdout')