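# Assumed setup for the tokenization checks below: one tokenizer per model family mentioned
# in the comments (BERT, RoBERTa, XLNet).
tokenizers = [
Tokenizer.load(pretrained_model_name_or_path=name, do_lower_case=False)
for name in ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
]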
"This is a sentence with tab",
"This is a sentence with multiple tabs",
]
for tokenizer in tokenizers:
for text in texts:
# Important: we don't assume whitespace is preserved after tokenization.
# This means \t, \n, " " etc. will all resolve to a single " ".
# That makes no difference for BERT and XLNet, but it does for RoBERTa.
# 1. original tokenize function from the transformers repo on the full sentence
standardized_whitespace_text = ' '.join(text.split()) # collapse tabs/newlines/multiple spaces into single spaces
tokenized = tokenizer.tokenize(standardized_whitespace_text)
# 2. our tokenizer with metadata on "whitespace tokenized words"
tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)
# verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
assert tokenized_meta["tokens"] == tokenized, f"Failed using {tokenizer.__class__.__name__}"
# verify that offsets align back to original text
if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
# contains [UNK] tokens that cannot be mapped back to the original text
continue
for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
# Subword tokens carry model-specific markers (##, Ġ, ▁); strip them to align with the original text
tok = re.sub(r"^(##|Ġ|▁)", "", tok)
#tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
original_tok = text[offset:offset+len(tok)]
assert tok == original_tok, f"Offset alignment wrong for {tokenizer.__class__.__name__} and text '{text}'"
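# For reference, tokenize_with_metadata returns per-token character offsets alongside the
# tokens, e.g. for BERT (illustrative values only):
#   tokenize_with_metadata(text="Some Text", tokenizer=tokenizer)
#   -> {"tokens": ["Some", "Text"], "offsets": [0, 5], ...}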

def test_qa(caplog):
caplog.set_level(logging.CRITICAL)
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 2
n_epochs = 1
evaluate_every = 4
base_LM_model = "bert-base-cased"
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=base_LM_model, do_lower_case=False
)
label_list = ["start_token", "end_token"]
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=20,
doc_stride=10,
max_query_length=6,
train_filename="train-sample.json",
dev_filename="dev-sample.json",
test_filename=None,
data_dir="samples/qa",
label_list=label_list,
metric="squad"
)
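# The train/dev sample files above follow the SQuAD JSON layout, roughly:
#   {"data": [{"paragraphs": [{"context": "...", "qas": [
#       {"question": "...", "id": "...", "answers": [{"text": "...", "answer_start": 0}]}]}]}]}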

def test_doc_regression(caplog):
caplog.set_level(logging.CRITICAL)
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=False)
n_epochs = 1
batch_size = 1
evaluate_every = 2
lang_model = "bert-base-cased"
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=False)
processor = RegressionProcessor(tokenizer=tokenizer,
max_seq_len=8,
data_dir="samples/doc_regr",
train_filename="train-sample.tsv",
dev_filename="test-sample.tsv",
test_filename=None,
label_column_name="label")
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
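# The doc_regr sample files are tab-separated, with a text column and a numeric target in
# the column named above ("label").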

def test_lm_finetuning_custom_vocab(caplog):
caplog.set_level(logging.CRITICAL)
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=False)
n_epochs = 1
batch_size = 1
evaluate_every = 2
lang_model = "bert-base-cased"
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model, do_lower_case=False
)
tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])
processor = BertStyleLMProcessor(
data_dir="samples/lm_finetuning",
train_filename="train-sample.txt",
test_filename="test-sample.txt",
dev_filename=None,
tokenizer=tokenizer,
max_seq_len=12,
next_sent_pred=True
)
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[lm_prediction_head, next_sentence_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token", "per_sequence"],
device=device
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
device=device,
)

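# A second LM fine-tuning variant follows: same tokenizer, processor and heads, but without
# added vocabulary, so LanguageModel and BertLMHead are loaded without n_added_tokens.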
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model, do_lower_case=False
)
processor = BertStyleLMProcessor(
data_dir="samples/lm_finetuning",
train_filename="train-sample.txt",
test_filename="test-sample.txt",
dev_filename=None,
tokenizer=tokenizer,
max_seq_len=12,
next_sent_pred=True
)
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
language_model = LanguageModel.load(lang_model)
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[lm_prediction_head, next_sentence_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token", "per_sequence"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
device=device,
)

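# Document regression, continued: tokenizer, processor, DataSilo, model and optimizer.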
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=False)
processor = RegressionProcessor(tokenizer=tokenizer,
max_seq_len=8,
data_dir="samples/doc_regr",
train_filename="train-sample.tsv",
dev_filename="test-sample.tsv",
test_filename=None,
label_column_name="label")
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = RegressionHead(layer_dims=[768, 1])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_sequence_continuous"],
device=device
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
)

# Named entity recognition: BIO tag set for the sample data, then processor, model and optimizer.
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(
tokenizer=tokenizer,
max_seq_len=8,
data_dir="samples/ner",
train_filename="train-sample.txt",
dev_filename="dev-sample.txt",
test_filename=None,
delimiter=" ",
label_list=ner_labels,
metric="seq_f1"
)
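# The NER sample files are CoNLL-style: one token per line followed by the delimiter (" ")
# and its tag, with blank lines separating sentences.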
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
)

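# Question answering, continued: processor, DataSilo, model and optimizer.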
label_list = ["start_token", "end_token"]
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=20,
doc_stride=10,
max_query_length=6,
train_filename="train-sample.json",
dev_filename="dev-sample.json",
test_filename=None,
data_dir="samples/qa",
label_list=label_list,
metric="squad"
)
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(base_LM_model)
prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
device=device,
)

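# The training and inference tail below (from the LM fine-tuning example) assumes a Trainer
# has been built, which the snippet above does not show. A minimal sketch, assuming FARM's
# Trainer keyword arguments (exact names can differ between FARM versions):
trainer = Trainer(
optimizer=optimizer,
data_silo=data_silo,
epochs=n_epochs,
n_gpu=n_gpu,
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,
)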
model = trainer.train(model)
# The LM input embeddings and the LM head's decoder weights are tied, so they should be equal
assert torch.all(
torch.eq(model.language_model.model.embeddings.word_embeddings.weight, model.prediction_heads[0].decoder.weight))
save_dir = "testsave/lm_finetuning"
model.save(save_dir)
processor.save(save_dir)
basic_texts = [
{"text": "Farmer's life is great."},
{"text": "It's nothing for big city kids though."},
]
model = Inferencer.load(save_dir, embedder_only=True)
result = model.extract_vectors(dicts=basic_texts)
assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
assert result[0]["vec"].shape == (768,)
# TODO check why results vary across runs with the same seed
assert isinstance(result[0]["vec"][0], np.float32)