# Shared imports and setup for the FARM snippets below; the model name and
# hyperparameter values are illustrative, not taken from the original.
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import (
    BertStyleLMProcessor, InferenceProcessor, NERProcessor, SquadProcessor,
)
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import (
    BertLMHead, NextSentenceHead, QuestionAnsweringHead, TokenClassificationHead,
)
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import initialize_device_settings, set_all_seeds

set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
lang_model = "bert-base-cased"
batch_size = 8
n_epochs = 1

# --- Language-model finetuning ---
# 1. Create a tokenizer and register the new tokens the model should learn
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)
tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])
processor = BertStyleLMProcessor(
    data_dir="samples/lm_finetuning",
    train_filename="train-sample.txt",
    test_filename="test-sample.txt",
    dev_filename=None,
    tokenizer=tokenizer,
    max_seq_len=12,
    next_sent_pred=True,
)
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
)
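# Feed everything to a Trainer to run the actual finetuning. A minimal sketch,
# not part of the original snippet; Trainer's exact signature (e.g. whether
# `train()` returns the model) varies slightly across FARM versions.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    device=device,
)
model = trainer.train()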
# --- Question answering on SQuAD-style data ---
base_LM_model = lang_model  # the snippet below refers to this name; value reused from above
label_list = ["start_token", "end_token"]
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=20,
    doc_stride=10,
    max_query_length=6,
    train_filename="train-sample.json",
    dev_filename="dev-sample.json",
    test_filename=None,
    data_dir="samples/qa",
    label_list=label_list,
    metric="squad",
)
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(base_LM_model)
prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
)
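# Once trained (e.g. with a Trainer as sketched above), the QA model can be
# saved and queried through an Inferencer. A minimal sketch; the save
# directory and the question/context pair are illustrative:
save_dir = "saved_models/qa-sample"
model.save(save_dir)
processor.save(save_dir)
qa_model = Inferencer.load(save_dir, gpu=True)
qa_input = [{
    "qas": ["Who sits in parliament?"],
    "context": "Elected representatives sit in parliament.",
}]
result = qa_model.inference_from_dicts(dicts=qa_input)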
# --- Embedding extraction with a headless model ---
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)
# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)
# 4. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
# 5. Extract embeddings with the model in inference mode
basic_texts = [
    {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
    {"text": "Martin Müller spielt Fussball"},
]
model = Inferencer(adaptive_model, processor, gpu=use_gpu)
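# Run the extraction itself. A minimal sketch: older FARM releases expose
# `extract_vectors` on the Inferencer, while newer ones expect
# `Inferencer(..., task_type="embeddings")` plus `inference_from_dicts` instead.
result = model.extract_vectors(dicts=basic_texts, extraction_strategy="cls_token", extraction_layer=-1)
print(result)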
# --- Named entity recognition ---
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False,
)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de",
    metric="seq_f1", label_list=ner_labels
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    task_name="ner",
    layer_dims=[768, len(processor.tasks["ner"]["label_list"])],
)
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)
# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
)
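# 6. Feed everything to the Trainer and try a quick prediction afterwards.
# A minimal sketch following the same pattern as the finetuning snippet above;
# the sample sentence is illustrative.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    device=device,
)
model = trainer.train()
ner_model = Inferencer(model, processor, gpu=use_gpu)
result = ner_model.inference_from_dicts(dicts=[{"text": "Albrecht Lehman wohnt in Berlin"}])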