data_dir="samples/doc_class",
train_filename="train-sample.tsv",
label_list=["OTHER", "OFFENSE"],
metric="f1_macro",
dev_filename="test-sample.tsv",
test_filename=None,
dev_split=0.0,
label_column_name="coarse_label")
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_sequence"],
device=device)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
schedule_opts=None)
# The excerpt breaks off at the Trainer; completed here with the signature FARM's
# examples use (the epochs and evaluate_every values are assumptions).
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=1,
    n_gpu=1,
    lr_schedule=lr_schedule,
    evaluate_every=2,
    device=device)
trainer.train()
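# After training, the model can be saved and queried. A minimal follow-up sketch, assuming
# FARM's save/Inferencer API; "saved_models/doc_class" is a hypothetical path.
from farm.infer import Inferencer

save_dir = "saved_models/doc_class"
model.save(save_dir)
processor.save(save_dir)

inferencer = Inferencer.load(save_dir)
basic_texts = [{"text": "Martin ist kein Idiot"}]
print(inferencer.inference_from_dicts(dicts=basic_texts))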
# A second snippet (NER) starts here; its opening line is reconstructed from the arguments.
processor = NERProcessor(
    tokenizer=tokenizer,
max_seq_len=8,
data_dir="samples/ner",
train_filename="train-sample.txt",
dev_filename="dev-sample.txt",
test_filename=None,
delimiter=" ",
label_list=ner_labels,
metric="seq_f1"
)
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1}
)
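# This NER snippet stops after the optimizer. A minimal sketch of the remaining training
# step, assuming the Trainer signature used elsewhere in these snippets:
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=1,
    n_gpu=1,
    lr_schedule=lr_schedule,
    device=device)
trainer.train()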
data_dir="samples/doc_class",
train_filename="train-sample.tsv",
label_list=["OTHER", "OFFENSE"],
metric="f1_macro",
dev_filename="test-sample.tsv",
test_filename=None,
dev_split=0.0,
label_column_name="coarse_label")
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)
language_model = Roberta.load(lang_model)
prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_sequence"],
device=device)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
schedule_opts=None)
# As above, the excerpt breaks off at the Trainer; completed with the same signature.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=1,
    n_gpu=1,
    lr_schedule=lr_schedule,
    evaluate_every=2,
    device=device)
trainer.train()
# Next snippet: language-model finetuning with two prediction heads.
processor = BertStyleLMProcessor(
data_dir="samples/lm_finetuning",
train_filename="train-sample.txt",
test_filename="test-sample.txt",
dev_filename=None,
tokenizer=tokenizer,
max_seq_len=12,
next_sent_pred=True
)
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
language_model = LanguageModel.load(lang_model)
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[lm_prediction_head, next_sentence_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token", "per_sequence"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=1,
device=device,
schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1})
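# A rough illustration (an assumption based on FARM's AdaptiveModel API, not part of the
# original snippet) of what the Trainer does per batch when two heads are attached:
# one forward pass, then the masked-LM and next-sentence losses are summed.
batch = next(iter(data_silo.loaders["train"]))
batch = {key: tensor.to(device) for key, tensor in batch.items()}
logits = model.forward(**batch)                      # [lm_logits, nsp_logits]
loss = model.logits_to_loss(logits=logits, **batch)  # per-sample loss, summed over heads
loss.mean().backward()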
# Only the tail of this helper's signature survived the excerpt; the full signature is
# reconstructed from how the parameters are used in the body (the name follows FARM's
# experiment helper of the same shape).
def get_adaptive_model(
    lm_output_type,
    prediction_heads,
    layer_dims,
    model,
    device,
    embeds_dropout_prob,
    class_weights=None,
):
parsed_lm_output_types = lm_output_type.split(",")
language_model = LanguageModel.load(model)
initialized_heads = []
for head_name in prediction_heads.split(","):
initialized_heads.append(
PredictionHead.create(
prediction_head_name=head_name,
layer_dims=layer_dims,
class_weights=class_weights,
)
)
model = AdaptiveModel(
language_model=language_model,
prediction_heads=initialized_heads,
embeds_dropout_prob=embeds_dropout_prob,
lm_output_types=parsed_lm_output_types,
device=device,
)
return model
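# A hypothetical call of the helper above, wiring two heads from comma-separated strings
# (all values are illustrative only):
adaptive = get_adaptive_model(
    lm_output_type="per_sequence,per_token",
    prediction_heads="TextClassificationHead,TokenClassificationHead",
    layer_dims=[768, 2],
    model="bert-base-cased",
    device="cpu",
    embeds_dropout_prob=0.1,
)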
def train_on_split(silo_to_use, n_fold, save_dir):
logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
# Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.2,
lm_output_types=["per_sequence"],
device=device)
# Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=0.5e-5,
    device=device,
    n_batches=len(silo_to_use.loaders["train"]),  # TODO
    n_epochs=n_epochs)
# Feed everything to the Trainer, which takes care of training the model and evaluates it from time to time
# Also create an EarlyStopping instance and pass it on to the trainer
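# A minimal sketch of those two remaining steps, assuming FARM's EarlyStopping/Trainer API
# (n_gpu and evaluate_every would come from the surrounding script; the metric name is
# illustrative):
earlystopping = EarlyStopping(
    metric="f1_macro", mode="max",  # stop when macro F1 stops improving
    save_dir=save_dir,              # best model of this fold is saved here
    patience=5)
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=silo_to_use,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device,
    early_stopping=earlystopping)
trainer.train()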
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"
# 1.Create a tokenizer
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model, do_lower_case=False
)
# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)
# 3. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
adaptive_model = AdaptiveModel(
language_model=language_model,
prediction_heads=[],
embeds_dropout_prob=0,
lm_output_types=["per_token", "per_sequence"],
device=device,
)
# 4. Extract embeddings with model in inference mode
basic_texts = [
{"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
{"text": "Martin Müller spielt Fussball"},
]
model = Inferencer(adaptive_model, processor, gpu=use_gpu)
result = model.extract_vectors(dicts=basic_texts)
print(result)
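# A small follow-up sketch, assuming each result entry carries its sentence embedding
# under a "vec" key (as in FARM's embeddings example):
import numpy as np

vec_a, vec_b = (np.asarray(r["vec"]) for r in result)
cos_sim = vec_a @ vec_b / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
print(f"Cosine similarity of the two sentence embeddings: {cos_sim:.3f}")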
# The question-answering snippet begins mid-call; the opening of the SquadProcessor
# construction is reconstructed (the values above dev_filename are assumptions based on
# FARM's SQuAD example).
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=384,
    label_list=label_list,
    metric="squad",
    train_filename=train_filename,
    dev_filename=dev_filename,
test_filename=None,
data_dir="../data/squad20",
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(base_LM_model)
# b) and a prediction head on top that is suited for our task => Question Answering
prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token"],
device=device,
)
# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=1e-5,
schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
device=device
)
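# After training (the step that would follow here, not shown in this excerpt), the model
# can answer questions. A sketch assuming FARM's Inferencer and its SQuAD-style input
# format; the save path, question, and context are illustrative:
from farm.infer import Inferencer

save_dir = "saved_models/qa"
model.save(save_dir)
processor.save(save_dir)

qa_input = [{
    "qas": ["Who develops FARM?"],
    "context": "FARM is an open-source transfer-learning framework developed by deepset.",
}]
inferencer = Inferencer.load(save_dir)
print(inferencer.inference_from_dicts(dicts=qa_input))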
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(
data_dir="../data/lm_finetune_nips", tokenizer=tokenizer, max_seq_len=128, max_docs=30
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
language_model=language_model,
prediction_heads=[lm_prediction_head, next_sentence_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_token", "per_sequence"],
device=device,
)
# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
device=device,
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
)
# 6. Feed everything to the Trainer, which takes care of training the model and evaluates it from time to time
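# A minimal sketch of that final step, assuming the Trainer signature used in the snippets
# above (n_gpu and evaluate_every are illustrative values):
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=1,
    lr_schedule=lr_schedule,
    evaluate_every=100,
    device=device)
trainer.train()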
# Excerpt from AdaptiveModel.__init__; the docstring below was cut off at the top, so only
# its opening quotes are restored here.
"""
:param language_model: Any model that turns token ids into vector representations
:type language_model: LanguageModel
:param prediction_heads: A list of models that take embeddings and return logits for a given task
:type prediction_heads: list
:param embeds_dropout_prob: The probability that a value in the embeddings returned by the
language model will be zeroed.
:type embeds_dropout_prob: float
:param lm_output_types: How to extract the embeddings from the final layer of the language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:type lm_output_types: list or str
:param device: The device on which this model will operate. Either "cpu" or "cuda".
"""
super(AdaptiveModel, self).__init__()
self.language_model = language_model.to(device)
self.prediction_heads = nn.ModuleList([ph.to(device) for ph in prediction_heads])
# set shared weights for LM finetuning
for head in self.prediction_heads:
if head.model_type == "language_modelling":
head.set_shared_weights(language_model.model.embeddings.word_embeddings.weight)
self.num_labels = [head.num_labels for head in prediction_heads]
self.dropout = nn.Dropout(embeds_dropout_prob)
self.lm_output_types = (
[lm_output_types] if isinstance(lm_output_types, str) else lm_output_types
)
self.log_params()
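# A minimal usage sketch for the constructor documented above (model name and layer
# dimensions are illustrative; the head classes appear in the snippets further up):
language_model = LanguageModel.load("bert-base-cased")
prediction_head = TextClassificationHead(layer_dims=[768, 2])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],  # one output type per prediction head
    device="cpu")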