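# ---------------------------------------------------------------------------------------
# The excerpts below show typical FARM (deepset) training setups: tokenizer -> processor ->
# DataSilo -> AdaptiveModel -> optimizer, for regression, LM finetuning, NER, single- and
# multi-label text classification and question answering. They all rely on a shared setup
# that the excerpts themselves omit. A minimal sketch of that setup, assuming FARM's
# standard module layout and with placeholder values for batch_size / n_epochs
# (task-specific processors and prediction heads come from farm.data_handler.processor and
# farm.modeling.prediction_head):
from farm.utils import set_all_seeds, initialize_device_settings
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer

set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 32   # placeholder
n_epochs = 1      # placeholder
# ---------------------------------------------------------------------------------------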
evaluate_every = 2
lang_model = "bert-base-cased"
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
processor = RegressionProcessor(
    tokenizer=tokenizer,
    max_seq_len=8,
    data_dir="samples/doc_regr",
    train_filename="train-sample.tsv",
    dev_filename="test-sample.tsv",
    test_filename=None,
    label_column_name="label")
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = RegressionHead(layer_dims=[768, 1])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence_continuous"],
    device=device
)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),  # closing arguments assumed; the excerpt breaks off after learning_rate
    n_epochs=n_epochs,
    device=device)
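# With model, optimizer and lr_schedule in place, training runs through FARM's Trainer.
# A minimal sketch (argument names follow FARM's Trainer API; in older FARM versions the
# model is passed to train() instead of the constructor):
from farm.train import Trainer

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device)
model = trainer.train()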
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)
tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])
processor = BertStyleLMProcessor(
    data_dir="samples/lm_finetuning",
    train_filename="train-sample.txt",
    test_filename="test-sample.txt",
    dev_filename=None,
    tokenizer=tokenizer,
    max_seq_len=12,
    next_sent_pred=True
)
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device
)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),  # trailing arguments assumed, mirroring the NER excerpt below
    n_epochs=n_epochs,
    device=device)
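# After LM finetuning, both the model and the processor (which carries the extended
# vocabulary) are usually saved together. A minimal sketch, with a placeholder directory:
save_dir = "saved_models/bert-lm-finetuned"
model.save(save_dir)
processor.save(save_dir)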
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(
    tokenizer=tokenizer,
    max_seq_len=8,
    data_dir="samples/ner",
    train_filename="train-sample.txt",
    dev_filename="dev-sample.txt",
    test_filename=None,
    delimiter=" ",
    label_list=ner_labels,
    metric="seq_f1"
)
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,   # assumed: the excerpt breaks off after n_batches; n_epochs/device follow the usual pattern
    device=device)
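# Once such a token-classification model has been trained and saved, it can be loaded back
# through FARM's Inferencer for prediction. A sketch, assuming the model was previously
# saved to the placeholder directory below:
from farm.infer import Inferencer

ner_inferencer = Inferencer.load("saved_models/bert-ner")
basic_texts = [{"text": "Albrecht Lehmann ist eine Person aus Bremen"}]
predictions = ner_inferencer.inference_from_dicts(dicts=basic_texts)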
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model)
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=8,
    data_dir="samples/doc_class",
    train_filename="train-sample.tsv",
    label_list=["OTHER", "OFFENSE"],
    metric="f1_macro",
    dev_filename="test-sample.tsv",
    test_filename=None,
    dev_split=0.0,
    label_column_name="coarse_label")
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
language_model = Roberta.load(lang_model)
prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],
    device=device)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
    n_batches=len(data_silo.loaders["train"]),   # assumed continuation; the excerpt breaks off here
    n_epochs=n_epochs,
    device=device)
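# The commented-out optimizer_opts line above points to FARM's configurable optimizer setup:
# initialize_optimizer also accepts explicit optimizer_opts / schedule_opts dicts. A sketch
# of that variant (the concrete option values are illustrative, not prescriptive):
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
    optimizer_opts={"name": "AdamW"},
    schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.1})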
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=8,
    data_dir="samples/doc_class",
    train_filename="train-sample.tsv",
    label_list=["OTHER", "OFFENSE"],
    metric="f1_macro",
    dev_filename="test-sample.tsv",
    test_filename=None,
    dev_split=0.0,
    label_column_name="coarse_label")
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],
    device=device)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
    n_batches=len(data_silo.loaders["train"]),   # assumed continuation, as in the RoBERTa excerpt above
    n_epochs=n_epochs,
    device=device)
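# FARM models need not stay in FARM: AdaptiveModel offers a conversion to Hugging Face
# transformers. A minimal sketch, assuming a recent FARM version (depending on the version,
# convert_to_transformers() returns a single transformers model or one model per prediction
# head); the save path is a placeholder:
converted = model.convert_to_transformers()
tokenizer.save_pretrained("saved_models/bert-germeval-hf")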
def mymetrics(preds, labels):
    # (the excerpt omits the lines computing acc, f1other and f1offense; f1_score is sklearn's)
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense, "f1_macro": f1macro, "f1_micro": f1micro}
register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=64,
    data_dir="../data/germeval18",
    label_list=label_list,
    metric=metric,
    label_column_name="coarse_label"
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
    layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
    class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.2,
    lm_output_types=["per_sequence"],
    device=device)
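# For this class-weighted GermEval setup, training is often wrapped with FARM's EarlyStopping
# so that the best checkpoint on the dev set is kept. A sketch (metric, patience and save_dir
# are illustrative; the object is passed to the Trainer via early_stopping=...):
from farm.train import EarlyStopping

earlystopping = EarlyStopping(
    metric="f1_macro",                      # track the macro F1 reported by the custom metric above
    mode="max",
    save_dir="saved_models/bert-germeval",  # best model is saved here
    patience=3)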
:param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
:type device: str
"""
eval_processor = SquadProcessor(
    tokenizer=self.inferencer.processor.tokenizer,
    max_seq_len=self.inferencer.processor.max_seq_len,
    label_list=self.inferencer.processor.tasks["question_answering"]["label_list"],
    metric=self.inferencer.processor.tasks["question_answering"]["metric"],
    train_filename=None,
    dev_filename=None,
    dev_split=0,
    test_filename=test_filename,
    data_dir=Path(data_dir),
)
data_silo = DataSilo(processor=eval_processor, batch_size=self.inferencer.batch_size, distributed=False)
data_loader = data_silo.get_data_loader("test")
evaluator = Evaluator(data_loader=data_loader, tasks=eval_processor.tasks, device=device)
eval_results = evaluator.eval(self.inferencer.model)
results = {
    "EM": eval_results[0]["EM"],
    "f1": eval_results[0]["f1"],
    "top_n_accuracy": eval_results[0]["top_n_accuracy"]
}
return results
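# This evaluation helper sits inside a reader class that wraps a FARM Inferencer (see the
# self.inferencer references above). A hedged usage sketch, assuming the surrounding class
# behaves like Haystack's FARMReader and that a SQuAD-format dev file exists locally
# (model name, import path and file names are illustrative assumptions):
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
metrics = reader.eval_on_file(data_dir="data/squad20", test_filename="dev-v2.0.json", device="cuda")
print(metrics["EM"], metrics["f1"], metrics["top_n_accuracy"])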
metric = "squad"
processor = SquadProcessor(
    tokenizer=self.inferencer.processor.tokenizer,
    max_seq_len=max_seq_len,
    label_list=label_list,
    metric=metric,
    train_filename=train_filename,
    dev_filename=dev_filename,
    dev_split=dev_split,
    test_filename=test_file_name,
    data_dir=Path(data_dir),
)
# 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
# and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
# Quick-fix until this is fixed upstream in FARM:
# We must avoid applying DataParallel twice (once when loading the inferencer,
# once when calling initialize_optimizer)
self.inferencer.model.save("tmp_model")
model = BaseAdaptiveModel.load(load_dir="tmp_model", device=device, strict=True)
shutil.rmtree('tmp_model')
# 3. Create an optimizer and pass the already initialized model
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    # model=self.inferencer.model,
    learning_rate=learning_rate,
    schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device)   # assumed closing argument; the excerpt breaks off after n_epochs
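# Once the optimizer is in place, fine-tuning runs through a FARM Trainer built exactly as
# in the first excerpt; afterwards the updated weights are typically written back into the
# wrapped inferencer and persisted. A sketch of that hand-off ("trainer" is that Trainer
# instance and save_dir a placeholder path; both are assumptions about the surrounding class):
model = trainer.train()
self.inferencer.model = model
self.inferencer.model.save(save_dir)
self.inferencer.processor.save(save_dir)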
lang_model = "bert-base-german-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de", metric="seq_f1", label_list=ner_labels
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    task_name="ner",
    layer_dims=[768, len(processor.tasks["ner"]["label_list"])])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)
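# The NERProcessor reads CoNLL-style text files: one token and its tag per line, separated
# by the processor's delimiter (the earlier NER excerpt sets delimiter=" "), with a blank
# line between sentences. A made-up illustration of that layout (not actual CoNLL03 data):
example_conll_snippet = (
    "Angela B-PER\n"
    "Merkel I-PER\n"
    "besucht O\n"
    "Berlin B-LOC\n"
    ". O\n"
    "\n"
)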
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="../data/toxic-comments",
    label_list=label_list,
    label_column_name="label",
    metric=metric,
    quote_char='"',
    multilabel=True,
    train_filename="train.tsv",
    dev_filename="val.tsv",
    test_filename=None,
    dev_split=0
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Roberta.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = MultiLabelTextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],
    device=device)
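# Unlike the single-label heads above, MultiLabelTextClassificationHead treats every label
# as an independent binary decision: logits are passed through a sigmoid and all labels
# whose probability clears a threshold are returned. A small stand-alone sketch of that
# decision rule (the 0.5 threshold is illustrative):
import torch

logits = torch.tensor([[2.1, -0.3, 0.8]])                        # one document, three toxic-comment labels
probs = torch.sigmoid(logits)
predicted_label_ids = (probs > 0.5).nonzero(as_tuple=True)[1]    # indices of all labels above the threshold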