def test_ner(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1",
    )
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

n_epochs = 2
evaluate_every = 500
base_LM_model = "albert-base-v1"
train_filename = "subsets/train_medium-v2.0.json"
dev_filename = "subsets/dev_medium-v2.0.json"
save_dir = "../saved_models/qa_medium_albert"
inference_file = "../data/squad20/subsets/dev_medium-v2.0.json"
predictions_file = save_dir + "/predictions.json"
full_predictions_file = save_dir + "/full_predictions.json"
max_processes_for_inference = 8
train = False
inference = True
if train:
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir="../data/squad20",
    )
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
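
    # NOTE: The excerpt ends here. A hedged sketch of how the training branch typically
    # continues, assuming FARM's standard classes (LanguageModel, QuestionAnsweringHead,
    # AdaptiveModel, Inferencer); batch_size, device and n_gpu are defined near the top of
    # the full script (not shown in the excerpt), and the Inferencer batch size below is
    # illustrative.
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # ... training via initialize_optimizer and a Trainer, as sketched in test_ner above ...
    model.save(save_dir)
    processor.save(save_dir)

if inference:
    # hedged sketch of the inference branch; `task_type` is accepted by recent FARM
    # versions, older ones infer the task from the saved processor config
    model = Inferencer.load(
        save_dir,
        batch_size=40,
        gpu=True,
        task_type="question_answering",
        max_processes=max_processes_for_inference,
    )
    # run extractive QA over the SQuAD-style dev subset; the results can then be written
    # to predictions_file / full_predictions_file defined above
    results = model.inference_from_file(file=inference_file)
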
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 500
lang_model = "bert-base-uncased"
do_lower_case = True
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the Toxic Comments data (multi-label text classification).
label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="../data/toxic-comments",
    label_list=label_list,
    label_column_name="label",
    metric=metric,
    quote_char='"',
)
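
# NOTE: The excerpt stops at the processor. A hedged sketch of the next steps for this
# multi-label setup, assuming FARM's standard API; the full example typically also passes
# multilabel=True to the processor, which the excerpt cuts off, and older FARM versions
# take layer_dims instead of num_labels for the head.
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],
    device=device,
)
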
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")
##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 100
evaluate_every = 20
lang_model = "bert-base-cased"
do_lower_case = False
# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the CoLA data (Corpus of Linguistic Acceptability).
label_list = ["0", "1"]
metric = "mcc"
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=64,
    data_dir="../data/cola",
    dev_filename="dev.tsv",
    dev_split=None,
    test_filename=None,
    label_list=label_list,
    metric=metric,
    label_column_name="label",
)

set_all_seeds(seed=42)
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(
    experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
)
##########################
########## Settings
##########################
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(
    data_dir="../data/lm_finetune_nips", tokenizer=tokenizer, max_seq_len=128, max_docs=30
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
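
# NOTE: The excerpt ends after loading the two heads. A minimal sketch of the next step,
# assuming FARM's AdaptiveModel API: combine the language model with both heads, where the
# masked-LM head produces per-token outputs and the next-sentence head per-sequence outputs.
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
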
# ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
# ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_minimal_example_ner")
##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de", metric="seq_f1", label_list=ner_labels
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
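
# b) and a prediction head on top that is suited for the task => NER
# NOTE: hedged sketch of the step the excerpt cuts off, following FARM's standard API;
# older FARM versions take layer_dims=[768, len(ner_labels)] instead of num_labels.
# Optimizer and Trainer setup then follow the same pattern as in the other examples.
prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)
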
# ml_logger = MLFlowLogger(tracking_uri="logs")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")
##########################
########## Settings
##########################
set_all_seeds(seed=42)
use_amp = None
device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
n_epochs = 20
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.
# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
# The evaluation on the dev set can be done with one of the predefined metrics or with a
# metric defined as a function from (preds, labels) to a dict that contains all the actual
# metric values. The function must be registered under a string name, and that string name
# is then used as the metric.
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense}
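
# NOTE: hedged sketch of how a custom metric is registered in FARM so it can later be
# referenced by its string name (e.g. when creating the processor); register_metrics
# lives in farm.evaluation.metrics in recent FARM versions.
from farm.evaluation.metrics import register_metrics
register_metrics("mymetrics", mymetrics)
metric = "mymetrics"
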
"""
Infers the specific type of Processor from a config file (e.g. GNADProcessor) and loads an instance of it.
:param load_dir: str, directory that contains a 'processor_config.json'
:return: An instance of a Processor Subclass (e.g. GNADProcessor)
"""
# read config
processor_config_file = os.path.join(load_dir, "processor_config.json")
config = json.load(open(processor_config_file))
# init tokenizer
if "lower_case" in config.keys():
logger.warning("Loading tokenizer from deprecated FARM config. "
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore.")
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
else:
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]
processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)
for task_name, task in config["tasks"].items():
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
if processor is None:
raise Exception
return processor
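
# NOTE: hedged usage sketch, assuming the method above is Processor.load_from_dir from
# farm.data_handler.processor; "saved_model_dir" is a hypothetical directory created by an
# earlier processor.save(...) call.
from farm.data_handler.processor import Processor

processor = Processor.load_from_dir("saved_model_dir")
print(processor.tasks)  # tasks restored from processor_config.json
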
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression")
##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# We do not have a sample dataset for regression yet, add your own dataset to run the example
processor = RegressionProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="../data/",
    label_column_name="label",
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size,
)
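
# NOTE: hedged sketch of how the regression example typically continues, assuming FARM's
# standard API; RegressionHead and the "per_sequence_continuous" output type follow FARM's
# regression setup, and the remaining optimizer/Trainer steps mirror the other examples.
language_model = LanguageModel.load(lang_model)
prediction_head = RegressionHead()
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence_continuous"],
    device=device,
)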