How to use the farm.modeling.tokenization.Tokenizer class in farm

To help you get started, we've selected a few FARM examples based on popular ways Tokenizer is used in public projects.
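
Before diving into the examples, here is a minimal sketch of the core call (assuming FARM and its dependencies are installed; the model name and sample sentence are placeholders). Tokenizer.load infers and returns the matching Hugging Face tokenizer for the given model, so the usual tokenizer methods are available on the returned object:

from farm.modeling.tokenization import Tokenizer

# Load the tokenizer that matches the pretrained model
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased", do_lower_case=False
)

# Standard Hugging Face tokenizer methods work on the returned object
tokens = tokenizer.tokenize("FARM makes transfer learning simple.")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)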

github deepset-ai / FARM / test / test_ner.py
import logging

from farm.data_handler.processor import NERProcessor
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_ner(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
github deepset-ai / FARM / test / test_lm_finetuning.py
import logging

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import BertStyleLMProcessor
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import BertLMHead
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
github deepset-ai / FARM / examples / question_answering_debug.py
n_epochs = 2
batch_size = 50  # assumed value; defined earlier in the full example but not shown in this excerpt
evaluate_every = 500
base_LM_model = "albert-base-v1"
train_filename = "subsets/train_medium-v2.0.json"
dev_filename = "subsets/dev_medium-v2.0.json"
save_dir = "../saved_models/qa_medium_albert"
inference_file = "../data/squad20/subsets/dev_medium-v2.0.json"
predictions_file = save_dir + "/predictions.json"
full_predictions_file = save_dir + "/full_predictions.json"
max_processes_for_inference = 8
train = False
inference = True

if train:
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir="../data/squad20",
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
github deepset-ai / FARM / examples / doc_classification_multilabel.py
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32

evaluate_every = 500
lang_model = "bert-base-uncased"
do_lower_case = True

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the Toxic Comments dataset (multilabel classification).

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
github deepset-ai / FARM / examples / doc_classification_cola.py
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 100
evaluate_every = 20
lang_model = "bert-base-cased"
do_lower_case = False

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the CoLA (Corpus of Linguistic Acceptability) data.

label_list = ["0", "1"]
metric = "mcc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/cola",
                                        dev_filename="dev.tsv",
                                        dev_split=None,
                                        test_filename=None,
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="label"
github deepset-ai / FARM / examples / lm_finetuning.py
set_all_seeds(seed=42)
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(
    experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
)
##########################
########## Settings
##########################
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(
    data_dir="../data/lm_finetune_nips", tokenizer=tokenizer, max_seq_len=128, max_docs=30
)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
github deepset-ai / FARM / examples / ner.py
# ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
# ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_minimal_example_ner")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

processor = NERProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de", metric="seq_f1", label_list=ner_labels
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
github deepset-ai / FARM / examples / doc_classification_with_earlystopping.py
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
# for local logging instead: ml_logger = MLFlowLogger(tracking_uri="logs")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
use_amp = None
device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
n_epochs = 20
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]

# The evaluation on the dev set can be done with one of the predefined metrics, or with a
# custom metric: a function from (preds, labels) to a dict that contains all the actual
# metric values. The function must be registered under a string name, and that string
# name is then used to refer to it.
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense}

# register the custom metric under the string name the processor will reference
register_metrics("mymetrics", mymetrics)
github deepset-ai / FARM / farm / data_handler / processor.py
"""
         Infers the specific type of Processor from a config file (e.g. GNADProcessor) and loads an instance of it.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of a Processor Subclass (e.g. GNADProcessor)
        """
        # read config
        processor_config_file = os.path.join(load_dir, "processor_config.json")
        with open(processor_config_file) as f:
            config = json.load(f)
        # init tokenizer
        if "lower_case" in config.keys():
            logger.warning("Loading tokenizer from deprecated FARM config. "
                           "If you used `custom_vocab` or `never_split_chars`, this won't work anymore.")
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
        else:
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])

        # delete the tokenizer class name from the config, since we pass the tokenizer object itself
        del config["tokenizer"]

        processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)
        if processor is None:
            raise Exception(f"Could not load a Processor of type '{config['processor']}'")

        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])

        return processor
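
A short usage sketch of the loading side (the save_dir path is hypothetical; the directory must contain a processor_config.json, e.g. one written earlier by processor.save(save_dir)):

from farm.data_handler.processor import Processor

# hypothetical directory, previously created via processor.save(save_dir)
save_dir = "saved_models/my_processor"

# infers the concrete Processor subclass from processor_config.json and rebuilds it,
# including the tokenizer stored alongside it
processor = Processor.load_from_dir(save_dir)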
github deepset-ai / FARM / examples / doc_regression.py
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
#    We do not have a sample dataset for regression yet; add your own dataset to run the example
processor = RegressionProcessor(tokenizer=tokenizer,
                                max_seq_len=128,
                                data_dir="../data/",
                                label_column_name="label"
                                )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)