How to use the farm.modeling.tokenization.tokenize_with_metadata function in farm

To help you get started, we’ve selected a few farm examples, based on popular ways this function is used in public projects.
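
For orientation, here is a minimal usage sketch. It mirrors the test snippets below (Tokenizer.load with do_lower_case=False is taken from them); treat it as an illustration rather than the canonical API reference. tokenize_with_metadata returns a dict with aligned "tokens", "offsets" and "start_of_word" lists.

# Minimal sketch, assuming FARM is installed and the model can be downloaded.
from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",
    do_lower_case=False
)
tokenized = tokenize_with_metadata(text="Some Text with subwords", tokenizer=tokenizer)

print(tokenized["tokens"])         # subword tokens from the underlying tokenizer
print(tokenized["offsets"])        # character offset of each token in the original text
print(tokenized["start_of_word"])  # True where a token starts a whitespace-separated word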

deepset-ai / FARM / test / test_tokenization.py (view on GitHub)
"This is a sentence	with tab",
    "This is a sentence			with multiple tabs",
    ]

    for tokenizer in tokenizers:
        for text in texts:
            # Important: we don't assume whitespace is preserved after tokenization.
            # This means \t, \n, " " etc. all resolve to a single " ".
            # That makes no difference for BERT + XLNet, but it does for RoBERTa.

            # 1. original tokenize function from transformer repo on full sentence
            standardized_whitespace_text = ' '.join(text.split())  # collapse whitespace runs into single spaces
            tokenized = tokenizer.tokenize(standardized_whitespace_text)

            # 2. our tokenizer with metadata on "whitespace tokenized words"
            tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)

            # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
            assert tokenized_meta["tokens"] == tokenized, f"Failed using {tokenizer.__class__.__name__}"

            # verify that offsets align back to original text
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] that are impossible to match back to original text space
                continue
            for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
                # Subword tokens carry model-specific marker chars; strip them to align with the original text.
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                #tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[offset:offset+len(tok)]
                assert tok == original_tok, f"Offset alignment wrong for {tokenizer.__class__.__name__} and text '{text}'"

deepset-ai / FARM / test / test_tokenization.py (view on GitHub)
    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        t.add_tokens(new_tokens=["neverseentokens"])
        tokenizers.append(t)

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    for tokenizer in tokenizers:
        save_dir = "testsave"
        tokenizer_type = tokenizer.__class__.__name__
        tokenizer.save_pretrained(save_dir)
        tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type)
        tokenized_before = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
        tokenized_after = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer_loaded)
        assert tokenized_before == tokenized_after

deepset-ai / FARM / test / test_tokenization.py (view on GitHub)
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False
        )

    #deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt")
    tokenizer.add_tokens(new_tokens=["neverseentokens"])

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]

deepset-ai / FARM / test / test_tokenization.py (view on GitHub)
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False
        )

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == ['Some', 'Text', 'with', 'never', '##see', '##nto', '##ken', '##s', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [0, 5, 10, 15, 20, 23, 26, 29, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert tokenized_meta["start_of_word"] == [True, True, True, True, False, False, False, False, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]

deepset-ai / FARM / farm / data_handler / processor.py (view on GitHub)
    def apply_tokenization(self, dictionary):
        """ This performs tokenization on all documents and questions. The result is a list (unnested)
        where each entry is a dictionary for one document-question pair (potentially mutliple answers). """

        raw_baskets = []
        document_text = dictionary["context"]
        document_tokenized = tokenize_with_metadata(document_text, self.tokenizer)
        document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
        questions = dictionary["qas"]
        for question in questions:
            squad_id = question["id"]
            question_text = question["question"]
            question_tokenized = tokenize_with_metadata(question_text, self.tokenizer)
            question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]
            answers = []
            for answer in question["answers"]:
                a = {"text": answer["text"],
                     "offset": answer["answer_start"]}
                answers.append(a)
            raw = {"document_text": document_text,
                   "document_tokens": document_tokenized["tokens"],
                   "document_offsets": document_tokenized["offsets"],
                   "document_start_of_word": document_start_of_word,
                   "question_text": question_text,
                   "question_tokens": question_tokenized["tokens"],
                   "question_offsets": question_tokenized["offsets"],
                   "question_start_of_word": question_start_of_word,
                   "answers": answers,
                   "is_impossible": question["is_impossible"],

deepset-ai / FARM / farm / data_handler / processor.py (view on GitHub)
        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer
                )
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer
                )
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=tokenized["text_b"][seq_name],
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": None,

deepset-ai / FARM / farm / data_handler / processor.py (view on GitHub)
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        # this tokenization also stores offsets
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                           tokenizer=self.tokenizer,
                                                           max_seq_len=self.max_seq_len)
        # Samples don't have labels during Inference mode
        if "label" in dictionary:
            label = float(dictionary["label"])
            scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
            dictionary["label"] = scaled_label
        return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

deepset-ai / FARM / farm / data_handler / processor.py (view on GitHub)
        assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
        doc = dictionary["doc"]
        samples = []

        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer
                )
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer
                )
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=tokenized["text_b"][seq_name],
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]