How to use the ludwig.utils.strings_utils.BaseTokenizer class in ludwig

To help you get started, we've selected a few ludwig examples, based on popular ways it is used in public projects.

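Every tokenizer in the snippets below follows the same pattern: subclass BaseTokenizer and implement __call__ so that an instance maps a raw string to a list of token strings. The real tokenizers delegate the work to process_text and a spaCy pipeline obtained from load_nlp_pipeline, with flags such as return_lemma, filter_numbers, filter_punctuation, filter_short_tokens, and filter_stopwords controlling what happens to the tokens. As a quick orientation, here is a minimal, hypothetical sketch; the class name and the lowercase-and-split logic are made up for illustration and are not part of ludwig:

from ludwig.utils.strings_utils import BaseTokenizer


class WhitespaceLowercaseTokenizer(BaseTokenizer):
    # Hypothetical example: lowercase the text, then split on whitespace.
    def __call__(self, text):
        return text.lower().split()


tokenizer = WhitespaceLowercaseTokenizer()
print(tokenizer('Ludwig makes declarative deep learning simple'))
# ['ludwig', 'makes', 'declarative', 'deep', 'learning', 'simple']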

github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class ItalianRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            filter_stopwords=True
        )


class ItalianLemmatizeTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            return_lemma=True
        )


class ItalianLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class ItalianLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            return_lemma=True,
            filter_stopwords=True
        )


class SpanishTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('es'))


class SpanishFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('es'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class SpanishRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('es'),
            filter_stopwords=True
        )
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class GermanLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('de'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class GermanLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('de'),
            return_lemma=True,
            filter_stopwords=True
        )


class FrenchTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('fr'))


class FrenchFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('fr'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class FrenchRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('fr'),
            filter_stopwords=True
        )
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class NorwegianTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('nb'))


class NorwegianFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nb'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class NorwegianRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nb'),
            filter_stopwords=True
        )


class NorwegianLemmatizeTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('nb'), return_lemma=True)


class NorwegianLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nb'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class DutchLemmatizeTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('nl'), return_lemma=True)


class DutchLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nl'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class DutchLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nl'),
            return_lemma=True,
            filter_stopwords=True
        )


class GreekTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('el'))


class GreekFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('el'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class EnglishTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('en'))


class EnglishFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('en'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class EnglishRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('en'),
            filter_stopwords=True
        )


class EnglishLemmatizeTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('en'), return_lemma=True)


github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class MultiLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('xx'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class MultiLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('xx'),
            return_lemma=True,
            filter_stopwords=True
        )


class BERTTokenizer(BaseTokenizer):
    def __init__(self, vocab_file=None, **kwargs):

        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer'
            )

        try:
            from bert.tokenization import FullTokenizer
        except ImportError:
            raise ValueError(
                "Please install bert-tensorflow: pip install bert-tensorflow"
            )

        self.tokenizer = FullTokenizer(vocab_file)
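
The excerpt cuts off right after the vocabulary setup. As a rough usage sketch, assuming the rest of the class delegates __call__ to the wrapped FullTokenizer (the vocabulary path and sample sentence are illustrative, not real files):

# 'bert_vocab.txt' is a placeholder path to a local WordPiece vocabulary file.
bert_tokenizer = BERTTokenizer(vocab_file='bert_vocab.txt')

# Assumes __call__ (not shown in the excerpt) returns the WordPiece tokens
# produced by bert.tokenization.FullTokenizer.
tokens = bert_tokenizer('Ludwig supports BERT-style text encoders.')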
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)


class EnglishLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('en'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class EnglishLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('en'),
            return_lemma=True,
            filter_stopwords=True
        )


class ItalianTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('it'))


class ItalianFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
github uber/ludwig, ludwig/utils/strings_utils.py (View on Github)
class FrenchLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('fr'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class FrenchLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('fr'),
            return_lemma=True,
            filter_stopwords=True
        )


class PortugueseTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('pt'))


class PortugueseFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('pt'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )


class PortugueseRemoveStopwordsTokenizer(BaseTokenizer):