class ItalianRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            filter_stopwords=True
        )
class ItalianLemmatizeTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('it'),
return_lemma=True
)
class ItalianLemmatizeFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('it'),
return_lemma=True,
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class ItalianLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('it'),
return_lemma=True,
filter_stopwords=True
)
class SpanishTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('es'))
class SpanishFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('es'),
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class SpanishRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('es'),
            filter_stopwords=True
        )
class GermanLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('de'),
return_lemma=True,
filter_stopwords=True
)
class FrenchTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('fr'))
class FrenchFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('fr'),
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class FrenchRemoveStopwordsTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('fr'),
            filter_stopwords=True
        )
class NorwegianTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('nb'))
class NorwegianFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('nb'),
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class NorwegianRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('nb'),
filter_stopwords=True
)
class NorwegianLemmatizeTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('nb'), return_lemma=True)
class NorwegianLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('nb'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
class DutchLemmatizeTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('nl'), return_lemma=True)
class DutchLemmatizeFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('nl'),
return_lemma=True,
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class DutchLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('nl'),
return_lemma=True,
filter_stopwords=True
)
class GreekTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('el'))
class GreekFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('el'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
class EnglishTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(text, load_nlp_pipeline('en'))
class EnglishFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('en'),
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class EnglishRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('en'),
filter_stopwords=True
)
class EnglishLemmatizeTokenizer(BaseTokenizer):
def __call__(self, text):
        return process_text(text, load_nlp_pipeline('en'), return_lemma=True)
class EnglishLemmatizeFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('en'),
            return_lemma=True,
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
class MultiLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('xx'),
return_lemma=True,
filter_stopwords=True
)
class BERTTokenizer(BaseTokenizer):
def __init__(self, vocab_file=None, **kwargs):
if vocab_file is None:
raise ValueError(
'Vocabulary file is required to initialize BERT tokenizer'
)
try:
from bert.tokenization import FullTokenizer
except ImportError:
raise ValueError(
"Please install bert-tensorflow: pip install bert-tensorflow"
)
        self.tokenizer = FullTokenizer(vocab_file)

    def __call__(self, text):
        # Assumed reconstruction of the truncated __call__: wrap the WordPiece
        # tokens with the [CLS]/[SEP] markers BERT expects.
        return ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
class EnglishLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('en'),
return_lemma=True,
filter_stopwords=True
)
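# Usage sketch (illustrative only): the spaCy-based tokenizers above take no
# constructor arguments and are simply called on a raw string; process_text()
# runs the pipeline returned by load_nlp_pipeline() and applies the selected
# filters. The sample text and output below are hypothetical and assume the
# spaCy model behind load_nlp_pipeline('en') is installed.
#
#   tokenizer = EnglishLemmatizeRemoveStopwordsTokenizer()
#   tokens = tokenizer('The cats were sitting on the mats')
#   # e.g. ['cat', 'sit', 'mat']  (lemmas with stopwords removed)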
class ItalianTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('it'))
class ItalianFilterTokenizer(BaseTokenizer):
    def __call__(self, text):
        return process_text(
            text,
            load_nlp_pipeline('it'),
            filter_numbers=True,
            filter_punctuation=True,
            filter_short_tokens=True
        )
class FrenchLemmatizeRemoveStopwordsTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('fr'),
return_lemma=True,
filter_stopwords=True
)
class PortugueseTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(text, load_nlp_pipeline('pt'))
class PortugueseFilterTokenizer(BaseTokenizer):
def __call__(self, text):
return process_text(
text,
load_nlp_pipeline('pt'),
filter_numbers=True,
filter_punctuation=True,
filter_short_tokens=True
)
class PortugueseRemoveStopwordsTokenizer(BaseTokenizer):