How to use nlpaug - 10 common examples

To help you get started, we’ve selected a few nlpaug examples based on popular ways it is used in public projects. The snippets below are all drawn from the library’s own test suite in the makcedward/nlpaug repository.

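Because these excerpts come from test files, they rely on names imported at the top of each file rather than shown inline. They assume imports roughly along the following lines; the exact aliases are an assumption based on how the nlpaug tests are typically written, so adjust them to the version you have installed.

    import os
    from dotenv import load_dotenv             # python-dotenv, used only by the setUpClass example

    import nlpaug.augmenter.word as naw        # word-level augmenters (BertAug, TfIdfAug, SpellingAug, ...)
    import nlpaug.model.lang_models as nml     # language-model wrappers such as nml.Bert
    from nlpaug.util import Action             # Action.INSERT / Action.SUBSTITUTE constants

As a quick orientation before the test excerpts, here is a minimal usage sketch for recent nlpaug releases (the augmenter and sentence are illustrative, not taken from the tests):

    aug = naw.SynonymAug(aug_src='wordnet')    # replace words with WordNet synonyms
    print(aug.augment('The quick brown fox jumps over the lazy dog'))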

makcedward/nlpaug - test/augmenter/word/test_context_word_embs.py (view on GitHub)
def oov(self, augs):
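        # deliberately long nonsense token so every model vocabulary treats it as out-of-vocabulary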
        unknown_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        texts = [
            unknown_token,
            unknown_token + ' the'
        ]

        for aug in augs:
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = aug.augment(text)
                if aug.action == Action.INSERT:
                    self.assertLess(len(text.split(' ')), len(augmented_text.split(' ')))
                elif aug.action == Action.SUBSTITUTE:
                    self.assertEqual(len(text.split(' ')), len(augmented_text.split(' ')))
                else:
                    raise Exception('Augmenter is neither INSERT nor SUBSTITUTE')

                if aug.model_type not in ['roberta']:
                    self.assertTrue(aug.model.SUBWORD_PREFIX not in augmented_text)
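This `oov` helper receives its augmenters from other tests in the same class. In recent nlpaug releases, contextual augmenters like these are built with `naw.ContextualWordEmbsAug`; a minimal sketch of constructing comparable INSERT and SUBSTITUTE augmenters (the model name is an illustrative assumption):

    import nlpaug.augmenter.word as naw

    augs = [
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='insert'),
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='substitute'),
    ]
    # augment() returns the augmented text (the newest releases return a list of results)
    print(augs[0].augment('The quick brown fox jumps over the lazy dog'))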
makcedward/nlpaug - test/augmenter/word/test_tfidf.py (view on GitHub)
def test_oov(self):
        unknown_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        texts = [
            unknown_token,
            unknown_token + ' the'
        ]

        augmenters = [
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.INSERT),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.SUBSTITUTE)
        ]

        for aug in augmenters:
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = aug.augment(text)
                if aug.action == Action.INSERT:
                    self.assertLess(len(text.split(' ')), len(augmented_text.split(' ')))
                    self.assertNotEqual(text, augmented_text)
                elif aug.action == Action.SUBSTITUTE:
                    self.assertEqual(len(text.split(' ')), len(augmented_text.split(' ')))

                    if unknown_token == text:
                        self.assertEqual(text, augmented_text)
                    else:
                        self.assertNotEqual(text, augmented_text)
makcedward/nlpaug - test/augmenter/word/test_bert.py (view on GitHub)
def test_oov(self):
        unknown_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        texts = [
            unknown_token,
            unknown_token + ' the'
        ]

        augmenters = [
            naw.BertAug(action=Action.INSERT),
            naw.BertAug(action=Action.SUBSTITUTE)
        ]

        for aug in augmenters:
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = aug.augment(text)
                if aug.action == Action.INSERT:
                    self.assertLess(len(text.split(' ')), len(augmented_text.split(' ')))
                elif aug.action == Action.SUBSTITUTE:
                    self.assertEqual(len(text.split(' ')), len(augmented_text.split(' ')))
                else:
                    raise Exception('Augmenter is neither INSERT nor SUBSTITUTE')

                self.assertNotEqual(text, augmented_text)
                self.assertTrue(nml.Bert.SUBWORD_PREFIX not in augmented_text)
makcedward/nlpaug - test/augmenter/word/test_spelling.py (view on GitHub)
def test_substitute_stopwords(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]

        stopwords = [t.lower() for t in texts[0].split(' ')[:3]]
        aug_n = 3

        aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt', stopwords=stopwords)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            augmented_tokens = aug.tokenizer(augmented_text)
            tokens = aug.tokenizer(text)

            augmented_cnt = 0

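            # stopword tokens longer than aug_n characters must come back unchanged; every other token counts as augmented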
            for token, augmented_token in zip(tokens, augmented_tokens):
                if token.lower() in stopwords and len(token) > aug_n:
                    self.assertEqual(token.lower(), augmented_token)
                else:
                    augmented_cnt += 1
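In current nlpaug releases `SpellingAug` bundles its spelling-error dictionary, so `dict_path` is optional; a minimal sketch of the same stopword-protection idea against the packaged dictionary (parameter values are illustrative):

    import nlpaug.augmenter.word as naw

    text = 'The quick brown fox jumps over the lazy dog'
    stopwords = ['the', 'quick', 'brown']       # tokens the augmenter must leave unchanged

    aug = naw.SpellingAug(stopwords=stopwords)  # falls back to the bundled spelling dictionary
    print(aug.augment(text))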
makcedward/nlpaug - test/augmenter/word/test_context_word_embs.py (view on GitHub)
def insert(self, aug):
        self.assertLess(0, len(self.text))
        augmented_text = aug.augment(self.text)

        self.assertLess(len(self.text.split(' ')), len(augmented_text.split(' ')))
        self.assertNotEqual(self.text, augmented_text)
        self.assertTrue(nml.Bert.SUBWORD_PREFIX not in augmented_text)
makcedward/nlpaug - test/augmenter/word/test_tfidf.py (view on GitHub)
def test_insert(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]

        aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.INSERT)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            self.assertLess(len(text.split(' ')), len(augmented_text.split(' ')))
            self.assertNotEqual(text, augmented_text)

        self.assertLess(0, len(texts))
makcedward/nlpaug - test/augmenter/word/test_bert.py (view on GitHub)
def test_insert(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]

        aug = naw.BertAug(action=Action.INSERT)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            self.assertLess(len(text.split(' ')), len(augmented_text.split(' ')))
            self.assertNotEqual(text, augmented_text)
            self.assertTrue(nml.Bert.SUBWORD_PREFIX not in augmented_text)

        self.assertLess(0, len(texts))
makcedward/nlpaug - test/augmenter/word/test_wordembs.py (view on GitHub)
def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.insert_augmenters = [
            naw.Word2vecAug(
                model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
                action=Action.INSERT),
            naw.FasttextAug(
                model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
                action=Action.INSERT),
            naw.GloVeAug(
                model_path=os.environ.get("MODEL_DIR") + 'glove.6B.50d.txt',
                action=Action.INSERT)
        ]

        cls.substitute_augmenters = [
            naw.Word2vecAug(
                model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
                action=Action.SUBSTITUTE),
            naw.FasttextAug(
                model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
                action=Action.SUBSTITUTE),
            naw.GloVeAug(
                model_path=os.environ.get("MODEL_DIR") + 'glove.6B.50d.txt',
                action=Action.SUBSTITUTE)
        ]
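Newer nlpaug releases fold `Word2vecAug`, `FasttextAug` and `GloVeAug` into a single `naw.WordEmbsAug` selected by `model_type`; a minimal sketch, assuming the same pre-trained vector files are available under `MODEL_DIR`:

    import os
    import nlpaug.augmenter.word as naw

    model_dir = os.environ.get("MODEL_DIR")

    insert_aug = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=model_dir + 'GoogleNews-vectors-negative300.bin',
        action='insert')

    substitute_aug = naw.WordEmbsAug(
        model_type='glove',
        model_path=model_dir + 'glove.6B.50d.txt',
        action='substitute')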
makcedward/nlpaug - test/augmenter/word/test_fasttext.py (view on GitHub)
def test_insert(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]

        aug = naw.FasttextAug(
            model_path=os.environ.get("MODEL_DIR")+'wiki-news-300d-1M.vec',
            action=Action.INSERT)

        for text in texts:
            tokens = aug.tokenizer(text)
            results = aug.augment(text)

            self.assertLess(len(tokens), len(results))
            self.assertLess(0, len(tokens))

        self.assertLess(0, len(texts))
makcedward/nlpaug - test/augmenter/word/test_fasttext.py (view on GitHub)
def test_substitute(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]

        aug = naw.FasttextAug(
            model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
            action=Action.SUBSTITUTE)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            self.assertNotEqual(text, augmented_text)

        self.assertLess(0, len(texts))