# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def oov(self, augs):
    """Shared out-of-vocabulary check for a list of augmenters.

    INSERT must grow the whitespace token count, SUBSTITUTE must keep
    it constant; any other action aborts the test.  Non-RoBERTa models
    must never leak their subword prefix into the augmented output.
    """
    # Token long enough that no model vocabulary can contain it.
    oov_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
    samples = [
        oov_token,
        oov_token + ' the'
    ]
    for augmenter in augs:
        for sample in samples:
            self.assertLess(0, len(sample))
            result = augmenter.augment(sample)
            src_len = len(sample.split(' '))
            dst_len = len(result.split(' '))
            if augmenter.action == Action.INSERT:
                self.assertLess(src_len, dst_len)
            elif augmenter.action == Action.SUBSTITUTE:
                self.assertEqual(src_len, dst_len)
            else:
                raise Exception('Augmenter is neither INSERT or SUBSTITUTE')
            # RoBERTa intentionally keeps its prefix; everyone else must strip it.
            if augmenter.model_type not in ['roberta']:
                self.assertTrue(augmenter.model.SUBWORD_PREFIX not in result)
def test_oov(self):
    """TfIdfAug on out-of-vocabulary input.

    INSERT must add tokens and change the text.  SUBSTITUTE keeps the
    token count; a lone OOV token cannot be substituted (tf-idf has no
    entry for it) and must come back unchanged, while mixed input must
    differ after augmentation.
    """
    oov_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
    samples = [
        oov_token,
        oov_token + ' the'
    ]
    augmenters = [
        naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.INSERT),
        naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.SUBSTITUTE)
    ]
    for augmenter in augmenters:
        for sample in samples:
            self.assertLess(0, len(sample))
            result = augmenter.augment(sample)
            src_len = len(sample.split(' '))
            dst_len = len(result.split(' '))
            if augmenter.action == Action.INSERT:
                self.assertLess(src_len, dst_len)
                self.assertNotEqual(sample, result)
            elif augmenter.action == Action.SUBSTITUTE:
                self.assertEqual(src_len, dst_len)
                if sample == oov_token:
                    self.assertEqual(sample, result)
                else:
                    self.assertNotEqual(sample, result)
def test_oov(self):
    """BertAug on out-of-vocabulary input.

    Unlike tf-idf, BERT can always propose something, so both INSERT
    and SUBSTITUTE must change the text; INSERT additionally grows the
    token count while SUBSTITUTE preserves it.  The subword prefix must
    never appear in the output.
    """
    oov_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
    samples = [
        oov_token,
        oov_token + ' the'
    ]
    augmenters = [
        naw.BertAug(action=Action.INSERT),
        naw.BertAug(action=Action.SUBSTITUTE)
    ]
    for augmenter in augmenters:
        for sample in samples:
            self.assertLess(0, len(sample))
            result = augmenter.augment(sample)
            src_len = len(sample.split(' '))
            dst_len = len(result.split(' '))
            if augmenter.action == Action.INSERT:
                self.assertLess(src_len, dst_len)
            elif augmenter.action == Action.SUBSTITUTE:
                self.assertEqual(src_len, dst_len)
            else:
                raise Exception('Augmenter is neither INSERT or SUBSTITUTE')
            self.assertNotEqual(sample, result)
            self.assertTrue(nml.Bert.SUBWORD_PREFIX not in result)
def test_substitute_stopwords(self):
    """SpellingAug must leave configured stopwords untouched.

    The first three tokens of the sample sentence (lower-cased) are
    registered as stopwords; any such token longer than ``aug_n`` must
    survive augmentation unchanged.  All remaining tokens are counted
    as augmentation candidates.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog'
    ]
    stopwords = [t.lower() for t in texts[0].split(' ')[:3]]
    aug_n = 3  # minimum stopword length for the skip check
    aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt', stopwords=stopwords)
    for text in texts:
        self.assertLess(0, len(text))
        augmented_text = aug.augment(text)
        augmented_tokens = aug.tokenizer(augmented_text)
        tokens = aug.tokenizer(text)
        augmented_cnt = 0
        for token, augmented_token in zip(tokens, augmented_tokens):
            if token.lower() in stopwords and len(token) > aug_n:
                self.assertEqual(token.lower(), augmented_token)
            else:
                augmented_cnt += 1
        # Bug fix: the counter was accumulated but never checked, so the
        # non-stopword path asserted nothing. At least one token must be
        # eligible for augmentation.
        self.assertLess(0, augmented_cnt)
def insert(self, aug):
    """Shared INSERT assertion helper for ``self.text``.

    Augmentation must add whitespace tokens, produce different text,
    and never expose BERT's subword prefix.
    """
    source = self.text
    self.assertLess(0, len(source))
    result = aug.augment(source)
    self.assertLess(len(source.split(' ')), len(result.split(' ')))
    self.assertNotEqual(source, result)
    self.assertTrue(nml.Bert.SUBWORD_PREFIX not in result)
def test_insert(self):
    """TfIdfAug INSERT must add tokens and change each sample text."""
    samples = [
        'The quick brown fox jumps over the lazy dog'
    ]
    augmenter = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action=Action.INSERT)
    for sample in samples:
        self.assertLess(0, len(sample))
        result = augmenter.augment(sample)
        self.assertLess(len(sample.split(' ')), len(result.split(' ')))
        self.assertNotEqual(sample, result)
    # Guard against the fixture list being accidentally emptied.
    self.assertLess(0, len(samples))
def test_insert(self):
    """BertAug INSERT must add tokens, change the text, and emit no
    subword prefix."""
    samples = [
        'The quick brown fox jumps over the lazy dog'
    ]
    augmenter = naw.BertAug(action=Action.INSERT)
    for sample in samples:
        self.assertLess(0, len(sample))
        result = augmenter.augment(sample)
        self.assertLess(len(sample.split(' ')), len(result.split(' ')))
        self.assertNotEqual(sample, result)
        self.assertTrue(nml.Bert.SUBWORD_PREFIX not in result)
    # Guard against the fixture list being accidentally emptied.
    self.assertLess(0, len(samples))
def setUpClass(cls):
    # Loads .env from three directories above this test file, then builds
    # the shared word-embedding augmenters (Word2vec, Fasttext, GloVe) in
    # both INSERT and SUBSTITUTE flavours for the whole test class.
    # NOTE(review): a unittest setUpClass normally carries @classmethod;
    # the decorator is not visible in this chunk - confirm it exists upstream.
    env_config_path = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', '..', '..', '.env'))
    load_dotenv(env_config_path)
    cls.insert_augmenters = [
        naw.Word2vecAug(
            model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
            action=Action.INSERT),
        naw.FasttextAug(
            model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
            action=Action.INSERT),
        naw.GloVeAug(
            model_path=os.environ.get("MODEL_DIR") + 'glove.6B.50d.txt',
            action=Action.INSERT)
    ]
    cls.substitute_augmenters = [
        naw.Word2vecAug(
            model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
            action=Action.SUBSTITUTE),
        naw.FasttextAug(
            model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
            action=Action.SUBSTITUTE),
        naw.GloVeAug(
        # NOTE(review): the source is truncated here - the GloVeAug(...)
        # arguments and the closing of this list are missing from this
        # chunk; restore them from the upstream file before running.
def test_insert(self):
    """FasttextAug INSERT must yield more output than source tokens."""
    samples = [
        'The quick brown fox jumps over the lazy dog'
    ]
    augmenter = naw.FasttextAug(
        model_path=os.environ.get("MODEL_DIR")+'wiki-news-300d-1M.vec',
        action=Action.INSERT)
    for sample in samples:
        source_tokens = augmenter.tokenizer(sample)
        augmented = augmenter.augment(sample)
        self.assertLess(len(source_tokens), len(augmented))
        self.assertLess(0, len(source_tokens))
    # Guard against the fixture list being accidentally emptied.
    self.assertLess(0, len(samples))
def test_substitute(self):
    """FasttextAug SUBSTITUTE must change each sample text."""
    samples = [
        'The quick brown fox jumps over the lazy dog'
    ]
    augmenter = naw.FasttextAug(
        model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
        action=Action.SUBSTITUTE)
    for sample in samples:
        self.assertLess(0, len(sample))
        result = augmenter.augment(sample)
        self.assertNotEqual(sample, result)
    # Guard against the fixture list being accidentally emptied.
    self.assertLess(0, len(samples))