Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'Zology raku123456 fasdasd asd4123414 1234584'
]
flows = [
naf.Sequential([
naf.Sometimes([nac.RandomCharAug(action="insert"),
nac.RandomCharAug(action="delete")],
pipeline_p=0.9),
naf.Sequential([
nac.RandomCharAug(action="substitute", aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6)
], name='Sub_Seq')
]),
naf.Sometimes([
naf.Sometimes([nac.RandomCharAug(action="insert"),
nac.RandomCharAug(action="delete")]),
naf.Sequential([nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
nac.RandomCharAug(action="substitute", aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6)])
], pipeline_p=0.9)
]
# The pipeline probability may be low enough that no augmentation is performed on a given call, so retry up to 5 times.
for flow in flows:
for text in texts:
at_least_one_not_equal = False
for _ in range(5):
augmented_text = flow.augment(text, n=1)
if text != augmented_text:
at_least_one_not_equal = True
break
self.assertTrue(at_least_one_not_equal)
def test_multiple_actions(self):
    """Check that every ``Sometimes`` pipeline alters at least one sample text.

    The pipelines are built with ``pipeline_p`` below 1, so a single call
    may hand the text back unchanged. Each pipeline is therefore run five
    times over all sample texts, and the test requires that at least one of
    those calls produced output different from its input.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    flows = [
        naf.Sometimes(
            [
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE),
            ],
            pipeline_p=0.8,
        ),
        naf.Sometimes(
            [
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                nac.RandomCharAug(
                    action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE),
            ],
            pipeline_p=0.6,
        ),
    ]

    # A single call may skip augmentation entirely, so give every pipeline
    # five passes over the sample texts before judging it.
    for flow in flows:
        at_least_one_not_equal = False
        for _ in range(5):
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = flow.augment(text)
                if text != augmented_text:
                    at_least_one_not_equal = True

        # The flag must actually be checked; otherwise this test can never fail.
        self.assertTrue(at_least_one_not_equal)
def test_multiple_actions(self):
    """Run each ``Sequential`` pipeline on the sample texts and expect changed output.

    Every pipeline/text combination must yield a result that differs from
    the input; the fixture lists themselves are also checked to be non-empty.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    # Character insertion followed by a random word-level augmenter.
    insert_then_word = naf.Sequential([
        nac.RandomCharAug(action=Action.INSERT),
        naw.RandomWordAug(),
    ])
    # OCR noise, keyboard noise, then character substitution.
    ocr_keyboard_substitute = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_char_min=1),
        nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
    ])
    pipelines = [insert_then_word, ocr_keyboard_substitute]

    for pipeline in pipelines:
        for sample in samples:
            result = pipeline.augment(sample)
            self.assertNotEqual(sample, result)
            self.assertLess(0, len(sample))
        self.assertLess(0, len(samples))
    self.assertLess(0, len(pipelines))
naf.Sequential([
nac.OcrAug(),
naw.WordEmbsAug(
model_type='word2vec',
model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
]),
naf.Sequential([
nac.RandomCharAug(),
]),
naw.ContextualWordEmbsAug(
model_path='xlnet-base-cased', action="substitute",
skip_unknown_word=True, temperature=0.7, device='cpu')
]),
naf.Sometimes([
naf.Sequential([
nac.OcrAug(),
nac.RandomCharAug(),
]),
naf.Sometimes([
naw.WordEmbsAug(model_type='word2vec',
model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
], pipeline_p=0.999),
naw.ContextualWordEmbsAug(
model_path='xlnet-base-cased', action="substitute",
skip_unknown_word=True, temperature=0.7, device='cpu')
], pipeline_p=0.9999)
]
for num_thread in [1, 3]:
for flow in flows:
augmented_data = flow.augment(text, n=n, num_thread=num_thread)
self.assertEqual(len(augmented_data), n)
def test_empty(self):
    """Blank input ('' or None) must be returned as-is by each character augmenter."""
    blank_inputs = ['', None]
    augmenters = [nac.OcrAug(), nac.KeyboardAug()]

    for blank in blank_inputs:
        for augmenter in augmenters:
            # The augmenter's output is compared directly against the blank input.
            self.assertEqual(blank, augmenter.augment(blank))
def test_n_output_without_augmentation(self):
    """Request ``n=3`` outputs from pipelines that should not alter the text.

    Each pipeline is called five times with ``n=3``; every returned item is
    compared against the input and none of them may differ.

    NOTE(review): the expected outcome ("nothing changes") is inferred from
    the test name, the all-'A' input and the tiny ``pipeline_p`` - confirm
    against the augmenters' behaviour.
    """
    texts = [
        'AAAAAAAAAAA AAAAAAAAAAAAAA',
    ]
    flows = [
        naf.Sequential([
            nac.OcrAug(),
            nac.OcrAug(),
        ]),
        # pipeline_p is tiny, so this pipeline should practically never apply its augmenters.
        naf.Sometimes([
            nac.RandomCharAug(),
            nac.RandomCharAug(),
        ], pipeline_p=0.00001),
    ]

    for flow in flows:
        for text in texts:
            for _ in range(5):
                augmented_texts = flow.augment(text, n=3)
                any_changed = any(
                    augmented_text != text for augmented_text in augmented_texts)
                # Previously the flag was computed but never asserted, so the
                # test could not fail; verify it on every attempt.
                self.assertFalse(any_changed)
def test_ocr_single_word_nonexist_char(self):
    """Single words made only of 'A' or 'K' must pass through ``OcrAug`` unchanged.

    Presumably these characters have no OCR substitution candidates (as the
    test name suggests) - verify against the augmenter's mapping.
    """
    words = ['AAAAA', 'KKKKK']
    ocr = OcrAug()

    for word in words:
        self.assertEqual(word, ocr.augment(word))

    # Guard against an accidentally empty fixture list.
    self.assertTrue(len(words) > 0)
def test_multi_thread(self):
text = 'The quick brown fox jumps over the lazy dog'
n = 3
flows = [
naf.Sequential([
naf.Sequential([
nac.OcrAug(),
naw.WordEmbsAug(
model_type='word2vec',
model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
]),
naf.Sequential([
nac.RandomCharAug(),
]),
naw.ContextualWordEmbsAug(
model_path='xlnet-base-cased', action="substitute",
skip_unknown_word=True, temperature=0.7, device='cpu')
]),
naf.Sometimes([
naf.Sequential([
nac.OcrAug(),
nac.RandomCharAug(),
]),