import unittest

# In gensim 3.x, FastTextKeyedVectors is importable from gensim.models.keyedvectors;
# the .vocab attribute used below is also gensim 3.x API.
from gensim.models.keyedvectors import FastTextKeyedVectors

from danlp.download import MODELS, download_model, _unzip_process_func
from danlp.models.embeddings import (AVAILABLE_EMBEDDINGS,
                                     AVAILABLE_SUBWORD_EMBEDDINGS,
                                     load_wv_with_gensim)


class TestEmbeddings(unittest.TestCase):
    # Class name is assumed here; the original snippet only shows the methods.

    def test_embeddings_with_gensim(self):
        # self.embeddings_for_testing is assumed to be set in setUp() (not shown).
        # Every listed embedding should load and expose the registered vocabulary size.
        for emb in self.embeddings_for_testing:
            embeddings = load_wv_with_gensim(emb)
            self.assertEqual(MODELS[emb]['vocab_size'], len(embeddings.vocab))
    def test_fasttext_embeddings(self):
        # First we will add a smaller test embedding to the MODELS registry
        MODELS['ddt.swv'] = {
            'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/ddt.swv.zip',
            'vocab_size': 5000,
            'dimensions': 100,
            'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
            'size': 741125088,
            'file_extension': '.bin'
        }
        AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

        download_model('ddt.swv', process_func=_unzip_process_func)
        fasttext_embeddings = load_wv_with_gensim('ddt.swv')
        self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

        # The word is not in the vocab
        self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)

        # However we can still get an embedding for it because of subword units
        self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)
def load_wv_models():
    # Yield (name, model) pairs for every registered static word embedding
    for da_wv_model in AVAILABLE_EMBEDDINGS:
        yield da_wv_model, load_wv_with_gensim(da_wv_model)
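
# A minimal, hypothetical way to consume the generator above when running the
# module directly; this block is a sketch and not part of the danlp test suite.
if __name__ == '__main__':
    for name, wv in load_wv_models():
        print(name, 'vocabulary size:', len(wv.vocab))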