    # Metadata for the small Danish Wikipedia test embedding
    'vocab_size': 5000,
    'dimensions': 300,
    'md5_checksum': 'fcaa981a613b325ae4dc61aba235aa82',
    'size': 5594508,
    'file_extension': '.bin'
}
AVAILABLE_EMBEDDINGS.append('wiki.da.small.wv')

# Embeddings exercised by the tests
self.embeddings_for_testing = [
    'wiki.da.small.wv',
    'dslreddit.da.wv'
]
# Let's download the models and unzip them
for emb in self.embeddings_for_testing:
    download_model(emb, process_func=_unzip_process_func)
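# Usage sketch (assumption, not part of the original test): once downloaded,
# the vectors can be loaded as a gensim KeyedVectors object. The helper
# load_wv_with_gensim and the query word are assumed here for illustration,
# and the word must be in the small 5,000-token vocabulary.
from danlp.models.embeddings import load_wv_with_gensim

word_vecs = load_wv_with_gensim('wiki.da.small.wv')
print(word_vecs.most_similar('københavn', topn=3))  # nearest neighbours in embedding space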
def test_download(self):
    # Download the Danish spaCy model beforehand
    model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                                process_func=_unzip_process_func,
                                verbose=True)

    info = spacy.info(model_path)
    self.assertListEqual(info['pipeline'], ['tagger', 'parser', 'ner'])
    self.assertEqual(info['lang'], 'da')
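# Usage sketch (assumption, not part of the test): the path returned by
# download_model can be loaded directly with spacy.load and applied to
# Danish text; the example sentence is illustrative.
import spacy

nlp = spacy.load(model_path)
doc = nlp("Jens Hansen havde en bondegård i Jylland.")
for token in doc:
    print(token.text, token.pos_, token.ent_type_)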
        binary=False,
        encoding='utf8')

# Sanity-check the loaded vectors, then re-save them in binary word2vec format
assert_wv_dimensions(word_vecs, pretrained_embedding)
word_vecs.save_word2vec_format(bin_file_path, binary=True)

# Clean up the intermediate text files
os.remove(org_vec_file)
os.remove(new_vec_file)
elif pretrained_embedding == 'dslreddit.da.wv':
    _process_dslreddit(tmp_file_path, cache_dir)

elif pretrained_embedding == 'wiki.da.swv':
    _unzip_process_func(tmp_file_path, clean_up_raw_data, verbose,
                        file_in_zip='wiki.da.bin')

elif pretrained_embedding == 'cc.da.swv':
    import gzip
    import shutil

    bin_file_path = os.path.join(cache_dir, pretrained_embedding + ".bin")
    if verbose:
        print("Decompressing raw {} embeddings".format(pretrained_embedding))

    # Decompress the gzipped fastText binary into the cache directory
    with gzip.open(tmp_file_path, 'rb') as fin, \
            open(bin_file_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)

elif pretrained_embedding == 'sketchengine.da.swv':
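# Separate usage sketch (assumption, not part of the branch above): after the
# cc.da.swv branch has written bin_file_path, the decompressed fastText binary
# can be loaded with gensim. The danlp loader itself may use a different API;
# this only illustrates one way to read the resulting .bin file.
from gensim.models.fasttext import load_facebook_vectors

fasttext_vectors = load_facebook_vectors(bin_file_path)
# Out-of-vocabulary words still get a vector via subword information
print(fasttext_vectors['køreplansændringer'][:5])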
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # Download the models (or resolve the cached paths)
    path_sub = download_model('bert.subjective', cache_dir,
                              process_func=_unzip_process_func, verbose=verbose)
    path_sub = os.path.join(path_sub, 'bert.sub.v0.0.1')
    path_pol = download_model('bert.polarity', cache_dir,
                              process_func=_unzip_process_func, verbose=verbose)
    path_pol = os.path.join(path_pol, 'bert.pol.v0.0.1')

    # One tokenizer/classifier pair for subjectivity, one for polarity
    self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
    self.model_sub = BertForSequenceClassification.from_pretrained(path_sub)
    self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
    self.model_pol = BertForSequenceClassification.from_pretrained(path_pol)
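# Usage sketch (assumptions: the surrounding class is instantiated as
# `analyser`, a transformers version where model outputs expose `.logits`,
# and an illustrative input sentence; the class id returned is an index into
# the model's polarity labels, whose mapping is not shown here).
import torch

inputs = analyser.tokenizer_pol("Det er en rigtig god film.",
                                return_tensors='pt', truncation=True)
with torch.no_grad():
    logits = analyser.model_pol(**inputs).logits
predicted_class = int(torch.argmax(logits, dim=-1))
print(predicted_class)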
def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads a flair model for part-of-speech tagging.

    :param cache_dir: the directory for storing cached models
    :param verbose: `True` to increase verbosity
    :return: a flair part-of-speech tagging model
    """
    from flair.models import SequenceTagger

    model_weight_path = download_model('flair.pos', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)

    # Load the downloaded weights into a flair SequenceTagger
    flair_model = SequenceTagger.load(model_weight_path)

    return flair_model
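# Usage sketch (assumption: plain flair API, illustrative sentence): tag a
# Danish sentence with the loaded part-of-speech model.
from flair.data import Sentence

tagger = load_flair_pos_model()
sentence = Sentence("Jeg hopper på en bil, som er rød.")
tagger.predict(sentence)
print(sentence.to_tagged_string())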
def load_flair_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads a flair model for named entity recognition.

    :param cache_dir: the directory for storing cached models
    :param verbose: `True` to increase verbosity
    :return: a flair named entity recognition model
    """
    from flair.models import SequenceTagger

    model_weight_path = download_model('flair.ner', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)

    # Load the downloaded weights into a flair SequenceTagger
    flair_model = SequenceTagger.load(model_weight_path)

    return flair_model
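# Usage sketch (assumption: standard flair API, with 'ner' as the span label
# type; the sentence and tag access are illustrative).
from flair.data import Sentence

ner_tagger = load_flair_ner_model()
sentence = Sentence("Mette Frederiksen besøgte København i går.")
ner_tagger.predict(sentence)
for entity in sentence.get_spans('ner'):
    print(entity.text, entity.tag)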