Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def fit(self, documents,
        alg='cbow', min_count=5, size=300, max_features=10000, window=5):
    """Train word embeddings on ``documents`` and mark the instance as fitted.

    The backend is chosen by ``self.flavor``:

    * ``'w2v'`` -- builds a ``Word2Vec`` model and saves it to ``self.path``.
    * ``'ft'``  -- writes the documents to a temporary text file (one
      space-joined document per line) and trains ``fasttext.cbow`` or
      ``fasttext.skipgram`` on it, with ``self.path`` as the output.

    Any other flavor trains nothing; the bookkeeping attributes below are
    still updated, as in the original implementation.

    Args:
        documents: iterable of documents, each an iterable of string tokens.
        alg: ``'cbow'`` or ``'sg'`` (skip-gram).
        min_count: forwarded to the backend as ``min_count``.
        size: embedding dimensionality (``size`` for Word2Vec, ``dim`` for
            fastText); also stored on ``self.size``.
        max_features: forwarded to Word2Vec as ``max_vocab_size``; not used
            by the fastText branch.
        window: context window (``window`` for Word2Vec, ``ws`` for fastText).

    Returns:
        ``self``, so calls can be chained.

    Raises:
        ValueError: if ``alg`` is neither ``'cbow'`` nor ``'sg'``.
    """
    # Local imports keep this block self-contained regardless of what the
    # rest of the module imports.
    import os
    import tempfile

    # Explicit validation instead of `assert`, which disappears under -O.
    if alg not in ('cbow', 'sg'):
        raise ValueError(
            "alg must be 'cbow' or 'sg', got {!r}".format(alg))

    if self.flavor == 'w2v':
        self.model = Word2Vec(
            documents, min_count=min_count, size=size, window=window,
            max_vocab_size=max_features, sg=0 if alg == 'cbow' else 1)
        self.model.save(self.path)
    elif self.flavor == 'ft':
        trainer = fasttext.cbow if alg == 'cbow' else fasttext.skipgram
        # A unique temporary file replaces the fixed '/tmp/skiptrain.txt':
        # concurrent fits no longer clobber each other's corpus, the path is
        # not predictable, and it works where /tmp does not exist.
        corpus = tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', delete=False)
        try:
            # Close the file before training so the data is flushed to disk.
            with corpus:
                corpus.writelines(
                    ' '.join(doc) + '\n' for doc in documents)
            self.model = trainer(
                input_file=corpus.name, output=self.path,
                min_count=min_count, dim=size, ws=window)
        finally:
            # The corpus dump is only needed while training runs.
            os.remove(corpus.name)

    self.size = size
    # Zero vector of the embedding size, stored as the instance default.
    self.default = np.zeros(self.size, dtype='float64')
    self.fitted = True
    return self
# NOTE(review): the enclosing `def` line and the original indentation are
# missing from this excerpt. The statements below read `self`, `model_type`
# and `input_filename`; presumably these are the method's receiver and
# parameters -- confirm against the full source.
# NOTE(review): as far as this excerpt shows, the CBOW filename constant is
# used for the output regardless of `model_type` -- verify that is intended.
model_filename = FAST_TEXT_CBOW_MODEL_FILENAME
# Resolve both names through self.full_filename before training.
full_model_filename = self.full_filename(model_filename)
full_input_filename = self.full_filename(input_filename)
# Train with the fastText routine selected by `model_type`; any value other
# than 'skipgram' or 'cbow' raises ValueError.
if model_type == 'skipgram':
self.logger.info(
'Training fasttext skipgram model on {} to {}'.format(
full_input_filename, full_model_filename))
self.model = fasttext.skipgram(
full_input_filename, full_model_filename)
elif model_type == 'cbow':
self.logger.info(
'Training fasttext cbow model on {} to {}'.format(
full_input_filename, full_model_filename))
self.model = fasttext.cbow(
full_input_filename, full_model_filename)
else:
raise ValueError('Wrong argument to model_type')
# Invalidate computed normalized matrix
self._normalized_matrix = None
def train(inp="wiki.he.text", out_model="wiki.he.fasttext.model",
          alg="CBOW"):
    """Train a fastText model on ``inp`` and report vocabulary and timing.

    Args:
        inp: path of the training text passed to fastText.
        out_model: output name passed to fastText and to ``model.save``.
        alg: ``"skipgram"`` selects ``fasttext.skipgram``; every other value
            (including the default ``"CBOW"``) selects ``fasttext.cbow``.

    Prints the trained model's ``words`` followed by the elapsed wall-clock
    seconds, then calls ``model.save(out_model)``. Returns ``None``.
    """
    began = time.time()
    # Only the exact string "skipgram" switches away from the CBOW trainer.
    fit_vectors = fasttext.skipgram if alg == "skipgram" else fasttext.cbow
    vectors = fit_vectors(inp, out_model)
    print(vectors.words)  # list of words in dictionary
    elapsed = time.time() - began
    print(elapsed)
    vectors.save(out_model)
def train_embedding_fasttext():
    """Train four fastText models: skip-gram and CBOW for chars and words.

    Inputs and outputs are built by prefixing ``model_dir`` to the names in
    the table below. Every run uses ``word_ngrams=2, ws=5, min_count=10,
    dim=256``. Each returned model object is dropped as soon as its training
    call returns. Returns ``None``.
    """
    # (fasttext trainer name, training corpus, output model name), in the
    # order the runs are executed.
    runs = (
        ('skipgram', 'train_char.txt', 'char2vec_fastskip256'),
        ('cbow', 'train_char.txt', 'char2vec_fastcbow256'),
        ('skipgram', 'train_word.txt', 'word2vec_fastskip256'),
        ('cbow', 'train_word.txt', 'word2vec_fastcbow256'),
    )
    for algo, corpus, target in runs:
        fit_vectors = getattr(fasttext, algo)
        trained = fit_vectors(
            model_dir + corpus, model_dir + target,
            word_ngrams=2, ws=5, min_count=10, dim=256)
        # Release the model before the next run starts.
        del trained
def train_embedding_fasttext():
    """Fit skip-gram and CBOW fastText embeddings for chars, then words.

    For each corpus (``train_char.txt`` first, ``train_word.txt`` second,
    both under ``model_dir``) the skip-gram model is trained before the CBOW
    model. All four runs share ``word_ngrams=2, ws=5, min_count=10,
    dim=256``. The returned model objects are not kept. Returns ``None``.
    """
    options = {'word_ngrams': 2, 'ws': 5, 'min_count': 10, 'dim': 256}
    corpora = (
        ('train_char.txt', 'char2vec_fastskip256', 'char2vec_fastcbow256'),
        ('train_word.txt', 'word2vec_fastskip256', 'word2vec_fastcbow256'),
    )
    for corpus, skip_target, cbow_target in corpora:
        # Skipgram model; the result is discarded immediately.
        fasttext.skipgram(
            model_dir + corpus, model_dir + skip_target, **options)
        # CBOW model; the result is discarded immediately.
        fasttext.cbow(
            model_dir + corpus, model_dir + cbow_target, **options)