import logging

from numpy import zeros, ones, float32 as REAL
from wordfreq import get_frequency_dict

logger = logging.getLogger(__name__)

def _induce_frequencies(self, domain: int = 2**31 - 1):
    """Induce frequencies for a pretrained model, as not all pretrained models come with frequencies.

    Parameters
    ----------
    domain : int
        The cumulative count of the vocabulary.

    """
    # get_frequency_dict (from the wordfreq package) returns relative word frequencies in (0, 1].
    freq_dict = get_frequency_dict(self.lang_freq, wordlist='best')
    for word in self.wv.index2word:
        if word in freq_dict:
            # Scale the relative frequency to an integer pseudo-count over the domain.
            self.wv.vocab[word].count = int(freq_dict[word] * domain)
        else:
            # Fall back to a tiny pseudo-count for words missing from the frequency table.
            self.wv.vocab[word].count = int(1e-8 * domain)
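
For context, wordfreq's get_frequency_dict returns relative frequencies in (0, 1], so multiplying by domain turns them into integer pseudo-counts. The sketch below applies the same induction step to a plain Gensim KeyedVectors object outside the class; it assumes the pre-4.0 Gensim API used throughout this snippet (index2word, vocab[word].count), and the vector file name is a hypothetical placeholder.

from gensim.models import KeyedVectors
from wordfreq import get_frequency_dict

vectors = KeyedVectors.load("pretrained_vectors.kv")  # hypothetical path
freq_dict = get_frequency_dict("en", wordlist="best")

domain = 2**31 - 1  # same cumulative count used by _induce_frequencies above
for word in vectors.index2word:
    # Scale the relative frequency to an integer pseudo-count; tiny fallback for missing words.
    vectors.vocab[word].count = int(freq_dict.get(word, 1e-8) * domain)
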
def precompute_sif_weights(wv, alpha=1e-3, no_frequency=False, lang="en"):
    """Precompute the SIF weights used in the weighted vector summation.

    Parameters
    ----------
    wv : gensim.models.keyedvectors.BaseKeyedVectors
        A Gensim keyed-vectors instance that contains the word vectors and the vocabulary.
    alpha : float, optional
        Parameter used to weigh each individual word based on its probability p(w).
        If alpha = 0, the model computes a plain (unweighted) average of the word embeddings.
    no_frequency : bool, optional
        Use the commonly available frequency table if the Gensim model does not contain
        information about the frequency of the words (see model.wv.vocab.count).
    lang : str, optional
        Determines the language of the frequency table used to compute the weights.

    Returns
    -------
    numpy.ndarray
        The vector of weights for all words in the model vocabulary.

    """
logger.info("pre-computing SIF weights")
if no_frequency:
logger.info("no frequency mode: using wordfreq for estimation (lang=%s)",lang)
freq_dict = get_frequency_dict(str(lang), wordlist='best')
for w in wv.index2word:
if w in freq_dict:
wv.vocab[w].count = int(freq_dict[w] * (2**31 -1))
else:
wv.vocab[w].count = 1
if alpha > 0:
corpus_size = 0
# Set the dtype correct for cython estimation
sif = zeros(shape=len(wv.vocab), dtype=REAL)
for k in wv.index2word:
# Compute normalization constant
corpus_size += wv.vocab[k].count
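        # Completion sketch (an assumption, not shown above): the standard SIF weighting
        # from Arora et al. (2017) assigns each word w the weight alpha / (alpha + p(w)),
        # where p(w) = count / corpus_size.
        for idx, k in enumerate(wv.index2word):
            pw = wv.vocab[k].count / corpus_size
            sif[idx] = alpha / (alpha + pw)
    else:
        # With alpha = 0 the weights collapse to a plain (unweighted) average.
        sif = ones(shape=len(wv.vocab), dtype=REAL)

    return sif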