# Discard words less-frequent than min_count
if not dry_run:
    self.index2word = []
    # make stored settings match these applied settings
    self.min_count = min_count
    self.sample = sample
    self.vocab = {}
drop_unique, drop_total, retain_total, original_total = 0, 0, 0, 0
retain_words = []
for word, v in iteritems(self.raw_vocab):
    if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
        retain_words.append(word)
        retain_total += v
        original_total += v
        if not dry_run:
            self.vocab[word] = Vocab(count=v, index=len(self.index2word))
            self.index2word.append(word)
    else:
        drop_unique += 1
        drop_total += v
        original_total += v
logger.info("min_count=%d retains %i unique words (drops %i)",
            min_count, len(retain_words), drop_unique)
logger.info("min_count leaves %i word corpus (%i%% of original %i)",
            retain_total, retain_total * 100 / max(original_total, 1), original_total)
# Precalculate each vocabulary item's threshold for sampling
if not sample:
    # no words downsampled
    threshold_count = retain_total
elif sample < 1.0:
    # traditional meaning: set parameter as proportion of total
    threshold_count = sample * retain_total
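
# --- Illustrative sketch (not part of the snippet above): how a threshold_count
# --- like the one computed here is typically turned into a per-word keep
# --- probability, following the standard word2vec subsampling formula.
# --- `toy_keep_probability` and the toy counts are made-up names for illustration.
from math import sqrt

def toy_keep_probability(count, threshold_count):
    # probability of keeping a single occurrence of a word seen `count` times
    p = (sqrt(count / threshold_count) + 1) * (threshold_count / count)
    return min(1.0, p)

for word, count in {"the": 50000, "huffman": 12}.items():
    # with sample=0.001 over a 60000-word retained corpus, threshold_count = 60.0
    print(word, toy_keep_probability(count, threshold_count=0.001 * 60000))
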
def create_binary_tree(self):
    """
    Create a binary Huffman tree using stored vocabulary word counts. Frequent words
    will have shorter binary codes. Called internally from `build_vocab()`.
    """
    logger.info("constructing a huffman tree from %i words", len(self.vocab))

    # build the huffman tree
    heap = list(itervalues(self.vocab))
    heapq.heapify(heap)
    for i in xrange(len(self.vocab) - 1):
        min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
        heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.vocab), left=min1, right=min2))

    # recurse over the tree, assigning a binary code to each vocabulary word
    if heap:
        max_depth, stack = 0, [(heap[0], [], [])]
        while stack:
            node, codes, points = stack.pop()
            if node.index < len(self.vocab):
                # leaf node => store its path from the root
                node.code, node.point = codes, points
                max_depth = max(len(codes), max_depth)
            else:
                # inner node => continue recursion
                points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32)
                stack.append((node.left, array(list(codes) + [0], dtype=uint8), points))
                stack.append((node.right, array(list(codes) + [1], dtype=uint8), points))
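
# --- Illustrative sketch (toy standalone example, not library code): the same
# --- heap-based Huffman construction on plain tuples, showing that the most
# --- frequent word receives the shortest binary code. Names are made up.
import heapq

def toy_huffman_codes(freqs):
    # heap entries: (count, tiebreaker, {word: code accumulated from leaf to root})
    heap = [(count, i, {word: ""}) for i, (word, count) in enumerate(freqs.items())]
    heapq.heapify(heap)
    tiebreaker = len(heap)
    while len(heap) > 1:
        c1, _, left = heapq.heappop(heap)
        c2, _, right = heapq.heappop(heap)
        merged = {w: "0" + code for w, code in left.items()}
        merged.update({w: "1" + code for w, code in right.items()})
        heapq.heappush(heap, (c1 + c2, tiebreaker, merged))
        tiebreaker += 1
    return heap[0][2] if heap else {}

print(toy_huffman_codes({"the": 100, "cat": 20, "sat": 15, "mat": 5}))
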
def finalize_vocab(self):
    """Build tables and model weights based on final vocabulary settings."""
    if not self.index2word:
        self.scale_vocab()
    if self.sorted_vocab:
        self.sort_vocab()
    if self.hs:
        # add info about each word's Huffman encoding
        self.create_binary_tree()
    if self.negative:
        # build the table for drawing random words (for negative sampling)
        self.make_cum_table()
    if self.null_word:
        # create null pseudo-word for padding when using concatenative L1 (run-of-words)
        # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter
        word, v = '\0', Vocab(count=1, sample_int=0)
        v.index = len(self.vocab)
        self.index2word.append(word)
        self.vocab[word] = v
    # set initial input/projection and hidden weights
    self.reset_weights()
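
# --- Illustrative sketch (assumed standalone version of the cumulative table
# --- referenced by make_cum_table above, used for negative sampling; the 0.75
# --- exponent is the usual word2vec unigram smoothing, names are made up).
import numpy as np

def toy_cum_table(counts, power=0.75, domain=2**31 - 1):
    # cumulative smoothed unigram distribution, scaled to integer `domain`;
    # a random integer r in [0, domain) maps to word index searchsorted(cum, r)
    weights = np.array(counts, dtype=np.float64) ** power
    return (np.cumsum(weights / weights.sum()) * domain).round().astype(np.uint32)

cum = toy_cum_table([100, 20, 15, 5])
sampled_index = np.searchsorted(cum, np.random.randint(cum[-1]))
print(cum, sampled_index)
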
def add_word(word, weights):
    word_id = len(result.vocab)
    if word in result.vocab:
        logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
        return
    if counts is None:
        # most common scenario: no vocab file given. just make up some bogus counts, in descending order
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    elif word in counts:
        # use count from the vocab file
        result.vocab[word] = Vocab(index=word_id, count=counts[word])
    else:
        # vocab file given, but word is missing -- set count to None (TODO: or raise?)
        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
        result.vocab[word] = Vocab(index=word_id, count=None)
    result.syn0[word_id] = weights
    result.index2word.append(word)
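
# --- Illustrative sketch (assumed context): `add_word` above is a closure over
# --- `result`, `counts`, `fname` and `vocab_size` from its enclosing loader.
# --- Below is a self-contained toy loader for a text-format vector file that
# --- mimics the same bookkeeping (ToyResult and the sample text are made up).
import io
import numpy as np

class ToyResult(object):
    def __init__(self, vocab_size, vector_size):
        self.vocab, self.index2word = {}, []
        self.syn0 = np.zeros((vocab_size, vector_size), dtype=np.float32)

text = "2 3\nthe 0.1 0.2 0.3\ncat 0.4 0.5 0.6\n"      # header: vocab_size vector_size
fin = io.StringIO(text)
vocab_size, vector_size = map(int, fin.readline().split())
result = ToyResult(vocab_size, vector_size)
for line in fin:
    parts = line.rstrip().split(" ")
    word, weights = parts[0], [np.float32(x) for x in parts[1:]]
    word_id = len(result.vocab)                        # toy stand-in for Vocab(index=..., count=...)
    result.vocab[word] = word_id
    result.syn0[word_id] = weights
    result.index2word.append(word)
print(result.index2word)
print(result.syn0)
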