def to_array(self):
    # Build one LabeledSentence per input line, tagged with its source prefix.
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix]))
    return self.sentences
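For context, a hedged sketch of driving a class with this to_array method and feeding the result to Doc2Vec, assuming the older gensim API in which LabeledSentence and the size parameter exist; the class name LabeledLineSentence and the file paths are illustrative:

from gensim.models import Doc2Vec

sources = {'train-pos.txt': 'TRAIN_POS', 'train-neg.txt': 'TRAIN_NEG'}
corpus = LabeledLineSentence(sources)  # the class that defines to_array() above
sentences = corpus.to_array()          # one LabeledSentence per line, tagged with its source prefix

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, workers=4)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=10)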
if binary:
    binary_len = dtype(REAL).itemsize * vector_size
    for line_no in xrange(vocab_size):
        # mixed text and binary: read text first, then binary
        word = []
        while True:
            ch = fin.read(1)
            if ch == b' ':
                break
            if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                word.append(ch)
        word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
        weights = fromstring(fin.read(binary_len), dtype=REAL)
        add_word(word, weights)
else:
    for line_no, line in enumerate(fin):
        parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
        if len(parts) != vector_size + 1:
            raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
        word, weights = parts[0], list(map(REAL, parts[1:]))
        add_word(word, weights)

if result.syn0.shape[0] != len(result.vocab):
    logger.info(
        "duplicate words detected, shrinking matrix size from %i to %i",
        result.syn0.shape[0], len(result.vocab)
    )
    result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
assert (len(result.vocab), result.vector_size) == result.syn0.shape

logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
return result
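For reference, a hedged usage sketch of the loader excerpted above, assuming the pre-KeyedVectors gensim API in which Word2Vec.load_word2vec_format exists; the file name is a placeholder:

from gensim.models import Word2Vec

# Load C-tool binary vectors; unicode_errors='strict' mirrors the errors= handling above.
model = Word2Vec.load_word2vec_format('vectors.bin', binary=True,
                                      encoding='utf8', unicode_errors='strict')
print(model.syn0.shape)   # (vocab_size, vector_size)
print(model['king'][:5])  # first few dimensions of one word vector, assuming 'king' is in the vocab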
`binary` is a boolean indicating whether the data is in binary word2vec format.
`norm_only` is a boolean indicating whether to store only normalised word2vec vectors in memory.
Word counts are read from the `fvocab` file, if set (this is the file generated
by the `-save-vocab` flag of the original C tool).
If you trained the C model using a non-UTF-8 encoding for words, specify that
encoding in `encoding`.
"""
counts = None
if fvocab is not None:
    logger.info("loading word counts from %s" % (fvocab))
    counts = {}
    with utils.smart_open(fvocab) as fin:
        for line in fin:
            word, count = utils.to_unicode(line).strip().split()
            counts[word] = int(count)

logger.info("loading projection weights from %s" % (fname))
with utils.smart_open(fname) as fin:
    header = utils.to_unicode(fin.readline(), encoding=encoding)
    vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
    result = Word2Vec(size=vector_size)
    result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
    if binary:
        binary_len = dtype(REAL).itemsize * vector_size
        for line_no in xrange(vocab_size):
            # mixed text and binary: read text first, then binary
            word = []
            while True:
                ch = fin.read(1)
                if ch == b' ':
                    break
Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
Otherwise (default False), these pairs are skipped entirely.
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

similarity_gold = []
similarity_model = []
oov = 0

original_vocab = self.vocab
self.vocab = ok_vocab

for line_no, line in enumerate(utils.smart_open(pairs)):
    line = utils.to_unicode(line)
    if line.startswith('#'):
        # May be a comment
        continue
    else:
        try:
            if case_insensitive:
                a, b, sim = [word.upper() for word in line.split(delimiter)]
            else:
                a, b, sim = [word for word in line.split(delimiter)]
            sim = float(sim)
        except ValueError:
            logger.info('skipping invalid line #%d in %s', line_no, pairs)
            continue
        if a not in ok_vocab or b not in ok_vocab:
            oov += 1
            if dummy4unknown:
                # record a zero similarity for this out-of-vocabulary pair
                similarity_model.append(0.0)
                similarity_gold.append(sim)
                continue
            else:
                # skip pairs with out-of-vocabulary words entirely
                continue
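To show how these accumulators are typically consumed, a small hedged sketch of calling the evaluation, assuming a model exposing evaluate_word_pairs and a tab-separated pairs file (the path is a placeholder):

# Returns Pearson and Spearman correlations (each as (statistic, p-value)) plus the OOV ratio.
pearson, spearman, oov_ratio = model.evaluate_word_pairs(
    'wordsim353.tsv', delimiter='\t', restrict_vocab=300000,
    case_insensitive=True, dummy4unknown=False)
print('Pearson r=%.3f, Spearman rho=%.3f, OOV %.1f%%' % (pearson[0], spearman[0], oov_ratio))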
`binary` is a boolean indicating whether the data is in binary word2vec format.
`norm_only` is a boolean indicating whether to store only normalised word2vec vectors in memory.
Word counts are read from the `fvocab` file, if set (this is the file generated
by the `-save-vocab` flag of the original C tool).
If you trained the C model using a non-UTF-8 encoding for words, specify that
encoding in `encoding`.
"""
counts = None
if fvocab is not None:
    logger.info("loading word counts from %s", fvocab)
    counts = {}
    with utils.smart_open(fvocab) as fin:
        for line in fin:
            word, count = utils.to_unicode(line).strip().split()
            counts[word] = int(count)

logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
    header = utils.to_unicode(fin.readline(), encoding=encoding)
    vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
    result = cls(size=vector_size)
    result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)

    def add_word(word, weights):
        word_id = len(result.vocab)
        if word in result.vocab:
            logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
            return
        if counts is None:
            # most common scenario: no vocab file given. just make up some bogus counts, in descending order
            result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
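A hedged sketch of invoking this classmethod with a `-save-vocab` counts file, assuming the pre-KeyedVectors Word2Vec API; the file names and the probed word are placeholders:

model = Word2Vec.load_word2vec_format(
    'vectors.txt', fvocab='vectors.vocab', binary=False,
    encoding='utf8', unicode_errors='replace')
# With fvocab supplied, each Vocab entry carries its real corpus count rather than a made-up one.
print(model.vocab['the'].count)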
def strip_short(sentence, minsize=3):
    '''
    Split the sentence into words and re-join only those
    at least `minsize` characters long.
    '''
    sentence = utils.to_unicode(sentence)
    return " ".join(e for e in sentence.split() if len(e) >= minsize)
def __init__(self, basedir, k):
    self.basedir = basedir
    self.k = k
    # 500-dimensional Annoy index using angular (cosine) distance
    self.index_annoy = annoy.AnnoyIndex(500, metric='angular')
    self.index_annoy.load(os.path.join(basedir, 'index3507620_annoy_100'))
    self.id2title = gensim.utils.unpickle(os.path.join(basedir, 'id2title'))
    self.title2id = dict(
        (gensim.utils.to_unicode(title).lower(), pos)
        for pos, title in enumerate(self.id2title)
    )
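A hedged sketch of a query method that could sit next to this constructor, assuming the Annoy index was built from the same 500-dimensional document vectors; the method name most_similar_titles is hypothetical:

def most_similar_titles(self, title):
    # Resolve the (lower-cased) title to its position, then ask Annoy for the k nearest items.
    pos = self.title2id[gensim.utils.to_unicode(title).lower()]
    neighbour_ids = self.index_annoy.get_nns_by_item(pos, self.k)
    return [self.id2title[i] for i in neighbour_ids]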
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    if lower:
        # Turkish-aware lowercasing (handles dotted/dotless I: 'I' -> 'ı', 'İ' -> 'i')
        lowerMap = {
            ord(u'A'): u'a', ord(u'B'): u'b', ord(u'C'): u'c', ord(u'Ç'): u'ç', ord(u'D'): u'd', ord(u'E'): u'e',
            ord(u'F'): u'f', ord(u'G'): u'g', ord(u'Ğ'): u'ğ', ord(u'H'): u'h', ord(u'I'): u'ı', ord(u'İ'): u'i',
            ord(u'J'): u'j', ord(u'K'): u'k', ord(u'L'): u'l', ord(u'M'): u'm', ord(u'N'): u'n', ord(u'O'): u'o',
            ord(u'Ö'): u'ö', ord(u'P'): u'p', ord(u'R'): u'r', ord(u'S'): u's', ord(u'Ş'): u'ş', ord(u'T'): u't',
            ord(u'U'): u'u', ord(u'Ü'): u'ü', ord(u'V'): u'v', ord(u'Y'): u'y', ord(u'Z'): u'z',
        }
        content = content.translate(lowerMap)
    return [
        utils.to_unicode(token) for token in utils.tokenize(content, lower=False, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
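An illustrative call (the input string is made up); the explicit mapping keeps Turkish dotted/dotless i correct where a plain str.lower() would not:

tokens = tokenize_tr(u"İstanbul Boğaziçi Köprüsü")
# expected, given the mapping above: [u'istanbul', u'boğaziçi', u'köprüsü']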
fname_base, _ = path.splitext(fname)
fname_dir = path.dirname(fname)
for fname_vocab in [
    utils.smart_extension(fname, '.vocab'),
    utils.smart_extension(fname, '/vocab.txt'),
    utils.smart_extension(fname_base, '.vocab'),
    utils.smart_extension(fname_dir, '/vocab.txt'),
]:
    if path.exists(fname_vocab):
        break
else:
    raise IOError('BleiCorpus: could not find vocabulary file')

self.fname = fname
with utils.open(fname_vocab, 'rb') as fin:
    words = [utils.to_unicode(word).rstrip() for word in fin]
self.id2word = dict(enumerate(words))
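For context, a hedged sketch of loading such a corpus through gensim's BleiCorpus, which is where the vocabulary discovery above runs; the file path is a placeholder:

from gensim.corpora import BleiCorpus

corpus = BleiCorpus('lda-c/trainset.dat')  # looks for trainset.dat.vocab or a nearby vocab.txt, as above
for doc in corpus:
    # each document is a list of (word_id, count) pairs; map ids back through id2word
    print([(corpus.id2word[word_id], count) for word_id, count in doc][:5])
    break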
if hasattr(phrase_class, 'vocab'):
    scorer = ft.partial(
        phrase_class.scoring,
        len_vocab=float(len(phrase_class.vocab)),
        min_count=float(phrase_class.min_count),
        corpus_word_count=float(phrase_class.corpus_word_count))
else:
    scorer = None

bigrams = phrase_class.analyze_sentence(sentence, threshold=phrase_class.threshold,
                                         common_terms=phrase_class.common_terms, scorer=scorer)
new_s = []
for words, score in bigrams:
    if score is not None:
        # the token pair scored as a phrase: glue it together with the delimiter
        words = delimiter.join(words)
    new_s.append(words)
return [utils.to_unicode(w) for w in new_s]
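Finally, a brief usage sketch of the phrase detection this helper supports, assuming a gensim 3.x-style Phrases/Phraser API; the sentences are toy data:

from gensim.models.phrases import Phrases, Phraser

sentences = [["new", "york", "is", "big"], ["new", "york", "taxis"], ["i", "like", "new", "york"]]
phrases = Phrases(sentences, min_count=1, threshold=1)
bigram = Phraser(phrases)
print(bigram[["i", "like", "new", "york"]])  # e.g. ['i', 'like', 'new_york'] once the pair scores above threshold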