How to use the gensim.utils.to_unicode function in gensim

To help you get started, we’ve selected a few gensim.utils.to_unicode examples, based on popular ways it is used in public projects. to_unicode (an alias of gensim.utils.any2unicode) decodes a bytestring into a unicode string, using UTF-8 by default, and returns input that is already unicode unchanged.
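
A minimal illustration of that behavior; the byte literal is made up:

from gensim import utils

utils.to_unicode(b'caf\xc3\xa9')    # bytes are decoded as UTF-8 by default -> 'café'
utils.to_unicode('already text')    # unicode input is returned unchanged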

github Coder-Yu / OpinionSpam / word_embedding.py
def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix]))
        return self.sentences
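
The same pattern in a self-contained form: read raw byte lines, decode each one with to_unicode, then split into tokens. The file name and tag prefix are hypothetical, and TaggedDocument is the current gensim name for LabeledSentence:

from gensim import utils
from gensim.models.doc2vec import TaggedDocument   # current name for LabeledSentence

sentences = []
with open('reviews_positive.txt', 'rb') as fin:     # hypothetical source file
    for item_no, line in enumerate(fin):
        words = utils.to_unicode(line).split()      # decode bytes, then split on whitespace
        sentences.append(TaggedDocument(words, ['POS_%d' % item_no]))
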
github Naresh1318 / DiagnosisPredictor / lib / chao_word2vec / word2vec.py
binary_len = dtype(REAL).itemsize * vector_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                    word, weights = parts[0], list(map(REAL, parts[1:]))
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            logger.info(
                "duplicate words detected, shrinking matrix size from %i to %i",
                result.syn0.shape[0], len(result.vocab)
            )
            result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), result.vector_size) == result.syn0.shape

        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result
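
A stripped-down sketch of the text-format branch above, assuming a hypothetical vectors.txt in word2vec's text format (a header line with the counts, then one word followed by its weights per line):

import numpy as np
from gensim import utils

with open('vectors.txt', 'rb') as fin:                      # hypothetical file
    header = utils.to_unicode(fin.readline())
    vocab_size, vector_size = map(int, header.split())      # e.g. "2 100"
    for line_no, line in enumerate(fin):
        parts = utils.to_unicode(line.rstrip()).split(" ")
        if len(parts) != vector_size + 1:
            raise ValueError("invalid vector on line %s" % line_no)
        word, weights = parts[0], np.array(parts[1:], dtype=np.float32)
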
github uhh-lt / sensegram / word2vec_utils / utils.py
`binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
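
As the docstring points out, words from a model trained with a non-UTF-8 locale need an explicit encoding (or a lenient errors handler) when decoded. A small illustration with a made-up latin-1 byte string:

from gensim import utils

raw = b'M\xfcnchen'                               # 'München' encoded as latin-1
utils.to_unicode(raw, encoding='latin-1')         # -> 'München'
utils.to_unicode(raw, errors='replace')           # UTF-8 decoding fails softly -> 'M\ufffdnchen'
utils.to_unicode(raw, errors='ignore')            # -> 'Mnchen'
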
github akb89 / word2vec / gensim / models / keyedvectors.py
Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except:
                    logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
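
The decode-then-parse pattern from the excerpt, reduced to a standalone loop over a hypothetical tab-delimited similarity file (word1, word2, gold score) with '#' comment lines skipped:

from gensim import utils

pairs = []
with open('wordsim353.tsv', 'rb') as fin:          # hypothetical evaluation file
    for line_no, line in enumerate(fin):
        line = utils.to_unicode(line)
        if line.startswith('#'):
            continue                               # comment line
        try:
            a, b, sim = line.split('\t')
            pairs.append((a, b, float(sim)))
        except ValueError:
            print('skipping invalid line #%d' % line_no)
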
github rwong / paragraph2vec / par2vec / models / word2vec.py
`binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).

        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.

        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s", fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s", fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
            result = cls(size=vector_size)
            result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)

            def add_word(word, weights):
                word_id = len(result.vocab)
                if word in result.vocab:
                    logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up some bogus counts, in descending order
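
Several of the excerpts above read the binary word2vec format one word at a time: accumulate bytes up to the first space, then hand the byte string to to_unicode. The same read loop against an in-memory stream with made-up content:

import io
from gensim import utils

fin = io.BytesIO(b'k\xc3\xb6nig 0.1 0.2\n')   # made-up binary-style record: word, then payload

word = []
while True:
    ch = fin.read(1)
    if ch == b' ':
        break
    if ch != b'\n':                   # ignore newlines in front of words
        word.append(ch)
word = utils.to_unicode(b''.join(word), errors='strict')
print(word)                           # -> 'könig'
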
github rohitsakala / semanticAnnotationAcmCategories / ACM / acm_preprocess.py
def strip_short(sentence, minsize=3):
    '''
        Split and Join words
        if size >= minsize
    '''
    sentence = utils.to_unicode(sentence)
    return " ".join(e for e in sentence.split() if len(e) >= minsize)
github piskvorky / sim-shootout / wikisim_server / runserver.py
def __init__(self, basedir, k):
        self.basedir = basedir
        self.k = k

        self.index_annoy = annoy.AnnoyIndex(500, metric='angular')
        self.index_annoy.load(os.path.join(basedir, 'index3507620_annoy_100'))
        self.id2title = gensim.utils.unpickle(os.path.join(basedir, 'id2title'))
        self.title2id = dict((gensim.utils.to_unicode(title).lower(), pos) for pos, title in enumerate(self.id2title))
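
The same lookup-table idea in isolation: decode every title, lowercase it, and map it back to its position. The titles here are made up:

import gensim

id2title = ['Python (programming language)', 'Machine learning']
title2id = dict((gensim.utils.to_unicode(title).lower(), pos) for pos, title in enumerate(id2title))
print(title2id['machine learning'])    # -> 1
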
github akoksal / Turkish-Word2Vec / preprocess.py
def tokenize_tr(content,token_min_len=2,token_max_len=50,lower=True):
	if lower:
		# Turkish-aware lowercasing: 'I' must map to dotless 'ı' and 'İ' to 'i'
		lowerMap = {
			ord(u'A'): u'a', ord(u'B'): u'b', ord(u'C'): u'c', ord(u'Ç'): u'ç', ord(u'D'): u'd',
			ord(u'E'): u'e', ord(u'F'): u'f', ord(u'G'): u'g', ord(u'Ğ'): u'ğ', ord(u'H'): u'h',
			ord(u'I'): u'ı', ord(u'İ'): u'i', ord(u'J'): u'j', ord(u'K'): u'k', ord(u'L'): u'l',
			ord(u'M'): u'm', ord(u'N'): u'n', ord(u'O'): u'o', ord(u'Ö'): u'ö', ord(u'P'): u'p',
			ord(u'R'): u'r', ord(u'S'): u's', ord(u'Ş'): u'ş', ord(u'T'): u't', ord(u'U'): u'u',
			ord(u'Ü'): u'ü', ord(u'V'): u'v', ord(u'Y'): u'y', ord(u'Z'): u'z',
		}
		content = content.translate(lowerMap)
	return [
	utils.to_unicode(token) for token in utils.tokenize(content, lower=False, errors='ignore')
	if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
	]
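
The explicit translation table exists because Python's built-in str.lower() is not locale-aware and mishandles the Turkish dotted/dotless I pair. A small illustration of that corner case (not part of the original file):

# 'I'.lower() gives 'i', but Turkish expects dotless 'ı'; 'İ'.lower() even yields
# 'i' plus a combining dot. Translating the special cases first avoids both problems.
lower_map = {ord('I'): 'ı', ord('İ'): 'i'}
print("IŞIK İZMİR".translate(lower_map).lower())   # -> 'ışık izmir'
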
github RaRe-Technologies / gensim / gensim / corpora / bleicorpus.py
fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                        utils.smart_extension(fname, '.vocab'),
                        utils.smart_extension(fname, '/vocab.txt'),
                        utils.smart_extension(fname_base, '.vocab'),
                        utils.smart_extension(fname_dir, '/vocab.txt'),
                        ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.open(fname_vocab, 'rb') as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
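
The vocabulary loading at the end of the excerpt, reduced to a runnable sketch with an in-memory stand-in for the .vocab file (one word per line; rstrip() drops the newline left after decoding):

import io
from gensim import utils

fin = io.BytesIO(b'topic\nmodel\ncorpus\n')              # stand-in for the vocab file
words = [utils.to_unicode(word).rstrip() for word in fin]
id2word = dict(enumerate(words))                         # {0: 'topic', 1: 'model', 2: 'corpus'}
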
github RaRe-Technologies / gensim / gensim / models / phrases.py
scorer = ft.partial(
            phrase_class.scoring,
            len_vocab=float(len(phrase_class.vocab)),
            min_count=float(phrase_class.min_count),
            corpus_word_count=float(phrase_class.corpus_word_count))
    else:
        scorer = None
    bigrams = phrase_class.analyze_sentence(sentence, threshold=phrase_class.threshold,
        common_terms=phrase_class.common_terms, scorer=scorer)

    new_s = []
    for words, score in bigrams:
        if score is not None:
            words = delimiter.join(words)
        new_s.append(words)
    return [utils.to_unicode(w) for w in new_s]
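
The excerpt runs its result through to_unicode so callers always get text back, whether the underlying tokens were bytes or unicode. A minimal sketch of that final step, with made-up analyze_sentence-style output (a detected phrase comes back as a token list plus a score, a plain word with score None):

from gensim import utils

bigrams = [([b'new', b'york'], 150.0), (b'city', None)]   # made-up scored output

delimiter = b'_'
new_s = []
for words, score in bigrams:
    if score is not None:
        words = delimiter.join(words)                     # join phrase parts -> b'new_york'
    new_s.append(words)

print([utils.to_unicode(w) for w in new_s])               # -> ['new_york', 'city']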