How to use the gensim.utils module in gensim

To help you get started, we've selected a few gensim.utils examples, based on popular ways it is used in public projects.

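The examples below lean on a small set of gensim.utils helpers: to_unicode / any2unicode for decoding raw bytes, smart_open / open for reading (possibly compressed) files, simple_preprocess and tokenize for quick tokenization, and the RULE_* constants for vocabulary trimming. As a quick orientation, here is a minimal sketch of those basics; the sample string and the commented-out file name are made up for illustration.

from gensim import utils

# decode bytes (or pass a str through unchanged)
text = utils.to_unicode(b"Human machine interface for lab abc computer applications")

# quick tokenization: lowercased, optionally de-accented, length-filtered tokens
tokens = utils.simple_preprocess(text, deacc=True, min_len=2, max_len=15)
print(tokens)

# utils.open (utils.smart_open in older gensim releases) transparently handles
# plain and .gz/.bz2 files; 'corpus.txt.gz' is a hypothetical file name
# with utils.open('corpus.txt.gz', 'rb') as fin:
#     for line in fin:
#         print(utils.to_unicode(line).split())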

github HKUST-KnowComp / MNE / MNE.py (View on Github)
def __iter__(self):
        """Iterate through the lines in the source."""
        try:
            # Assume it is a file-like object and try treating it as such
            # Things that don't have seek will trigger an exception
            self.source.seek(0)
            for line in itertools.islice(self.source, self.limit):
                line = utils.to_unicode(line).split()
                i = 0
                while i < len(line):
                    yield line[i : i + self.max_sentence_length]
                    i += self.max_sentence_length
        except AttributeError:
            # If it didn't work like a file, use it as a string filename
            with utils.smart_open(self.source) as fin:
                for line in itertools.islice(fin, self.limit):
                    line = utils.to_unicode(line).split()
                    i = 0
                    while i < len(line):
                        yield line[i : i + self.max_sentence_length]
                        i += self.max_sentence_length
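This iterator is essentially gensim's own LineSentence reader: utils.to_unicode decodes each raw line, and utils.smart_open is used only when the source turns out to be a filename rather than an already-open file object. A sketch of how such a reader is typically consumed, here via the LineSentence class that ships with gensim; the tiny demo file and its contents are invented, and vector_size is the gensim 4 name for the older size parameter.

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# write a tiny whitespace-tokenized, one-sentence-per-line file just for the demo
with open('corpus.txt', 'w') as fout:
    fout.write("human machine interface for lab abc computer applications\n")
    fout.write("a survey of user opinion of computer system response time\n")

sentences = LineSentence('corpus.txt', max_sentence_length=10000, limit=None)
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1)
print(len(model.wv))  # number of words that made it into the vocabulary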
github RaRe-Technologies / gensim / gensim / models / deprecated / word2vec.py (View on Github)
def __iter__(self):
        for fname in os.listdir(self.dirname):
            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            with utils.open(fname, 'rb') as fin:
                for line in fin:
                    line = utils.to_unicode(line)
                    # each file line is a single sentence in the Brown corpus
                    # each token is WORD/POS_TAG
                    token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
                    # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
                    if not words:  # don't bother sending out empty sentences
                        continue
                    yield words
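The class around this __iter__ is gensim's BrownCorpus reader: utils.open yields raw bytes, utils.to_unicode decodes them, and each sentence becomes a list of lowercased word/POS tokens such as 'the/at'. A sketch of feeding it to Word2Vec; the corpus directory is a placeholder for an extracted Brown corpus (for example nltk_data/corpora/brown).

from gensim.models import Word2Vec
from gensim.models.word2vec import BrownCorpus

brown_dir = '/path/to/nltk_data/corpora/brown'  # placeholder: directory of Brown corpus files
model = Word2Vec(BrownCorpus(brown_dir), vector_size=100, min_count=5)  # vocabulary keys look like 'the/at'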
github uwdata / termite-data-server / server_src / modules / modellers / GensimLDA.py (View on Github)
        total_docs = 0
        if os.path.isdir(self.input):
            # collect files one and two levels below the input directory
            filenames = glob.glob('{}/*'.format(self.input))
            for filename in filenames:
                if os.path.isdir(filename):
                    filenames += glob.glob('{}/*'.format(filename))
            for filename in filenames:
                if not os.path.isdir(filename):
                    with utils.smart_open(filename) as f:
                        docId = filename
                        docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
                        tokens = self.tokenRegex.findall(docContent)
                        tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                        yield tokens
                        self.docIds.append(docId)
                        total_docs += 1
        else:
            # otherwise treat the input as a single file of tab-separated (docId, docContent) lines
            with utils.smart_open(self.input) as f:
                for line in f:
                    docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                    tokens = self.tokenRegex.findall(docContent)
                    tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                    yield tokens
                    self.docIds.append(docId)
                    total_docs += 1
        self.length = total_docs
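utils.smart_open again opens plain or compressed files uniformly so the reader can stream one token list per document. A streaming corpus like this is usually consumed in two passes, one to build a Dictionary and one to feed bag-of-words vectors into a topic model; a minimal sketch of that pattern, where the in-memory token_lists stands in for an iterator like the one above.

from gensim import corpora, models

token_lists = [
    ['human', 'computer', 'interface'],
    ['graph', 'trees'],
    ['graph', 'minors', 'survey'],
]  # placeholder for a streamed corpus of tokenized documents

dictionary = corpora.Dictionary(token_lists)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=2, passes=5)
print(lda.print_topics())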
github rwong / paragraph2vec / par2vec / models / word2vec.py (View on Github)
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_word_count += trained_words  # only words in vocab & sampled
            raw_word_count += raw_words

            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                if total_examples:
                    # examples-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
                        100.0 * example_count / total_examples, trained_word_count / elapsed,
                        utils.qsize(job_queue), utils.qsize(progress_queue))
                else:
                    # words-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
                        100.0 * raw_word_count / total_words, trained_word_count / elapsed,
                        utils.qsize(job_queue), utils.qsize(progress_queue))
                next_report = elapsed + report_delay

        # all done; report the final stats
        elapsed = default_timer() - start
        logger.info(
            "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
            raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed)
        if job_tally < 10 * self.workers:
            logger.warn("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")
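The only gensim.utils call here is utils.qsize, a small portability wrapper: it reports a queue's approximate size where qsize() is supported and returns -1 where it is not (multiprocessing queues on macOS, for example), so the progress log never raises. A tiny sketch:

from queue import Queue
from gensim import utils

job_queue = Queue(maxsize=8)
job_queue.put(('dummy', 'job'))
print(utils.qsize(job_queue))  # approximate number of queued items, or -1 if unsupported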
github RaRe-Technologies / gensim / gensim / models / deprecated / word2vec.py (View on Github)

        if total_words is None and total_examples is None:
            raise ValueError(
                "You must specify either total_examples or total_words, for proper alpha and progress calculations. "
                "The usual value is total_examples=model.corpus_count."
            )
        if epochs is None:
            raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.")
        start_alpha = start_alpha or self.alpha
        end_alpha = end_alpha or self.min_alpha

        job_tally = 0

        if epochs > 1:
            sentences = utils.RepeatCorpusNTimes(sentences, epochs)
            total_words = total_words and total_words * epochs
            total_examples = total_examples and total_examples * epochs

        def worker_loop():
            """Train the model, lifting lists of sentences from the job_queue."""
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                sentences, alpha = job
                tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
                progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
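Two utils helpers do quiet work in this training setup: utils.RepeatCorpusNTimes turns a single-pass corpus into an epochs-long stream, and (in the reporting code shown earlier) utils.qsize keeps the progress log portable. A small sketch of the corpus repetition, as available in the gensim versions this deprecated module targets:

from gensim import utils

corpus = [['first', 'sentence'], ['second', 'sentence']]
repeated = utils.RepeatCorpusNTimes(corpus, 3)
print(sum(1 for _ in repeated))  # 6: the two sentences streamed three times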
github kethort / TwitterLDATopicModeling / src / create_LDA_model.py (View on Github)
def preprocess_text(lemma, document):
    with open(document, 'r') as infile:
        # transform document into one string
        text = ' '.join(line.rstrip('\n') for line in infile)
    # convert string into unicode
    text = gensim.utils.any2unicode(text)

    # remove URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)

    # remove symbols, keeping @, # and whitespace
    text = re.sub(r'[^\w@#\s]', '', text)
    
    # use the built-in Gensim lemmatize engine 
    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)

    # tokenize words using NLTK Twitter Tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)

    # lowercase, remove words less than len 2 & remove numbers in tokenized list
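gensim.utils.any2unicode is the same function as utils.to_unicode, and utils.lemmatize (which relied on the external pattern package and was removed in gensim 4.0) returns byte tokens of the form b'word/NN'. A hedged sketch of the non-lemmatizing path, with an invented tweet as input and the truncated final filtering step only approximated:

import re
import gensim
from nltk.tokenize import TweetTokenizer

raw = b"Check out https://example.com #gensim @user, it is great!!"
text = gensim.utils.any2unicode(raw)                 # same as gensim.utils.to_unicode
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)  # strip URLs
text = re.sub(r'[^\w@#\s]', '', text)                # keep word chars, @, # and whitespace
tokens = TweetTokenizer().tokenize(text)
# approximation of the truncated step above: lowercase, drop very short tokens and numbers
tokens = [t.lower() for t in tokens if len(t) >= 2 and not t.isdigit()]
print(tokens)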
github valedica / twec / model.py (View on Github)
def my_rule(word, count, min_count):
    if word in gvocab:
        return utils.RULE_KEEP
    else:
        return utils.RULE_DISCARD
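utils.RULE_KEEP, utils.RULE_DISCARD and utils.RULE_DEFAULT are the values gensim expects from a trim_rule callback, which Word2Vec consults while building its vocabulary. A sketch of plugging a rule like this one in; the gvocab whitelist and the toy sentences are invented, and vector_size / wv.key_to_index are the gensim 4 names for what older releases call size and wv.vocab.

from gensim.models import Word2Vec
from gensim import utils

gvocab = {'human', 'computer', 'interface'}  # hypothetical whitelist

def my_rule(word, count, min_count):
    # keep whitelisted words regardless of frequency, discard everything else
    return utils.RULE_KEEP if word in gvocab else utils.RULE_DISCARD

sentences = [['human', 'computer', 'interface'], ['computer', 'survey', 'user']]
model = Word2Vec(sentences, min_count=1, vector_size=50, trim_rule=my_rule)
print(sorted(model.wv.key_to_index))  # only the whitelisted words survive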
github piskvorky / sim-shootout / prepare_shootout.py (View on Github)
def process_article((title, text)):
    """Parse a wikipedia article, returning its content as `(title, list of tokens)`, all utf8."""
    text = gensim.corpora.wikicorpus.filter_wiki(text)  # remove markup, get plain text
    return title.encode('utf8').replace('\t', ' '), gensim.utils.simple_preprocess(text)
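utils.simple_preprocess lowercases, tokenizes and length-filters text in a single call, which is why it pairs naturally with corpora.wikicorpus.filter_wiki for stripping Wiki markup first. A quick sketch with an invented markup string:

from gensim import utils
from gensim.corpora import wikicorpus

markup = "'''Anarchism''' is a [[political philosophy]] and [[social movement]]."
plain = wikicorpus.filter_wiki(markup)               # drop wiki markup, keep plain text
tokens = utils.simple_preprocess(plain, deacc=True)  # lowercased tokens, 2-15 characters long
print(tokens)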
github RaRe-Technologies / gensim / acme / lee-wiki-streamParsing2008-cosines.py (View on Github)
        f.close()
    except IOError as e:
        logging.debug(e)
    logging.debug("done loading pickled tfidf matrix %s" % workingcorpus + tfidfCpickleExtension)
    logging.info("finished running %s" % program)
#    print raw_input('holding here to see memory usage ') # todo measure time and memory


# / ------------------------------- /

# -2- load queries, that is, all docs in the lee dataset, and process them
#(remove stopwords etc)
leePath = basepath + '/data/corpora/lee/'
leeCorpusName = 'lee'
queries_filename = ( leePath + leeCorpusName + '.cor')
rawLeeTexts = utils.get_txt(queries_filename)

stoplist = utils.get_txt(basepath+"data/stoplistStoneDennisKwantes.txt")

DEFAULT_FILTERS = [ stem_text, strip_punctuation, remove_stopwords ]  # todo add these filters to log
preprocLeeTexts = preprocess_documents(rawLeeTexts)

# create a bow for each lee text. We need a Dictionary object first
# note that using DictionaryExistingCounts serves as a container for  word2id, id2word
# and also it takes cares of the bow conversion.
# todo: serializing this object would work  as well as having the '_wordids.txt' for ids
dictionary = DictionaryExistingCounts( word2id, id2word )

bow_queries_tfidf = [dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False) for text in preprocLeeTexts]

# ^^here the ids are  the ones in the larger corpus, because we use dictionary.
# note that these are raw freqs. Which is what we want. To keep them
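DictionaryExistingCounts and utils.get_txt come from this old experimental script rather than from current gensim; today the same bag-of-words step goes through corpora.Dictionary (whose doc2bow takes allow_update / return_missing keyword arguments), with the filters applied via gensim.parsing.preprocessing. A rough present-day equivalent, with placeholder documents standing in for the Lee corpus:

from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents

raw_lee_texts = [
    "First placeholder news document about severe storms and flooding.",
    "Second placeholder document about an upcoming election campaign.",
]  # stand-ins for the documents read from lee.cor

preproc_lee_texts = preprocess_documents(raw_lee_texts)  # strip punctuation, remove stopwords, stem, etc.
dictionary = corpora.Dictionary(preproc_lee_texts)
bow_queries = [dictionary.doc2bow(text) for text in preproc_lee_texts]
print(bow_queries[0])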