def __iter__(self):
    """Iterate through the lines in the source."""
    try:
        # Assume it is a file-like object and try treating it as such.
        # Things that don't have seek will trigger an exception.
        self.source.seek(0)
        for line in itertools.islice(self.source, self.limit):
            line = utils.to_unicode(line).split()
            i = 0
            while i < len(line):
                yield line[i: i + self.max_sentence_length]
                i += self.max_sentence_length
    except AttributeError:
        # If it didn't work like a file, use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for line in itertools.islice(fin, self.limit):
                line = utils.to_unicode(line).split()
                i = 0
                while i < len(line):
                    yield line[i: i + self.max_sentence_length]
                    i += self.max_sentence_length
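# Usage sketch (not part of the original snippet): the __iter__ above mirrors gensim's
# LineSentence, which streams whitespace-tokenized sentences from a file-like object or
# filename so the corpus never has to fit in memory. 'corpus.txt' is a hypothetical
# one-sentence-per-line file.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('corpus.txt')
# gensim >= 4.0 calls this parameter `vector_size`; the older API these snippets target uses `size`.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)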
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        with utils.open(fname, 'rb') as fin:
            for line in fin:
                line = utils.to_unicode(line)
                # each file line is a single sentence in the Brown corpus
                # each token is WORD/POS_TAG
                token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
                # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
                words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
                if not words:  # don't bother sending out empty sentences
                    continue
                yield words
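# Usage sketch (assumption): the __iter__ above matches gensim's BrownCorpus reader, which walks
# a directory of Brown corpus files and yields sentences of lowercase "word/TAG" tokens,
# e.g. ['the/at', 'fulton/np', 'county/nn', ...]. '/path/to/brown' is a hypothetical path.
from gensim.models.word2vec import BrownCorpus

for sentence in BrownCorpus('/path/to/brown'):
    print(sentence[:5])
    break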
total_docs = 0
if os.path.isdir(self.input):
    # the input is a directory: walk it (one level of subdirectories) and treat every file as a document
    filenames = glob.glob('{}/*'.format(self.input))
    for filename in filenames:
        if os.path.isdir(filename):
            filenames += glob.glob('{}/*'.format(filename))
    for filename in filenames:
        if not os.path.isdir(filename):
            with utils.smart_open(filename) as f:
                docId = filename
                docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
                tokens = self.tokenRegex.findall(docContent)
                tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                yield tokens
                self.docIds.append(docId)
                total_docs += 1
else:
    # the input is a single tab-separated file: one "docId<TAB>content" line per document
    with utils.smart_open(self.input) as f:
        for line in f:
            docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
            tokens = self.tokenRegex.findall(docContent)
            tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
            yield tokens
            self.docIds.append(docId)
            total_docs += 1
self.length = total_docs
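# Context sketch (assumption, not from the original source): the fragment above reads like the
# body of an __iter__ on a corpus class that records document ids while yielding tokens.
# A minimal container with the attributes the fragment relies on could look like this;
# the class name and token pattern are hypothetical.
import re
from gensim.parsing.preprocessing import STOPWORDS

class StreamingDocs(object):
    def __init__(self, input_path, token_pattern=r'\w{3,}'):
        self.input = input_path                        # directory of files, or a single "docId<TAB>content" file
        self.tokenRegex = re.compile(token_pattern, re.UNICODE)
        self.docIds = []
        self.length = None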
job_tally += 1

# update progress stats
example_count += examples
trained_word_count += trained_words  # only words in vocab & sampled
raw_word_count += raw_words

# log progress once every report_delay seconds
elapsed = default_timer() - start
if elapsed >= next_report:
    if total_examples:
        # examples-based progress %
        logger.info(
            "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
            100.0 * example_count / total_examples, trained_word_count / elapsed,
            utils.qsize(job_queue), utils.qsize(progress_queue))
    else:
        # words-based progress %
        logger.info(
            "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
            100.0 * raw_word_count / total_words, trained_word_count / elapsed,
            utils.qsize(job_queue), utils.qsize(progress_queue))
    next_report = elapsed + report_delay

# all done; report the final stats
elapsed = default_timer() - start
logger.info(
    "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
    raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed)
if job_tally < 10 * self.workers:
    logger.warn(
        "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
    )
if total_words is None and total_examples is None:
    raise ValueError(
        "You must specify either total_examples or total_words, for proper alpha and progress calculations. "
        "The usual value is total_examples=model.corpus_count."
    )
if epochs is None:
    raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.iter.")
start_alpha = start_alpha or self.alpha
end_alpha = end_alpha or self.min_alpha

job_tally = 0

if epochs > 1:
    sentences = utils.RepeatCorpusNTimes(sentences, epochs)
    total_words = total_words and total_words * epochs
    total_examples = total_examples and total_examples * epochs
def worker_loop():
    """Train the model, lifting lists of sentences from the job_queue."""
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    jobs_processed = 0
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        sentences, alpha = job
        tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
        progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
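# Wiring sketch (assumption): worker_loop above is the thread target of gensim-style multithreaded
# training, defined inside the training method so that `self`, `job_queue` and `progress_queue`
# are in scope. Jobs are (sentences, alpha) tuples pushed onto job_queue; each worker reports
# (num_sentences, effective_words, raw_words) on progress_queue and stops at the None sentinel.
# The queue sizes and the worker count (4) below are illustrative.
import threading
from queue import Queue   # the Python 2-era code these snippets come from would use `from Queue import Queue`

job_queue = Queue(maxsize=2 * 4)
progress_queue = Queue(maxsize=2 * 4)
workers = [threading.Thread(target=worker_loop) for _ in range(4)]
for thread in workers:
    thread.daemon = True   # let the process exit even if a worker is still blocked on the queue
    thread.start()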
def preprocess_text(lemma, document):
    with open(document, 'r') as infile:
        # transform the document into one string
        text = ' '.join(line.rstrip('\n') for line in infile)
    # convert the string into unicode
    text = gensim.utils.any2unicode(text)
    # remove URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    # remove symbols except @, # and whitespace
    text = re.sub(r'[^\w@#\s]', '', text)
    # use the built-in Gensim lemmatize engine
    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)
    # tokenize words using the NLTK Twitter Tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)
    # lowercase, remove words less than len 2 & remove numbers in tokenized list
    return [token.lower() for token in text if len(token) > 1 and not token.isdigit()]
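# Usage sketch: preprocess_text above turns a raw tweet file into tokens; with lemma=True it
# relies on gensim's pattern-based utils.lemmatize (removed in gensim 4.x), otherwise on NLTK's
# TweetTokenizer. 'tweets.txt' and `ignore_words` are hypothetical.
tokens = preprocess_text(lemma=False, document='tweets.txt')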
def my_rule(word, count, min_count):
    if word in gvocab:
        return utils.RULE_KEEP
    else:
        return utils.RULE_DISCARD
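# Usage sketch: a trim_rule like my_rule can be handed to Word2Vec so that only words present in a
# predefined vocabulary (the global `gvocab` assumed above) survive vocabulary building, regardless
# of frequency. The toy corpus and vocabulary below are hypothetical; `utils` above is assumed to
# be gensim.utils (home of RULE_KEEP / RULE_DISCARD).
from gensim import utils
from gensim.models import Word2Vec

gvocab = {'apple', 'banana'}
toy_sentences = [['apple', 'pie'], ['banana', 'split']]
model = Word2Vec(toy_sentences, min_count=1, trim_rule=my_rule)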
def process_article(title_text):
    """Parse a wikipedia article, returning its content as `(title, list of tokens)`, all utf8."""
    title, text = title_text
    text = gensim.corpora.wikicorpus.filter_wiki(text)  # remove markup, get plain text
    return title.replace('\t', ' ').encode('utf8'), gensim.utils.simple_preprocess(text)
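# Application sketch (assumption): process_article is meant to be mapped over (title, text) pairs
# pulled from a Wikipedia dump, for example via gensim.corpora.wikicorpus.extract_pages, which in
# recent gensim versions yields (title, text, pageid) triples. The dump path is hypothetical.
import bz2
from gensim.corpora.wikicorpus import extract_pages

with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as dump:
    for title, text, pageid in extract_pages(dump):
        title_utf8, tokens = process_article((title, text))
        break   # process just the first article in this sketch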
    f.close()
except IOError as e:
    logging.debug(e)

logging.debug("done loading pickled tfidf matrix %s" % (workingcorpus + tfidfCpickleExtension))
logging.info("finished running %s" % program)
# print raw_input('holding here to see memory usage ')  # todo: measure time and memory

# / ------------------------------- /
# -2- load queries, that is, all docs in the lee dataset, and process them
# (remove stopwords etc)
leePath = basepath + '/data/corpora/lee/'
leeCorpusName = 'lee'
queries_filename = leePath + leeCorpusName + '.cor'
rawLeeTexts = utils.get_txt(queries_filename)
stoplist = utils.get_txt(basepath + "data/stoplistStoneDennisKwantes.txt")
DEFAULT_FILTERS = [stem_text, strip_punctuation, remove_stopwords]  # todo: add these filters to the log
preprocLeeTexts = preprocess_documents(rawLeeTexts)

# create a bow for each lee text; we need a Dictionary object first.
# note that DictionaryExistingCounts serves as a container for word2id and id2word,
# and it also takes care of the bow conversion.
# todo: serializing this object would work as well as having the '_wordids.txt' for ids
dictionary = DictionaryExistingCounts(word2id, id2word)
bow_queries_tfidf = [
    dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
    for text in preprocLeeTexts
]
# ^^ here the ids are the ones in the larger corpus, because we use `dictionary`.
# note that these are raw frequencies, which is what we want to keep.
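# Equivalent sketch with a stock gensim Dictionary (assumption: DictionaryExistingCounts is a
# project-specific wrapper around an existing word2id/id2word mapping; gensim's own Dictionary
# exposes the same conversion as doc2bow(text, allow_update=False, return_missing=False)).
from gensim.corpora import Dictionary

stock_dictionary = Dictionary(preprocLeeTexts)   # or a Dictionary loaded from the larger corpus
bow_queries = [stock_dictionary.doc2bow(text) for text in preprocLeeTexts]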