if not (n.is_bos() or n.is_eos()):
    part, word = n.feature.split(',', 1)  # part-of-speech is the first feature field
    if part == "名詞" or part == "動詞":  # keep only nouns (名詞) and verbs (動詞)
        word_list.append(n.surface)
# Append this text file's word list
txt_word_list.append(word_list)
# To build a bag of words, collect every distinct word and create a dictionary that assigns each a word ID
corpus_dic = corpora.Dictionary(txt_word_list)
# Convert each document's word list into a corpus (list of (word ID, word count) pairs)
corpus_list = [corpus_dic.doc2bow(word_in_text) for word_in_text in txt_word_list]
# Convert the corpus list into a sparse matrix (CSC format)
word_matrix = matutils.corpus2csc(corpus_list)
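# A minimal sketch (toy corpus, illustrative names) of what the snippet above produces:
# corpus2csc returns a scipy.sparse CSC matrix shaped (num_terms, num_docs), and
# matutils.Sparse2Corpus can turn it back into a streamed gensim corpus.
from gensim import corpora, matutils

docs = [["cat", "dog"], ["dog", "bird", "dog"]]            # toy tokenised documents
dic = corpora.Dictionary(docs)                             # word -> id mapping
bows = [dic.doc2bow(d) for d in docs]                      # (word id, count) pairs per document
mat = matutils.corpus2csc(bows)                            # sparse terms x documents matrix
print(mat.shape)                                           # (3, 2): 3 distinct words, 2 documents
roundtrip = matutils.Sparse2Corpus(mat, documents_columns=True)
print(list(roundtrip))                                     # back to bag-of-words lists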
def worker_loop():
    """Train the model, lifting lists of sentences from the job_queue."""
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    jobs_processed = 0
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        sentences, alpha = job
        tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
        progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
    logger.debug("worker exiting, processed %i jobs", jobs_processed)
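# A hedged sketch of how a worker loop like the one above is typically wired up.
# Queue and thread names here are illustrative, not gensim's actual internals:
# each worker pulls jobs from a shared queue, and a None job acts as a poison pill.
import threading
from queue import Queue

job_queue, progress_queue = Queue(), Queue()

def toy_worker():
    while True:
        job = job_queue.get()
        if job is None:               # poison pill => shut down this worker
            progress_queue.put(None)
            break
        progress_queue.put(len(job))  # report how much work was done

workers = [threading.Thread(target=toy_worker) for _ in range(2)]
for w in workers:
    w.start()
for job in (["a", "b"], ["c"]):
    job_queue.put(job)
for _ in workers:
    job_queue.put(None)               # one poison pill per worker
for w in workers:
    w.join()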
    lda = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                                           id2word=dictionary, num_topics=num_topics)
    topics = lda.show_topics(num_topics=num_topics, num_words=num_words,
                             formatted=False)
    distributions = [dist for dist in lda.load_document_topics()]
else:
    print('Generating model with Gensim LDA ...')
    lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                                 num_topics=num_topics, alpha='auto',
                                 chunksize=1, eval_every=1)
    gensim_topics = [t[1] for t in lda.show_topics(num_topics=num_topics,
                                                   num_words=num_words,
                                                   formatted=False)]
    topics = [[(i[1], i[0]) for i in t] for t in gensim_topics]
    distributions = []
    matrix = gensim.matutils.corpus2csc(corpus)
    for i in range(matrix.get_shape()[1]):
        bow = gensim.matutils.scipy2sparse(matrix.getcol(i).transpose())
        distributions.append(lda.get_document_topics(bow, 0))
topics = exclude_topics(topics)
keywords = generate_keywords(corpus, dictionary, topics, num_keywords)
print_keywords(keywords)
save_keywords(keywords)
save_topics(topics)
save_distributions(distributions)
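# A small sketch (toy data, illustrative variable names) of the per-document
# distribution trick used above: passing minimum_probability=0 to
# get_document_topics returns every topic, even those with near-zero weight.
import gensim
from gensim import corpora

toy_docs = [["apple", "banana"], ["banana", "cherry", "cherry"]]
toy_dict = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dict.doc2bow(d) for d in toy_docs]
toy_lda = gensim.models.LdaModel(toy_corpus, id2word=toy_dict, num_topics=2)
for bow in toy_corpus:
    print(toy_lda.get_document_topics(bow, minimum_probability=0))  # full topic distribution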
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue."""
    work = zeros(self.layer2_size, dtype=REAL)  # each thread must have its own work memory
    neu1 = matutils.zeros_aligned(self.layer2_size, dtype=REAL)
    while True:
        job = jobs.get()
        if job is None:  # data finished, exit
            break
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        if self.sg:
            job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job)
        else:
            job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job)
        with lock:
            word_count[0] += job_words
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
# Reduce vocabulary
dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
print('Generate term document matrix...')
corpus = []
for i, bill in enumerate(bills):
    text = get_bill_text(bill)
    if text is None:
        continue
    tokens = cleaner.clean(text)
    corpus.append(dictionary.doc2bow(tokens))
    if i % 100 == 0:
        print(i)
dtm = matutils.corpus2csc(corpus).transpose()
print('Calculating similarities')
csims = cosine_similarity(dtm)
# Store output (store the complete matrix without the diagonal, in case the
# left and right bills differ in the other datasets)
outline = '{},{},{}\n'
with open('../../data/ncsl/cosine_similarities.csv', 'w') as outfile:
    outfile.write(outline.format('left_doc_id',
                                 'right_doc_id',
                                 'cosine_similarity'))
    for i in range(len(ids_with_text)):
        for j in range(len(ids_with_text)):
            if i == j:
                continue
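# A self-contained sketch (toy matrix, illustrative only) of the similarity step
# above: corpus2csc builds a terms x documents matrix, and transposing it gives
# the documents x terms layout that sklearn's cosine_similarity expects.
from gensim import matutils
from sklearn.metrics.pairwise import cosine_similarity

toy_bows = [[(0, 2), (1, 1)], [(1, 3)], [(0, 1), (2, 4)]]   # three bag-of-words documents
toy_dtm = matutils.corpus2csc(toy_bows).transpose()          # shape: (3 docs, 3 terms)
toy_sims = cosine_similarity(toy_dtm)                        # 3 x 3 pairwise similarity matrix
print(toy_sims.round(2))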
Parameters
----------
topicid : int
    The ID of the topic to be returned.
topn : int, optional
    Number of the most significant words that are associated with the topic.

Returns
-------
list of (int, float)
    Word ID - probability pairs for the most relevant words generated by the topic.

"""
topic = self.get_topics()[topicid]
topic = topic / topic.sum()  # normalize to probability distribution
bestn = matutils.argsort(topic, topn, reverse=True)
return [(idx, topic[idx]) for idx in bestn]
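# A usage sketch for the method documented above (gensim's LdaModel.get_topic_terms);
# the toy corpus and topic id are illustrative only.
import gensim
from gensim import corpora

mini_docs = [["tree", "leaf", "leaf"], ["root", "tree"]]
mini_dict = corpora.Dictionary(mini_docs)
mini_corpus = [mini_dict.doc2bow(d) for d in mini_docs]
mini_lda = gensim.models.LdaModel(mini_corpus, id2word=mini_dict, num_topics=2)
print(mini_lda.get_topic_terms(0, topn=3))   # [(word_id, probability), ...] for topic 0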
def extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(doc_lengths.shape[0], len(corpus))
    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
elif artist in self.artist2vec_model.vocab:
    mean.append(weight * self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index])
    all_words.add(self.artist2vec_model.vocab[artist].index)
else:
    raise KeyError("artist '%s' not in vocabulary" % artist)
if not mean:
    raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
limited = self.song2vec_model.syn0norm if restrict_vocab is None \
    else self.song2vec_model.syn0norm[:restrict_vocab]
# limited += self.artist2vec_model.syn0norm if restrict_vocab is None \
#     else self.artist2vec_model.syn0norm[:restrict_vocab]
dists = dot(limited, mean)
if not topn:
    return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
result = [(self.song2vec_model.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]
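# A small sketch of the two matutils helpers the snippet above relies on:
# unitvec scales a vector to unit length, and argsort returns the indices of the
# largest (or smallest) entries. Toy numbers, illustrative only.
import numpy as np
from gensim import matutils

v = np.array([3.0, 4.0])
print(matutils.unitvec(v))                              # [0.6, 0.8], L2-normalised
scores = np.array([0.1, 0.9, 0.4])
print(matutils.argsort(scores, topn=2, reverse=True))   # indices of the two largest scores: [1, 2]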
except Exception as e:
    print('error = %s' % e)
    raise e
word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab]
if not word_vocabs:
    warnings.warn("All the input context words are out-of-vocabulary for the current model.")
    return None
word2_indices = [word.index for word in word_vocabs]
l1 = np_sum(self.wv.vectors[word2_indices], axis=0)
if word2_indices and self.cbow_mean:
    l1 /= len(word2_indices)
# propagate hidden -> output and take softmax to get probabilities
prob_values = exp(dot(l1, self.trainables.syn1neg.T))
prob_values /= sum(prob_values)
top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
# return the most probable output words with their probabilities
return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]
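# A hedged usage sketch for predict_output_word, whose body is shown above.
# Toy sentences only; this matches the gensim 3.x API used in the snippet
# (newer gensim renames the `size` parameter to `vector_size`).
from gensim.models import Word2Vec

sents = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "cat", "ran"]]
w2v = Word2Vec(sents, size=10, min_count=1, negative=5, window=2)
print(w2v.predict_output_word(["the", "sat"], topn=3))  # [(word, probability), ...]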