# use the built-in Gensim lemmatize engine
if lemma:
    return utils.lemmatize(text, stopwords=ignore_words, min_length=3)
# tokenize words using NLTK Twitter Tokenizer
tknzr = TweetTokenizer()
text = tknzr.tokenize(text)
# lowercase, remove words shorter than 3 characters, drop numbers and stop words
return [word.lower() for word in text if len(word) > 2 and not word.isdigit() and word not in ignore_words]
def filenames_to_generator(directory):
    for filename in os.listdir(directory):
        yield os.path.join(directory, filename)
class DocCorpus(gensim.corpora.TextCorpus):
    # overrides the get_texts function of Gensim TextCorpus in order to use a
    # directory of texts as the corpus, where each text file is a document
    def __init__(self, docs_loc, lemmatize, dictionary=None, metadata=None):
        self.docs_loc = docs_loc
        self.lemmatize = lemmatize
        self.metadata = metadata
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary

    def get_texts(self):
        pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
        func = partial(preprocess_text, self.lemmatize)
        for tokens in pool.map(func, filenames_to_generator(self.docs_loc)):
            yield tokens
        pool.terminate()
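# A minimal usage sketch (the directory and output paths are assumptions, not part of
# the original snippet): stream a folder of plain-text files through DocCorpus and
# serialize the resulting bag-of-words vectors in gensim's MmCorpus format.
doc_corpus = DocCorpus('./documents/', lemmatize=False)
gensim.corpora.MmCorpus.serialize('./doc_corpus.mm', doc_corpus)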
lame = ['from', 'subject', 'nntp', 'posting', 'host', 'organization', 'lines', 're', 'nntp-posting-host']
stoplist = nltk.corpus.stopwords.words('english') + lame + list(string.punctuation) + ['--', "''", '``', "'s", "n't", '...']
def is_good_word(word):
    return len(word) > 3 and word not in stoplist

# texts: each document turned into a list of lowercased tokens, with stop words
# and very short words removed.
texts = [[word for word in nltk.word_tokenize(document.lower()) if is_good_word(word)] for document in documents]
# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]
# Creating a dictionary to map words to integers.
dictionary = corpora.Dictionary(texts)
# print(list(dictionary.token2id.items())[:10])
# new_doc = "Human computer interaction"
# new_vec = dictionary.doc2bow(new_doc.lower().split())
# print(new_vec)
# Vectorize: turn each document (list-of-words) into a vectorized bag-of-words.
corpus = [dictionary.doc2bow(text) for text in texts]
# Transformations, from one vector representation to another.
# E.g. bag-of-words to Tf-IDF
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model. This learns document frequencies.
# You can transform a vectorized bag-of-words:
# doc_bow = [(0, 1), (1, 1)]
# print(tfidf[doc_bow])
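# Step 2 -- apply the fitted model to the whole corpus (standard gensim idiom; the
# variable names reuse those above).
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)  # each doc is a list of (token_id, tfidf_weight) pairs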
def prepare_lsi(self, doc):
    # used for the training set
    list_total, list_tag = self.load_data(doc)
    stop_word = self.load_stopword()
    texts = [[word for word in document.lower().split() if word not in stop_word]
             for document in list_total]
    # build a bag-of-words dictionary that maps each document token to an integer id
    dictionary = corpora.Dictionary(texts)  # e.g. {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2}
    # print(dictionary.token2id)
    # produce document vectors: convert each string document into a vector of
    # (token id, term frequency) pairs
    corpus = [dictionary.doc2bow(text) for text in texts]
    # e.g. [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]]
    # the element (9, 2) means the word with id 9 occurs twice in the second document
    # compute TF-IDF weights; sublinear_tf=True applies sublinear (1 + log) term-frequency scaling
    tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word)
    # run TF-IDF over the words of every user's comments in the text and obtain
    # each word's tf-idf value
    X_sp = tfv.fit_transform(list_total)
    # fit a TF-IDF model on these "training documents"
    tfidf_model = models.TfidfModel(corpus)
    joblib.dump(tfidf_model, "tfidf_model.model")
    # transform the document vectors: re-express the term-frequency vectors as
    # tf-idf-weighted vectors
    corpus_tfidf = tfidf_model[corpus]
    # e.g. [[(1, 0.6633689723434505), (2, 0.6633689723434505)], [(7, 0.16073253746956623), (8, 0.4355066251613605)]]
    # train the LSI model: run an SVD on the matrix of training document vectors and
    # keep a rank-2 approximation
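    # A minimal sketch of the LSI step described in the comment above (not in the
    # original snippet); num_topics=2 matches the rank-2 approximation mentioned there.
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
    corpus_lsi = lsi_model[corpus_tfidf]  # fold the tf-idf vectors into the LSI space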
def construct_dictionary(documents_path, filter=None):
    # collect statistics about all tokens
    dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path))
    if filter:
        # remove stop words and words that appear only once
        stoplist = set('for a of the and to in'.split())
        stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
        dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
        dictionary.compactify()  # remove gaps in the id sequence after words were removed
    return dictionary
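# A quick usage sketch (the corpus file name is hypothetical): build a filtered
# dictionary from a newline-delimited text file and vectorize a new document with it.
dictionary = construct_dictionary('corpus.txt', filter=True)
print(dictionary.doc2bow("human computer interaction".lower().split()))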
def load_plain_corpus(self):
    """ Load the plain corpus from file """
    return corpora.MmCorpus(self._PLAIN_CORPUS_FILE)

def save_lda_model(self, lda_model, corpus, dictionary, index):
    index.save(self.lda_path + 'index.lda')
    pyLDAvis.save_json(pyLDAvis.gensim.prepare(lda_model, corpus, dictionary),
                       self.lda_path + '/../static/js/lda.json')
    print(lda_model.print_topics())
    lda_model.save(self.lda_path + 'model.lda')
    dictionary.save(self.lda_path + 'dict.lda')
    corpora.MmCorpus.serialize(self.lda_path + 'corpus.mm', corpus)
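# A hedged counterpart to save_lda_model (not part of the original class): the saved
# artifacts can be reloaded with the matching gensim load calls, where lda_path stands
# in for the same self.lda_path prefix used above.
# lda_model = models.LdaModel.load(lda_path + 'model.lda')
# dictionary = corpora.Dictionary.load(lda_path + 'dict.lda')
# corpus = corpora.MmCorpus(lda_path + 'corpus.mm')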
def fill_dictionary(self, tokens):
    """ Fills a dictionary

    Parameters
    ----------
    tokens : list of list of str
        e.g. [['hi', 'ho'], ['my', 'name', ...], ...]
    """
    self.dictionary = corpora.Dictionary(tokens, prune_at=self.prune_at)
    self.dictionary.filter_extremes(self.no_below, self.no_above,
                                    keep_n=self.keep_n)
if len(sys.argv) != 3:
    raise Exception("Usage: %s <num_topics> <num_cores>" % sys.argv[0])
ntop = int(sys.argv[1])
ncores = int(sys.argv[2])
print("Using " + str(ntop) + " topics and " + str(ncores) + " cores")
lda = gensim.models.LdaMulticore(corpus=mmTrain, id2word=id2word, num_topics=ntop, workers=ncores)
lda_file = 'LDA/lda' + str(ntop)
lda.save(lda_file)
out_vect = 'LDA/wikipedia_lda' + str(ntop) + '.txt'
# gensim.corpora.MmCorpus.serialize(out_vect, (gensim.matutils.unitvec(vec) for vec in lda[mmTest]))
gensim.corpora.MmCorpus.serialize(out_vect, lda[mmTest])
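# Sanity check (not in the original script): reload the saved model and inspect its
# topics; LdaMulticore inherits load() from LdaModel.
lda_loaded = gensim.models.LdaMulticore.load(lda_file)
print(lda_loaded.print_topics(num_topics=ntop, num_words=10))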
@classmethod
def load(cls, save_dir='./'):
    """
    Load the corpus from a save directory.
    """
    tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
    tagsToDocs = tables[0]
    docsToTags = tables[1]
    titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
    tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
    corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
    dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
    files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
    doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
    ksearch = KeySearch(dictionary, tfidf_model,
                        corpus_tfidf, titles, tagsToDocs,
                        docsToTags, files, doc_line_nums)
    return ksearch
def run(self):
    repdocs = self.read_repdocs()
    dictionary = gensim.corpora.Dictionary(repdocs)
    logging.info('term count in paper repdoc corpus pre-filtering: %d' %
                 len(dictionary))
    # drop terms that occur in fewer than 2 documents; no_above=1.0 and
    # keep_n=len(dictionary) leave the rest of the vocabulary untouched
    dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=len(dictionary))
    logging.info('term count in paper repdoc corpus post-filtering: %d' %
                 len(dictionary))
    dictionary.save(self.output().path)
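# Downstream steps can reload the filtered dictionary (the path below is a placeholder
# for whatever self.output().path resolves to):
# dictionary = gensim.corpora.Dictionary.load(output_path)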