def __init__(self):
    # Load positive and negative sentences from files.
    with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
        positive_examples = list(f.readlines())
    with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
        negative_examples = list(f.readlines())
    # s.strip() removes the trailing "\n"; clean_str normalizes the text; pad_sentences pads to a fixed length.
    positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
    negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
    self.examples = positive_examples + negative_examples
    self.sentences_texts = [sample.split() for sample in self.examples]
    # Word dictionary.
    dictionary = corpora.Dictionary(self.sentences_texts)
    self.word2id_dict = dictionary.token2id  # plain dict mapping token -> id, e.g. {"human": 0, "a": 1, ...}
    # Set labels: positive is 1, negative is 0.
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    self.lables = positive_labels + negative_labels
    examples_lables = list(zip(self.examples, self.lables))
    random.shuffle(examples_lables)
    self.MRDataset_frame = examples_lables
    # Transform each word into its dictionary id.
    self.MRDataset_wordid = \
        [(
            np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
            sent[1]
        ) for sent in self.MRDataset_frame]
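
# A minimal standalone sketch (not part of the original class) of the id-mapping step used above:
# corpora.Dictionary assigns an integer id to every token, exposed through token2id.
from gensim import corpora

toy_texts = [["the", "movie", "was", "great"], ["the", "plot", "was", "thin"]]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_word2id = toy_dictionary.token2id                 # e.g. {"the": 0, "movie": 1, ...}
toy_ids = [toy_word2id[w] for w in toy_texts[0]]      # same lookup as in MRDataset_wordid
print(toy_ids)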
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]
get_histogram(doc_clean)
exit(0)
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Creating the object for the LDA model using the gensim library.
Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=20))
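
# A hedged follow-up sketch: once the LDA model above is trained, an unseen document can be
# mapped into the same dictionary space and scored. `new_doc` is a made-up example string.
new_doc = "patients respond well to the new treatment"
new_bow = dictionary.doc2bow(clean(new_doc).split())
print(ldamodel.get_document_topics(new_bow))   # [(topic_id, probability), ...]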
def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
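
# Hypothetical usage of the function above; assumes get_chunks (from the same source module)
# is available, and uses a small made-up list of sentences.
sample_sentences = ["Elephants are large mammals of the family Elephantidae.",
                    "Three species are currently recognised."]
print(get_tfidf_weighted_keyphrases(sample_sentences, top_n=5))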
if not output_path.endswith('/'):
    output_path = output_path + '/'
check_dir(output_path)  # if the directory does not exist, create it
logging.info('building gensim corpus and dictionary for %s corpus', corpus_name)
logging.info('loading corpus')
texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
logging.info('tokenizing')
all_tokens = [item for sublist in texts for item in sublist]
logging.info('mark tokens which have frequency less than %d', min_freq)
tokens_once = set([k for k, v in collections.Counter(all_tokens).items() if v < min_freq])
logging.info('|D|=%d', len(texts))
logging.info('filter low frequency tokens')
texts = [[word for word in text if word not in tokens_once] for text in texts]
logging.info('|D|=%d', len(texts))
logging.info('building dictionary')
dictionary = corpora.Dictionary(texts)
logging.info('saving dictionary')
dictFile = output_path + corpus_name + '.dict'
dictionary.save(dictFile)
logging.info('building corpus in mm format')
corpus = [dictionary.doc2bow(text) for text in texts]
logging.info('saving corpus')
gensim_corpus_file = output_path + corpus_name + '.mm'
corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
logging.info('computing tfidf')
tfidf = models.TfidfModel(corpus)  # tf-idf model
corpus_tfidf = tfidf[corpus]  # tf-idf corpus
logging.info('saving tfidf corpus')
corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
logging.info('gensim corpus is ready')
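
# A small sketch of how the saved artifacts above could be loaded back later
# (standard gensim API; file names reuse dictFile / gensim_corpus_file / corpus_tfidf_file from above).
loaded_dictionary = corpora.Dictionary.load(dictFile)
loaded_corpus = corpora.MmCorpus(gensim_corpus_file)
loaded_corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
logging.info('reloaded %d documents, %d unique tokens', len(loaded_corpus), len(loaded_dictionary))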
##################################################################################
def VSM(articleMatrix):
    dictionary = corpora.Dictionary(articleMatrix)  # build the dictionary
    corpus = [dictionary.doc2bow(article) for article in articleMatrix]  # create a bag-of-words per article
    # Use the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # Similarity matrix
    tmp_file = get_tmpfile("vsm_similarity")
    similarity = similarities.Similarity(tmp_file, corpus_tfidf, num_features=len(dictionary))
    # Calculate pairwise similarity
    similarityMat = similarity[corpus_tfidf]
    return similarityMat
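
# Illustrative call (toy data, not from the original source): each article is a list of tokens.
articles = [["cat", "sits", "on", "mat"],
            ["dog", "plays", "in", "yard"],
            ["cat", "chases", "dog"]]
print(VSM(articles))   # pairwise cosine similarities in TF-IDF space, one row per article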
def __init__(self, data_path):
    self.data_path = data_path
    self.dictionary = corpora.Dictionary()
    self.corpus = []
    self.labels = []
    self.cut_doc_obj = cutDoc()
    self.w2v_file = W2V_FILE
    self.class_num = CLASS_NUM
    self.filter_sizes = (3, 8)
    self.num_filters = 10
    self.hidden_dims = 64
global dictionary, lsi, raw_docs
print('Querying questions...')
raw_docs = read_quesion()
print(raw_docs)
# If the model has already been trained and the custom questions have not changed,
# the saved index can be loaded directly and returned.
# if 'Lsi_matrix.index' in os.listdir('.'):
#     index = similarities.SparseMatrixSimilarity.load('Lsi_matrix.index')
#     return index
# No saved model, or the custom questions have changed: retrain from scratch.
all_doc_list = [list(jieba.cut(doc)) for doc in raw_docs]
# Build the bag-of-words dictionary
dictionary = corpora.Dictionary(all_doc_list)
# Corpus:
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
# Train an LSI model on the (initial) corpus
lsi = models.LsiModel(corpus)
# Text similarity: build a sparse similarity matrix
index = similarities.SparseMatrixSimilarity(
    lsi[corpus], num_features=len(dictionary.keys()))
# Save the similarity index
index.save('Lsi_matrix.index')
return index
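
# A hedged sketch of how the index built above could be queried with a new question;
# `user_question` is a placeholder string supplied by the caller.
user_question = "How do I reset my password?"
query_bow = dictionary.doc2bow(list(jieba.cut(user_question)))
sims = index[lsi[query_bow]]            # similarity against every stored question
best_match = raw_docs[int(sims.argmax())]
print(best_match, sims.max())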
def __init__(self, infile, stemmer, remove_same_state, type="all"):
    self.infile = infile
    self.exclude = set(['', ' '])
    self.size = 0
    self.dictionary = corpora.Dictionary()
    self.stemmer = stemmer
    self.remove_same_state = remove_same_state
    self.schar = re.compile('[^A-Za-z]')
    self.type = type
    self.no_align = 0
def __init__(self, corpus):
    self.corpus = corpus
    self.iter_1, self.iter_2 = itertools.tee(self.corpus, 2)
    self.tokens = [tokens for tokens in iter_corpus(self.iter_1)]
    # Create a dictionary out of the input corpus tokens
    self.dict = gensim.corpora.Dictionary(self.tokens)
    self.filename = None
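
# A speculative sketch (not in the original class): the second tee'd iterator (self.iter_2) is
# presumably kept so the corpus can be streamed a second time, e.g. to emit bag-of-words vectors.
def iter_bow(self):
    for tokens in iter_corpus(self.iter_2):
        yield self.dict.doc2bow(tokens)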