Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
    '''Return the indices of the terms that should be removed when compacting.

    Two groups of indices are concatenated: terms that survive the pair
    filters below (treated as redundant), and terms whose summed frequency
    is below ``self.minimum_term_count``.

    Parameters
    ----------
    term_freqs
        Table indexed by term; must expose ``.values`` and ``.index`` and
        support boolean-mask row selection (presumably a pandas DataFrame).
    term_doc_matrix
        Object providing ``_get_relevant_idx_store(non_text)``.
    non_text
        Forwarded unchanged to ``_get_relevant_idx_store``.

    Returns
    -------
    np.ndarray
        Redundant-term indices followed by infrequent-term indices.
    '''
    token_idx_store = IndexStore()
    # Keep only the rows whose total count reaches the configured minimum.
    valid_terms_mask = term_freqs.values.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]

    # Build a (term x token) indicator matrix and record how many
    # whitespace-separated tokens each term has.
    lengths = []
    fact = CSRMatrixFactory()
    for i, term in enumerate(terms):
        tokens = term.split()
        for tok in tokens:
            fact[i, token_idx_store.getidx(tok)] = 1
        lengths.append(len(tokens))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()

    # NOTE(review): assumes ``mat`` is a scipy sparse matrix, so ``*`` is a
    # matrix product counting tokens shared by each pair of terms; a zero
    # then marks a pair where the shared count equals a term's token count.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T

    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)

    idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    # Positions in ``term_freqs`` of the rows that failed the frequency mask.
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    # The computed indices were previously discarded; hand them to the caller.
    return np.concatenate([redundant_terms, infrequent_terms])
def _get_features_and_labels_from_documents_and_indexes(self,
                                                        category_doc_iter,
                                                        category_idx_store,
                                                        term_idx_store,
                                                        metadata_idx_store):
    '''Register every document and return the resulting matrices and labels.

    Each ``(category, parsed_text)`` pair from ``category_doc_iter`` is
    handed to ``self._register_doc_and_category`` together with its running
    document number, two fresh matrix factories, the supplied index stores
    and a label list.

    Returns
    -------
    tuple
        ``(X, mX, y)``: the CSR matrices produced by the two factories and
        the label list converted to a numpy array.
    '''
    labels = []
    term_factory = CSRMatrixFactory()
    metadata_factory = CSRMatrixFactory()
    for doc_number, (category, parsed_text) in enumerate(category_doc_iter):
        self._register_doc_and_category(
            term_factory,
            metadata_factory,
            category,
            category_idx_store,
            doc_number,
            parsed_text,
            term_idx_store,
            metadata_idx_store,
            labels,
        )
    return (
        term_factory.get_csr_matrix(),
        metadata_factory.get_csr_matrix(),
        np.array(labels),
    )
for term in terms_in_corpus[label_term_mask]]
if new_meta_X is None:
new_meta_X = label_X
else:
label_X_pad = (CSRMatrixFactory()
.set_last_col_idx(cols_to_pad - 1)
.set_last_row_idx(sum(label_doc_mask) - 1)
.get_csr_matrix())
padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
new_meta_X = scipy.sparse.vstack([new_meta_X,
padded_label_X])
new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
new_meta_X = new_meta_X.tocsr()
new_mX = (CSRMatrixFactory()
.set_last_col_idx(new_meta_X.shape[1] - 1)
.set_last_row_idx(new_meta_X.shape[0] - 1)
.get_csr_matrix().tolil())
start_row = 0
for doc_label in ordered_doc_labels:
label_doc_mask = doc_labels == doc_label
num_rows = sum(label_doc_mask)
new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
start_row += num_rows
new_mX = new_mX.tocsr()
new_tdm = self._make_new_term_doc_matrix(self._X,
new_mX,
self._y,
self._term_idx_store,
self._category_idx_store,
def build(self):
'''Constructs the term doc matrix.
Returns
-------
TermDocMatrix
'''
X_factory = CSRMatrixFactory()
mX_factory = CSRMatrixFactory()
term_idx_store = IndexStore()
metadata_idx_store = IndexStore()
parse_pipeline = ParsePipelineFactoryWithoutCategories(self.get_nlp(),
X_factory,
mX_factory,
term_idx_store,
metadata_idx_store,
self)
df = self._clean_and_filter_nulls_and_empties_from_dataframe()
tdm = self._apply_pipeline_and_get_build_instance(X_factory,
mX_factory,
df,
parse_pipeline,
term_idx_store,
def use_categories_as_metadata(self):
    '''
    Build a TermDocMatrix like self, but with the metadata matrix replaced by
    one built from the category ids: the i-th id yielded by
    ``get_category_ids()`` sets entry ``[i, id]`` to 1.

    :return: TermDocMatrix
    '''
    indicator_factory = CSRMatrixFactory()
    for row, category_col in enumerate(self.get_category_ids()):
        indicator_factory[row, category_col] = 1
    category_metadata = indicator_factory.get_csr_matrix()

    # A copy of the category index store is passed alongside the original.
    metadata_idx_store = copy(self._category_idx_store)
    # Element-wise self-comparison of the labels; presumably an all-True
    # mask that keeps every document - confirm against the callee.
    self_equal_mask = self._y == self._y
    return self._make_new_term_doc_matrix(self._X,
                                          category_metadata,
                                          self._y,
                                          self._term_idx_store,
                                          self._category_idx_store,
                                          metadata_idx_store,
                                          self_equal_mask)
def init_term_doc_matrix_variables():
    '''Create the empty containers used while building a term-document matrix.

    Returns
    -------
    tuple
        ``(X_factory, mX_factory, category_idx_store, term_idx_store,
        metadata_idx_store, y)``: two new ``CSRMatrixFactory`` objects, three
        new ``IndexStore`` objects and an empty list.
    '''
    term_factory = CSRMatrixFactory()
    metadata_factory = CSRMatrixFactory()
    category_store = IndexStore()
    term_store = IndexStore()
    metadata_store = IndexStore()
    labels = []
    return (
        term_factory,
        metadata_factory,
        category_store,
        term_store,
        metadata_store,
        labels,
    )
def _get_features_and_labels_from_documents_and_indexes(self,
                                                        category_doc_iter,
                                                        category_idx_store,
                                                        term_idx_store,
                                                        metadata_idx_store):
    '''Register every document and return the resulting matrices and labels.

    Each ``(category, parsed_text)`` pair from ``category_doc_iter`` is
    handed to ``self._register_doc_and_category`` together with its running
    document number, two fresh matrix factories, the supplied index stores
    and a label list.

    Returns
    -------
    tuple
        ``(X, mX, y)``: the CSR matrices produced by the two factories and
        the label list converted to a numpy array.
    '''
    labels = []
    term_factory = CSRMatrixFactory()
    metadata_factory = CSRMatrixFactory()
    for doc_number, (category, parsed_text) in enumerate(category_doc_iter):
        self._register_doc_and_category(
            term_factory,
            metadata_factory,
            category,
            category_idx_store,
            doc_number,
            parsed_text,
            term_idx_store,
            metadata_idx_store,
            labels,
        )
    return (
        term_factory.get_csr_matrix(),
        metadata_factory.get_csr_matrix(),
        np.array(labels),
    )