feature_col : str, optional
name of column in df with a feature dictionary
metadata_col : str, optional
name of column in df with a metadata dictionary
parsed_col : str, optional
name of column in df with parsed strings
'''
self._df = df.reset_index()
self._category_col = category_col
self._text_col = text_col
self._feature_col = feature_col
self._parsed_col = parsed_col
self._metadata_col = metadata_col
self._category_idx_store = IndexStore()
self._X_factory = CSRMatrixFactory()
self._mX_factory = CSRMatrixFactory()
self._term_idx_store = IndexStore()
self._metadata_idx_store = IndexStore()
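# A minimal sketch of the dataframe shape this constructor expects: one row per
# document, with a category label, raw text, and a per-document feature dictionary.
# The column names and the factory call in the trailing comment are illustrative
# assumptions, not requirements stated in this snippet.
import pandas as pd

example_df = pd.DataFrame({
    'category': ['sports', 'politics'],
    'text': ['the team won the game', 'the senate passed the bill'],
    'features': [{'team': 1, 'won': 1, 'game': 1},
                 {'senate': 1, 'passed': 1, 'bill': 1}],
})
# e.g. SomeFeatureDictCorpusFactory(example_df, category_col='category',
#                                   text_col='text', feature_col='features').build()
# (the factory class name above is hypothetical)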
def add_doc_names_as_metadata(self, doc_names):
'''
:param doc_names: array-like[str], document names, one per document
:return: Corpus-like object with doc names as metadata. If two documents share the same name,
an occurrence number (in parentheses) will be appended to their names.
'''
if len(doc_names) != self.get_num_docs():
raise Exception("The parameter doc_names contains %s elements. "
"It should have %s elements, one per document." % (len(doc_names), self.get_num_docs()))
doc_names_counter = collections.Counter(np.array(doc_names))
metafact = CSRMatrixFactory()
metaidxstore = IndexStore()
doc_id_uses = collections.Counter()
for i in range(self.get_num_docs()):
doc_id = doc_names[i]
if doc_names_counter[doc_id] > 1:
doc_id_uses[doc_id] += 1
doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
else:
doc_name_idx = metaidxstore.getidx(doc_id)
metafact[i, i] = doc_name_idx
return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
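# A standalone sketch of the name-disambiguation rule used above: when a document
# name occurs more than once, its occurrence number is appended in parentheses.
# Standard library only; it mirrors the loop in add_doc_names_as_metadata.
import collections

def disambiguate_doc_names(doc_names):
    counts = collections.Counter(doc_names)
    uses = collections.Counter()
    disambiguated = []
    for name in doc_names:
        if counts[name] > 1:
            uses[name] += 1
            disambiguated.append('%s (%s)' % (name, uses[name]))
        else:
            disambiguated.append(name)
    return disambiguated

# disambiguate_doc_names(['a', 'b', 'a']) == ['a (1)', 'b', 'a (2)']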
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
'''
Parameters
----------
category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs
Returns
----------
t : TermDocMatrix
'''
term_idx_store = IndexStore()
category_idx_store = IndexStore()
metadata_idx_store = IndexStore()
X, mX, y = self._get_features_and_labels_from_documents_and_indexes \
(category_doc_iter,
category_idx_store,
term_idx_store,
metadata_idx_store)
return TermDocMatrix(X,
mX,
y,
term_idx_store=term_idx_store,
category_idx_store=category_idx_store,
metadata_idx_store=metadata_idx_store)
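# A hedged sketch of how the (category, spaCy Doc) pairs consumed above might be
# produced.  Assumes spaCy with an installed model ('en_core_web_sm' is only an
# example); the builder itself just needs an iterator of (str, Doc) tuples.
import spacy

def category_doc_pairs(labeled_texts, model='en_core_web_sm'):
    nlp = spacy.load(model)
    for category, text in labeled_texts:
        yield category, nlp(text)

# e.g. builder._build_from_category_spacy_doc_iter(
#          category_doc_pairs([('sports', 'the team won the game')]))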
def batch_delete_idx(self, idx_list):
new_idxstore = IndexStore()
last_idx_to_delete = -1
number_of_values = self.getnumvals()
for idx_to_delete in sorted(idx_list):
if idx_to_delete >= number_of_values:
raise ValueError('index ' + str(idx_to_delete) + ' not found')
new_idxstore._i2val += self._i2val[last_idx_to_delete + 1:idx_to_delete]
last_idx_to_delete = idx_to_delete
new_idxstore._i2val += self._i2val[last_idx_to_delete + 1:]
new_idxstore._val2i = {val: i for i, val in enumerate(new_idxstore._i2val)}
new_idxstore._next_i = len(new_idxstore._val2i)
return new_idxstore
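# A plain-list re-statement of what batch_delete_idx computes: drop the values at
# the given positions, then rebuild the value-to-index mapping over the survivors.
def delete_indices(values, idx_list):
    if any(i >= len(values) for i in idx_list):
        raise ValueError('index not found')
    to_delete = set(idx_list)
    kept = [val for i, val in enumerate(values) if i not in to_delete]
    val2i = {val: i for i, val in enumerate(kept)}
    return kept, val2i

# delete_indices(['a', 'b', 'c', 'd'], [1, 3]) == (['a', 'c'], {'a': 0, 'c': 1})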
def term_group_freq_df(self, group_col):
# type: (str) -> pd.DataFrame
'''
Returns a dataframe indexed on the number of groups a term occurred in.
Parameters
----------
group_col : str
Returns
-------
pd.DataFrame
'''
group_idx_store = IndexStore()
X = self._X
group_idx_to_cat_idx, row_group_cat \
= self._get_group_docids_and_index_store(X, group_col, group_idx_store)
newX = self._change_document_type_in_matrix(X, row_group_cat)
newX = self._make_all_positive_data_ones(newX)
category_row = newX.tocoo().row
for group_idx, cat_idx in group_idx_to_cat_idx.items():
category_row[category_row == group_idx] = cat_idx
catX = self._change_document_type_in_matrix(newX, category_row)
return self._term_freq_df_from_matrix(catX)
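# A toy illustration (standard library only, no sparse matrices) of the quantity
# this method is organized around: for each term, the number of distinct groups
# it occurs in.  The per-category bookkeeping above is omitted in this sketch.
import collections

def groups_per_term(group_to_terms):
    counts = collections.Counter()
    for terms in group_to_terms.values():
        for term in set(terms):
            counts[term] += 1
    return counts

# groups_per_term({'g1': ['vote', 'win'], 'g2': ['vote']}) == {'vote': 2, 'win': 1}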
def build(self):
'''Constructs the term doc matrix.
Returns
-------
TermDocMatrix
'''
X_factory = CSRMatrixFactory()
mX_factory = CSRMatrixFactory()
term_idx_store = IndexStore()
metadata_idx_store = IndexStore()
parse_pipeline = ParsePipelineFactoryWithoutCategories(self.get_nlp(),
X_factory,
mX_factory,
term_idx_store,
metadata_idx_store,
self)
df = self._clean_and_filter_nulls_and_empties_from_dataframe()
tdm = self._apply_pipeline_and_get_build_instance(X_factory,
mX_factory,
df,
parse_pipeline,
term_idx_store,
metadata_idx_store)
return tdm
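# A hedged sketch of the two building blocks build() hands to the parse pipeline,
# using only behavior visible in this snippet: IndexStore assigns integer ids to
# values via getidx(), and CSRMatrixFactory collects (row, col) = value entries and
# emits a sparse matrix via get_csr_matrix().  The import paths are assumptions
# about the package layout.
from scattertext.CSRMatrixTools import CSRMatrixFactory
from scattertext.indexstore import IndexStore

term_idx_store = IndexStore()
X_factory = CSRMatrixFactory()
docs = [['the', 'team', 'won'], ['the', 'bill', 'passed']]
for doc_i, terms in enumerate(docs):
    for term in terms:
        X_factory[doc_i, term_idx_store.getidx(term)] = 1  # mark term presence
X = X_factory.get_csr_matrix()  # rows are documents, columns are distinct terms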
Parameters
----------
df : pd.DataFrame
contains category_col and parsed_col, where parsed_col consists entirely of spaCy docs
category_col : str
name of the category column in df
parsed_col : str
name of the spaCy-parsed column in df
feats_from_spacy_doc : FeatsFromSpacyDoc
'''
self._df = df.reset_index()
self._category_col = category_col
self._parsed_col = parsed_col
self._category_idx_store = IndexStore()
self._X_factory = CSRMatrixFactory()
self._mX_factory = CSRMatrixFactory()
self._term_idx_store = IndexStore()
self._metadata_idx_store = IndexStore()
self._feats_from_spacy_doc = feats_from_spacy_doc
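# A hedged construction sketch for this parsed-documents variant: the dataframe
# carries a category column and a column of already-parsed spaCy docs, and a
# FeatsFromSpacyDoc-style extractor supplies the term features.  The class and
# import names below (CorpusFromParsedDocuments, FeatsFromSpacyDoc) are assumptions
# drawn from the parameter names, not confirmed by this snippet.
import pandas as pd
import spacy
from scattertext import CorpusFromParsedDocuments, FeatsFromSpacyDoc

nlp = spacy.load('en_core_web_sm')  # any installed spaCy model
df = pd.DataFrame({
    'party': ['democrat', 'republican'],
    'text': ['we passed the bill', 'we cut the taxes'],
})
df['parse'] = df['text'].apply(nlp)
corpus = CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=FeatsFromSpacyDoc(),
).build()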