Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __init__(self, corpus):
'''
Parameters
----------
corpus
'''
assert isinstance(corpus, ParsedCorpus)
self.corpus = corpus
self.termidxstore = corpus._term_idx_store
matfact = CSRMatrixFactory()
self.doclabs = []
self.sentlabs = []
self.sentdocs = []
senti = 0
for doci, doc in enumerate(corpus.get_parsed_docs()):
for sent in doc.sents:
validsent = False
for t in sent:
try:
termi = self.termidxstore.getidxstrict(t.lower_)
except:
continue
if validsent is False:
def _make_new_term_doc_matrix(self,
                              new_X=None,
                              new_mX=None,
                              new_y=None,
                              new_term_idx_store=None,
                              new_category_idx_store=None,
                              new_metadata_idx_store=None,
                              new_y_mask=None):
    '''Build a ParsedCorpus from this object's state plus any overrides.

    Every argument is optional; an argument left as None means the
    corresponding attribute currently held on ``self`` is reused.

    Parameters
    ----------
    new_X : replacement for ``self._X``, or None
    new_mX : replacement for ``self._mX``, or None
    new_y : replacement for ``self._y``, or None
    new_term_idx_store : replacement for ``self._term_idx_store``, or None
    new_category_idx_store : replacement for ``self._category_idx_store``, or None
    new_metadata_idx_store : replacement for ``self._metadata_idx_store``, or None
    new_y_mask : if not None, used to index ``self._df``
        (presumably a boolean row mask -- confirm against callers)

    Returns
    -------
    ParsedCorpus
    '''
    # Resolve each override against the current state; attributes on self
    # are only read when the matching override was not supplied.
    if new_X is None:
        new_X = self._X
    if new_mX is None:
        new_mX = self._mX
    if new_y is None:
        new_y = self._y
    if new_term_idx_store is None:
        new_term_idx_store = self._term_idx_store
    if new_category_idx_store is None:
        new_category_idx_store = self._category_idx_store
    if new_metadata_idx_store is None:
        new_metadata_idx_store = self._metadata_idx_store

    # The data frame is indexed by the mask only when a mask is given.
    if new_y_mask is None:
        selected_df = self._df
    else:
        selected_df = self._df[new_y_mask]

    return ParsedCorpus(
        X=new_X,
        mX=new_mX,
        y=new_y,
        parsed_col=self._parsed_col,
        category_col=self._category_col,
        term_idx_store=new_term_idx_store,
        category_idx_store=new_category_idx_store,
        metadata_idx_store=new_metadata_idx_store,
        df=selected_df,
        unigram_frequency_path=self._unigram_frequency_path,
    )
def build(self):
'''Constructs the term doc matrix.
Returns
-------
scattertext.ParsedCorpus.ParsedCorpus
'''
self._y = self._get_y_and_populate_category_idx_store()
self._df.apply(self._add_to_x_factory, axis=1)
self._X = self._X_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
self._mX = self._mX_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
if self._parsed_col is not None and self._parsed_col in self._df:
return ParsedCorpus(self._df,
self._X,
self._mX,
self._y,
self._term_idx_store,
self._category_idx_store,
self._metadata_idx_store,
self._parsed_col,
self._category_col)
else:
return CorpusDF(self._df,
self._X,
self._mX,
self._y,
self._text_col,
self._term_idx_store,
self._category_idx_store,
def _make_new_term_doc_matrix(self,
new_X,
new_mX,
new_y,
new_term_idx_store,
new_category_idx_store,
new_metadata_idx_store,
new_y_mask):
if self._parsed_col is not None and self._parsed_col in self._df:
return ParsedCorpus(self._df[new_y_mask],
new_X,
new_mX,
new_y,
new_term_idx_store,
new_category_idx_store,
new_metadata_idx_store,
self._parsed_col,
self._category_col)
else:
return CorpusDF(self._df[new_y_mask],
new_X,
new_mX,
new_y,
self._text_col,
new_term_idx_store,
new_category_idx_store,
def __init__(self, corpus, alternative_text_field=None):
    '''
    Store the corpus and, optionally, an alternative field to display.

    Parameters
    ----------
    corpus, Corpus: Corpus to extract documents and labels from
    alternative_text_field, str or None: if str, corpus must be parsed corpus;
        the texts to display are then read via ``corpus.get_field``

    Raises
    ------
    CorpusShouldBeParsedCorpusException
        if ``alternative_text_field`` is given and ``corpus`` is not a
        ParsedCorpus
    '''
    # NOTE: no type check is performed on `corpus` itself; only the
    # alternative-text path below requires a ParsedCorpus.
    self._texts_to_display = None

    wants_alternative_text = alternative_text_field is not None
    if wants_alternative_text and not isinstance(corpus, ParsedCorpus):
        raise CorpusShouldBeParsedCorpusException(
            'Corpus type needs to be ParsedCorpus to use the alternative text field.')
    if wants_alternative_text:
        self._texts_to_display = corpus.get_field(alternative_text_field)

    self._use_non_text_features = False
    self._corpus = corpus
def __init__(self, corpus, word2vec_model=None):
    '''
    Store the corpus and resolve the word2vec model to use.

    Parameters
    ----------
    corpus: ParsedCorpus
        from which to build word2vec model
    word2vec_model: word2vec.Word2Vec
        Gensim instance to be used to train word2vec model

    Raises
    ------
    AssertionError
        if ``corpus`` is not a ParsedCorpus

    Notes
    -----
    The model type check is best-effort: when gensim cannot be imported, or
    the supplied model is not a gensim ``Word2Vec``, a warning is issued and
    the model is used as-is (duck-typed) rather than rejected.
    '''
    try:
        from gensim.models import word2vec
    except Exception:
        # gensim is optional. A broken install can raise errors other than
        # ImportError, so stay best-effort here -- but unlike a bare
        # `except:` this no longer swallows KeyboardInterrupt/SystemExit.
        warnings.warn("You should really install gensim, but we're going to duck-type your model and pray it works")
    else:
        # Checked outside the try block so a type mismatch is reported for
        # what it is instead of being mistaken for a missing gensim install.
        if word2vec_model is not None and not isinstance(word2vec_model, word2vec.Word2Vec):
            warnings.warn("word2vec_model is not a gensim Word2Vec instance, "
                          "but we're going to duck-type your model and pray it works")
    assert isinstance(corpus, ParsedCorpus)
    self.corpus = corpus
    self.model = self._get_word2vec_model(word2vec_model)
def build(self):
    '''Constructs the term doc matrix.

    Computes ``self._y``, applies ``self._add_to_x_factory`` to every row of
    ``self._df``, materialises ``self._X`` and ``self._mX`` from their
    factories, and wraps everything in a ParsedCorpus.

    Returns
    -------
    scattertext.ParsedCorpus.ParsedCorpus
    '''
    self._y = self._get_y_and_populate_category_idx_store()

    # Row-wise apply is run for its side effects only; its result is
    # discarded (presumably it feeds the X / mX factories -- confirm).
    self._df.apply(self._add_to_x_factory, axis=1)

    # Both matrices are given the same last row index, len(y) - 1.
    final_row_idx = len(self._y) - 1
    x_factory = self._X_factory.set_last_row_idx(final_row_idx)
    self._X = x_factory.get_csr_matrix()
    mx_factory = self._mX_factory.set_last_row_idx(final_row_idx)
    self._mX = mx_factory.get_csr_matrix()

    corpus_args = (
        self._df,
        self._X,
        self._mX,
        self._y,
        self._term_idx_store,
        self._category_idx_store,
        self._metadata_idx_store,
        self._parsed_col,
        self._category_col,
    )
    return ParsedCorpus(*corpus_args)
def _make_new_term_doc_matrix(self,
                              new_X=None,
                              new_mX=None,
                              new_y=None,
                              new_term_idx_store=None,
                              new_category_idx_store=None,
                              new_metadata_idx_store=None,
                              new_y_mask=None):
    '''Build a ParsedCorpus from this object's state plus any overrides.

    Each argument left as None falls back to the corresponding attribute
    on ``self``.

    Parameters
    ----------
    new_X : replacement for ``self._X``, or None
    new_mX : replacement for ``self._mX``, or None
    new_y : replacement for ``self._y``, or None
    new_term_idx_store : replacement for ``self._term_idx_store``, or None
    new_category_idx_store : replacement for ``self._category_idx_store``, or None
    new_metadata_idx_store : replacement for ``self._metadata_idx_store``, or None
    new_y_mask : if not None, used to index ``self._df``
        (presumably a boolean row mask -- confirm against callers)

    Returns
    -------
    ParsedCorpus
    '''
    # Compare the mask against None explicitly: truth-testing an array-like
    # mask (e.g. a multi-element NumPy array or pandas Series) raises
    # ValueError, and an all-False mask must still be applied.
    return ParsedCorpus(self._df[new_y_mask] if new_y_mask is not None else self._df,
                        self._X if new_X is None else new_X,
                        self._mX if new_mX is None else new_mX,
                        self._y if new_y is None else new_y,
                        self._term_idx_store if new_term_idx_store is None else new_term_idx_store,
                        self._category_idx_store if new_category_idx_store is None else new_category_idx_store,
                        self._metadata_idx_store if new_metadata_idx_store is None else new_metadata_idx_store,
                        self._parsed_col,
                        self._category_col)