def __init__(self,
df,
category_col,
parsed_col,
feats_from_spacy_doc=FeatsFromSpacyDoc()):
'''
Parameters
----------
        df : pd.DataFrame
            contains category_col and parsed_col, where parsed_col consists entirely of spaCy docs
        category_col : str
            name of the category column in df
        parsed_col : str
            name of the spaCy-parsed column in df
feats_from_spacy_doc : FeatsFromSpacyDoc
'''
self._df = df.reset_index()
self._category_col = category_col
self._parsed_col = parsed_col
self._category_idx_store = IndexStore()
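# --- Usage sketch (assumption: this __init__ matches scattertext's
# --- CorpusFromParsedDocuments; requires the en_core_web_sm spaCy model) ---
import pandas as pd
import spacy
import scattertext as st

nlp = spacy.load('en_core_web_sm')
example_df = pd.DataFrame({
    'category': ['hamlet', 'jay-z/r. kelly'],
    'text': [u'Did sometimes march? by heaven I charge thee, speak!',
             u'It is I sire Tone from Brooklyn.'],
})
example_df['parsed'] = example_df.text.apply(nlp)  # parsed_col must hold spaCy Doc objects
corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed').build()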
from collections import Counter
from functools import partial
from sys import version_info
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
class FeatsFromSpacyDocAndEmpath(FeatsFromSpacyDoc):
def __init__(self,
use_lemmas=False,
entity_types_to_censor=set(),
tag_types_to_censor=set(),
strip_final_period=False,
empath_analyze_function=None,
**kwargs):
'''
Parameters
----------
        empath_analyze_function: function (default=empath.Empath().analyze)
            Function that produces a dictionary mapping Empath categories to scores
        Other parameters from FeatsFromSpacyDoc.__init__
'''
        if empath_analyze_function is None:
            try:
                import empath
            except ImportError:
                raise Exception("Please install the empath library to use FeatsFromSpacyDocAndEmpath.")
            self._empath_analyze_function = empath.Empath().analyze
        else:
            # Wrap the user-supplied function so it is called with Empath's bigram tokenizer
            self._empath_analyze_function = partial(empath_analyze_function,
                                                    kwargs={'tokenizer': 'bigram'})
        super(FeatsFromSpacyDocAndEmpath, self).__init__(use_lemmas,
                                                         entity_types_to_censor,
                                                         tag_types_to_censor,
                                                         strip_final_period)
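# --- Usage sketch (assumes scattertext exports FeatsFromSpacyDocAndEmpath and
# --- that the empath package is installed; example_df as built above) ---
import scattertext as st

empath_feature_builder = st.FeatsFromSpacyDocAndEmpath()
empath_corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed',
    feats_from_spacy_doc=empath_feature_builder).build()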
from collections import Counter
from scattertext.external.phrasemachine import phrasemachine
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
class PhraseMachinePhrases(FeatsFromSpacyDoc):
'''
Returns unigrams and phrase machine phrases
'''
def get_feats(self, doc):
'''
Parameters
----------
doc, Spacy Doc
Returns
-------
Counter noun chunk -> count
'''
ngram_counter = Counter()
        for sent in doc.sents:
            ngram_counter += _phrase_counts(sent)
        return ngram_counter
...u'Did sometimes march? by heaven I charge thee, speak!',
...u'Halt! Who goes there?',
...u'[Intro]',
...u'It is I sire Tone from Brooklyn.',
...u'Well, speak up man what is it?',
...u'News from the East sire! THE BEST OF BOTH WORLDS HAS RETURNED!']
>>> categories = ['hamlet'] * 4 + ['jay-z/r. kelly'] * 5
>>> clean_function = lambda text: '' if text.startswith('[') else text
        >>> term_doc_mat = ST.TermDocMatrixFactory(category_text_iter=zip(categories, documents),
        ...                                        clean_function=clean_function).build()
"""
self._category_text_iter = category_text_iter
self._clean_function = clean_function
self._nlp = nlp
self._entity_types_to_censor = set()
if feats_from_spacy_doc is None:
self._feats_from_spacy_doc = FeatsFromSpacyDoc()
else:
self._feats_from_spacy_doc = feats_from_spacy_doc
from collections import Counter
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
class UseFullDocAsMetadata(FeatsFromSpacyDoc):
def get_feats(self, doc):
return Counter()
def get_doc_metadata(self, doc):
'''
Parameters
----------
        doc, Spacy Doc
Returns
-------
Counter str -> count
'''
return Counter({str(doc): 1})
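# --- Behavior sketch (assumes a loaded spaCy pipeline; illustrative only) ---
import spacy

nlp = spacy.load('en_core_web_sm')
extractor = UseFullDocAsMetadata()
doc = nlp(u'News from the East sire!')
print(extractor.get_feats(doc))         # Counter() -- no term features at all
print(extractor.get_doc_metadata(doc))  # Counter({'News from the East sire!': 1})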
    def _analyze(self, doc):
        # Count word tokens in the document, then sum the counts by lexicon category
        # (assumes self._lexicon_df maps each term to its 'cat' label)
        text_df = (pd.DataFrame(pd.Series(Counter(t for t in split(r"(\W)", doc.lower())
                                                  if t.strip())))
                   .join(self._lexicon_df)
                   .dropna()
                   .groupby('cat')
                   .sum())
        return text_df
def get_doc_metadata(self, doc, prefix=''):
feature_counter = Counter()
if version_info[0] >= 3:
doc = str(doc)
for category, score in self._analyze(doc).to_dict()[0].items():
feature_counter[prefix + category] = int(score)
return feature_counter
@abstractmethod
def _get_terms_from_doc(self, doc):
pass
class FeatsFromTopicModel(FeatsFromSpacyDoc, FeatsFromTopicModelBase):
def __init__(self,
topic_model,
use_lemmas=False,
entity_types_to_censor=set(),
tag_types_to_censor=set(),
strip_final_period=False,
**kwargs):
'''
Parameters
----------
topic_model : dict
            {topic_model_name: [term1, term2, ...], ...}
Other parameters from FeatsFromSpacyDoc.__init__
'''
        check_topic_model_string_format(topic_model)
        self._topic_model = topic_model
        super(FeatsFromTopicModel, self).__init__(use_lemmas,
                                                  entity_types_to_censor,
                                                  tag_types_to_censor,
                                                  strip_final_period)
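# --- Usage sketch (assumes scattertext exports FeatsFromTopicModel; the topic
# --- dictionary below is hypothetical) ---
import scattertext as st

topic_model = {'speech': ['speak', 'spoke', 'speech'],
               'marching': ['march', 'halt']}
topic_feature_builder = st.FeatsFromTopicModel(topic_model)
topic_corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed',
    feats_from_spacy_doc=topic_feature_builder).build()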
    def get_feats(self, doc):
        '''
Parameters
----------
doc, Spacy Doc
Returns
-------
Counter noun chunk -> count
'''
ngram_counter = Counter()
for sent in doc.sents:
ngram_counter += _phrase_counts(sent)
return ngram_counter
class PhraseMachinePhrasesAndUnigrams(FeatsFromSpacyDoc):
'''
Returns unigrams and phrase machine phrases
'''
def get_feats(self, doc):
'''
Parameters
----------
doc, Spacy Doc
Returns
-------
Counter noun chunk -> count
'''
# ngram_counter = phrasemachine.get_phrases(str(doc), tagger='spacy')['counts']
        ngram_counter = Counter()
        for sent in doc.sents:
            # Combine unigram counts (from FeatsFromSpacyDoc) with phrasemachine phrases
            ngram_counter += Counter(self._get_unigram_feats(sent)) + _phrase_counts(sent)
        return ngram_counter
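# --- Usage sketch (assumes scattertext exports PhraseMachinePhrasesAndUnigrams;
# --- example_df as built above) ---
import scattertext as st

phrase_corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed',
    feats_from_spacy_doc=st.PhraseMachinePhrasesAndUnigrams()).build()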
from collections import Counter
from re import split
from sys import version_info
import pandas as pd
from scattertext.Common import GENERAL_INQUIRER_URL
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
class FeatsFromGeneralInquirer(FeatsFromSpacyDoc):
def __init__(self,
use_lemmas=False,
entity_types_to_censor=set(),
tag_types_to_censor=set(),
strip_final_period=False,
**kwargs):
        '''
        Builds features from the General Inquirer lexicon.

        Parameters
        ----------
        Parameters from FeatsFromSpacyDoc.__init__
        '''
self._lexicon_df = self._download_and_parse_general_inquirer()
        super(FeatsFromGeneralInquirer, self).__init__(use_lemmas,
                                                       entity_types_to_censor,
                                                       tag_types_to_censor,
                                                       strip_final_period)
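# --- Usage sketch (assumes scattertext exports FeatsFromGeneralInquirer;
# --- downloading the lexicon requires network access) ---
import scattertext as st

general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
gi_corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed',
    feats_from_spacy_doc=general_inquirer_feature_builder).build()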
    def censor_entity_types(self, entity_types):
        '''
        Entity types to exclude from feature construction. Terms matching
        specified entities, instead of being labeled by their lower-case orthographic
        form or lemma, will be labeled by their entity type.

        Parameters
        ----------
        entity_types : set of entity types outputted by spaCy
            'TIME', 'WORK_OF_ART', 'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE',
            'CARDINAL', 'LAW', 'QUANTITY', 'GPE', 'PERCENT'

        Returns
        -------
        self
        '''
assert type(entity_types) == set
self._entity_types_to_censor = entity_types
self._feats_from_spacy_doc = FeatsFromSpacyDoc(
use_lemmas=self._use_lemmas,
entity_types_to_censor=self._entity_types_to_censor
)
return self
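# --- Usage sketch (mirrors the doctest above; categories, documents, and
# --- clean_function are the same hypothetical objects) ---
term_doc_mat = ST.TermDocMatrixFactory(
    category_text_iter=zip(categories, documents),
    clean_function=clean_function
).censor_entity_types({'PERSON', 'GPE'}).build()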
from collections import Counter
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
class FeatsFromSpacyDocOnlyNounChunks(FeatsFromSpacyDoc):
'''
Just returns noun chunks from spaCy
'''
def get_feats(self, doc):
'''
Parameters
----------
        doc, Spacy Doc
Returns
-------
Counter noun chunk -> count
'''
return Counter([str(c).lower() for c in doc.noun_chunks])
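# --- Usage sketch (assumes scattertext exports FeatsFromSpacyDocOnlyNounChunks;
# --- example_df as built above) ---
import scattertext as st

noun_chunk_corpus = st.CorpusFromParsedDocuments(
    example_df, category_col='category', parsed_col='parsed',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyNounChunks()).build()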