def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
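# The two tests above only exercise the failure path: extensions backed by a
# getter or method are read-only, so passing them in the "_" attrs dict raises
# a ValueError. As a hedged sketch (not part of the original test suite), the
# valid counterpart below sets the writable "x" extension, which is registered
# with a default value, during a merge.
def sketch_merge_with_writable_extension():
    from spacy.vocab import Vocab
    from spacy.tokens import Doc, Token
    Token.set_extension("x", default=False, force=True)
    doc = Doc(Vocab(), words=["hello", "world", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"x": True}})
    assert doc[0]._.x is True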
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
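    # Hedged aside (not in the original test): because "reversed" is registered
    # with a getter, it can also be read straight off the underscore namespace,
    # which is what the {"IN": [...]} pattern above compares against.
    doc = Doc(en_vocab, words=["hi", "bye"])
    assert doc[0]._.reversed == "ih"
    assert doc[1]._.reversed == "eyb"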
def __init__(self, infl_fn, overrides_fn=None):
    self.infl_data = self._loadInflections(infl_fn)
    if overrides_fn:
        self.overrides = self._loadOverrides(overrides_fn)
    if 'spacy' in sys.modules:
        min_version = '2.0'
        # Simple string-wise major/minor check: only register the extension on
        # spaCy versions that support Token.set_extension (2.0 and later).
        mv = min_version.split('.')
        sv = spacy.__version__.split('.')
        if sv[0] > mv[0] or (sv[0] == mv[0] and sv[1] >= mv[1]):
            spacy.tokens.Token.set_extension('inflect', method=self.spacyGetInfl, force=True)
        else:
            logging.warning('Spacy extensions are disabled. Spacy version is %s. '
                            'A minimum of %s is required', spacy.__version__, min_version)
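# Hedged usage sketch: once the 'inflect' method extension is registered, it can
# be called on any Token. The argument shown below (a Penn Treebank tag) follows
# pyInflect's documented pattern and is an assumption about spacyGetInfl's signature.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp('I am testing this')
print(doc[2]._.inflect('VBD'))   # e.g. 'tested' (past-tense inflection of 'testing')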
        to custom descriptions, e.g. translations or other annotations.
        RETURNS (callable): A spaCy pipeline component.
        """
        self._has_emoji, self._is_emoji, self._emoji_desc, self._emoji = attrs
        self.merge_spans = merge_spans
        self.lookup = lookup
        self.matcher = PhraseMatcher(nlp.vocab)
        emoji_patterns = list(nlp.tokenizer.pipe(EMOJI.keys()))
        self.matcher.add(pattern_id, None, *emoji_patterns)
        # Add attributes
        Doc.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
        Doc.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
        Span.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
        Span.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
        Token.set_extension(self._is_emoji, default=False, force=force_extension)
        Token.set_extension(self._emoji_desc, getter=self.get_emoji_desc, force=force_extension)
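# Hedged usage sketch, assuming the component above is added to an nlp pipeline
# and that attrs carries the usual names ("has_emoji", "is_emoji", "emoji_desc",
# "emoji"); the exact shape of doc._.emoji depends on how iter_emoji is written.
doc = nlp(u"This is a test 😻")
print(doc._.has_emoji)       # True
print(doc[4]._.is_emoji)     # True for the emoji token
print(doc[4]._.emoji_desc)   # description looked up for the emoji
print(doc._.emoji)           # emoji found in the Doc, as exposed by iter_emoji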
def do() -> None:
    Token.set_extension('censored', default=None)
    Token.set_extension('is_profane', getter=SpacyProfanityFilterComponent.token_is_profane)
    Token.set_extension('original_profane_word', default=None)
    Span.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
    Doc.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
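# Hedged usage sketch, assuming the profanity-filter component that calls do()
# is added to a pipeline and fills in 'censored' / 'original_profane_word'
# while processing:
doc = nlp("some text to check")
print(doc._.is_profane)                    # True/False for the whole Doc
print([tok._.is_profane for tok in doc])   # per-token getter
print([tok._.censored for tok in doc])     # censored form set by the component, or None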
from eventlet.green.urllib.request import urlopen
from eventlet.timeout import Timeout
from redditscore.models.redditmodel import word_ngrams
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Token
try:
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
except ImportError:
    warnings.warn(
        'nltk could not be imported, some features will be unavailable')
Token.set_extension('transformed_text', default='', force=True)
Doc.set_extension('tokens', default='', force=True)
TIMEOUT = 3.0
POS_EMOJIS = [u'😂', u'❤', u'♥', u'😍', u'😘', u'😊', u'👍', u'💕',
              u'😁', u'🙂', u'☺', u'♡', u'😀', u'✨', u'😉', u'😃', u'😄', u'😎']
NEG_EMOJIS = [u'😭', u'😩', u'😒', u'😔', u'😱']
NEUTRAL_EMOJIS = [u'🙏']
NORMALIZE_RE = re.compile(r"([a-zA-Z])\1\1+")
ALPHA_DIGITS_RE = re.compile(r"[a-zA-Z0-9_]+")
TWITTER_HANDLES_RE = re.compile(r"@\w{1,15}")
REDDITORS_RE = re.compile(r"u/\w{1,20}")
SUBREDDITS_RE = re.compile(r"/r/\w{1,20}")
QUOTES_RE = re.compile(r'^".*"$')
def add_punct_tagger(self, tagger):
    """Add the punctuation tagger to the pipeline and register the custom
    token fields it relies on."""
    self.nlp.add_pipe(tagger, name='tag_punct', first=True)
    # Add custom fields needed for this usecase
    Token.set_extension('is_punct', default=False, force=True)
    Token.set_extension('to_skip', default=False, force=True)
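# Hedged sketch of a tagger that could be passed to add_punct_tagger(); the body
# is an assumption and simply mirrors spaCy's built-in is_punct flag into the
# custom fields registered above.
def tag_punct(doc):
    for token in doc:
        if token.is_punct:
            token._.is_punct = True
            token._.to_skip = True
    return doc

# e.g.: pipeline.add_punct_tagger(tag_punct)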