text = "Wow ๐ This is really cool! ๐ ๐"
doc = Doc(en_vocab, words=text.split(" "))
pos_emoji = ["๐", "๐", "๐", "๐คฃ", "๐", "๐"]
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if doc.vocab.strings[match_id] == "HAPPY":
doc.sentiment += 0.1
span = doc[start:end]
with doc.retokenize() as retokenizer:
retokenizer.merge(span)
token = doc[start]
token.vocab[token.text].norm_ = "happy emoji"
matcher = Matcher(en_vocab)
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher(doc)
assert doc.sentiment != 0
assert doc[1].norm_ == "happy emoji"
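
# Note: the call above uses the spaCy v2 Matcher.add signature, where the
# on_match callback is the second positional argument. In spaCy v3 the
# patterns are passed as a single list and the callback becomes a keyword:
#     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)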

from spacy.attrs import LOWER
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
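
# The '*' operator lets the pattern absorb both consecutive "and" tokens,
# which is why the single match spans all four words (start == 0, end == 4).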

import copy

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add("pat1", None, [{"orth": "hello"}])
    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=["hello"])
    assert len(new_matcher(new_doc)) == 1
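
# The point of this regression test: a Matcher survives copy.deepcopy with
# its vocab and registered patterns intact, so the copy matches identically.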

import re

from spacy.matcher import Matcher

def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
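
# doc, text, pattern, and re_pattern arrive as pytest fixtures/parameters.
# A minimal sketch of compatible fixtures (illustrative values, not the
# original test data); one character per token so that token indices line
# up with re's character offsets:
import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab

@pytest.fixture
def text():
    return "(ABBAAAAAB)."

@pytest.fixture
def doc(text):
    return Doc(Vocab(), words=list(text))

@pytest.fixture
def pattern():
    return [{"ORTH": "A", "OP": "*"}, {"ORTH": "B"}]

@pytest.fixture
def re_pattern():
    return "A*B"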

def test_create(self):
    vocab = Vocab()
    matcher = Matcher(vocab, {})
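
# Matcher(vocab, {}) with a second positional argument appears to be the
# older spaCy v1-style signature; in spaCy v2+ the matcher is created as
# Matcher(vocab) and patterns are registered afterwards with matcher.add().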

from spacy.matcher import Matcher, PhraseMatcher

def __init__(self, nlp, patterns: list = None):
    """
    spaCy pipe to match entities based on multiple patterns.

    Pattern examples:
        patterns = [
            {'kind': 'phrase', 'value': 'amazon', 'entity': 'PRODUCT'},
            {'kind': 'regex', 'value': 'ama(.+)', 'entity': 'PRODUCT'}
        ]

    :param nlp: The NLP object
    :param patterns: The matcher patterns
    """
    self.nlp = nlp
    self.phrase_matcher = PhraseMatcher(nlp.vocab)
    self.matcher = Matcher(nlp.vocab)
    self.extra_patterns = []
    # Register the initial patterns
    self.add_patterns(patterns=patterns or [])

# print(doc)
# An error can occur when there is more than one hyphen within the span;
# it can basically be ignored.
span.merge()
return doc
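
# add_patterns is not shown in this snippet. A minimal sketch of how it
# might dispatch the two pattern kinds from the docstring (hypothetical,
# not the original implementation): phrase patterns go to the
# PhraseMatcher, regex patterns are kept aside for separate handling.
def add_patterns(self, patterns):
    for pattern in patterns:
        if pattern['kind'] == 'phrase':
            self.phrase_matcher.add(pattern['entity'], None,
                                    self.nlp(pattern['value']))
        elif pattern['kind'] == 'regex':
            self.extra_patterns.append(pattern)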

from spacy.matcher import Matcher
from spacy.util import get_lang_class

CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'

Language = get_lang_class('ru')
Language.Defaults.infixes += ('«»',)
Language.Defaults.infixes += ('-',)
Language.Defaults.infixes += ('"\/',)
Language.Defaults.infixes += (r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),)
# Token.set_extension('is_adjective', default=False, force=True)
nlp = Language()
matcher = Matcher(nlp.vocab)
pattern = nlp.vocab.strings['pattern']
sentence_terminal = nlp.vocab.strings['sentence_terminal']
if merge_patterns:
    matcher.add(pattern, None, *merge_patterns)
if terminal_patterns:
    matcher.add(sentence_terminal, None, *terminal_patterns)
# nlp.add_pipe(match_adjective, name='match_adjective', last=True)
nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
nlp.add_pipe(rules_matcher, name='rules_matcher', after='detect_sentence_boundaries')
for case in HYPHEN_SPICIAL_CASES:
    nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
for case in DOT_SPECIAL_CASES:
    nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
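
# merge_patterns and terminal_patterns are defined elsewhere in the source
# module. Hypothetical examples of the shape the code above expects -- each
# entry is a token-pattern list, unpacked into matcher.add():
#     merge_patterns = [[{'ORTH': 'т'}, {'ORTH': '.'}, {'ORTH': 'е'}, {'ORTH': '.'}]]
#     terminal_patterns = [[{'ORTH': '.'}, {'ORTH': '.'}, {'ORTH': '.'}]]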

from spacy.lang.en import English
from spacy.matcher import Matcher

def __init__(self, lowercase=True, keepcaps=False, normalize=3,
             ignore_quotes=False, ignore_reddit_quotes=False,
             ignore_stopwords=False, stem=False,
             remove_punct=True, remove_breaks=True, decontract=False,
             twitter_handles=False, urls=False, hashtags=False,
             numbers=False, subreddits=False, reddit_usernames=False,
             emails=False, extra_patterns=None, keep_untokenized=None,
             whitespaces_to_underscores=True, remove_nonunicode=False,
             pos_emojis=None, neg_emojis=None, neutral_emojis=None,
             print_url_warnings=False, latin_chars_fix=False,
             ngrams=1):
    self.params = locals()

    self._nlp = English()
    self._merging_matcher = Matcher(self._nlp.vocab)
    self._matcher = Matcher(self._nlp.vocab)

    self._replacements = {}
    self._domains = {}
    self._realnames = {}
    self._stopwords = None

    # Register custom boolean lexeme flags used as pattern keys below
    alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
    hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
    twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

    # Merge hashtags and subreddit links into single tokens
    self._merging_matcher.add(
        'HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
    self._merging_matcher.add(
        'SUBREDDIT', None,
        [{'ORTH': '/r'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
        [{'ORTH': 'r'}, {'ORTH': '/'}, {alpha_digits_flag: True}])
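
# Vocab.add_flag (spaCy v2) registers a boolean lexeme getter and returns a
# flag ID that can then be used as a key in Matcher patterns, as the
# SUBREDDIT patterns above do. A standalone illustration (the checker lambda
# is made up, not one of the repo's helper functions):
nlp = English()
is_alnum_flag = nlp.vocab.add_flag(lambda text: text.isalnum())
matcher = Matcher(nlp.vocab)
matcher.add('ALNUM_PAIR', None, [{is_alnum_flag: True}, {is_alnum_flag: True}])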
from spacy.matcher import Matcher
from spacy.lang.en import English

TEXTS = [
    "The name of the case is R v Horncastle [2009] AC 123",
    "The name of the case is R v Horncastle [2009] 1 AC 123",
    "The name of the case is R v Horncastle [2009] 1 Cr App R 109",
    "The name of the case was Boaty McBoatface [2009] EWCA Civ 123",
    "The name of the case was Boaty McBoatface [2009] 1 All ER 123",
    "The name of the case was Boaty McBoatface [2009] EWHC 123 (Admin) and we like hats.",
    "I shouldn't return any matched entities.",
]
nlp = English()
matcher = Matcher(nlp.vocab)
# Matches [2010] AC 123-style
pattern1 = [
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},
    {"IS_PUNCT": True},
    {"IS_ALPHA": True},
    {"LIKE_NUM": True},
]
# Matches [2010] 1 AC 123-style
pattern2 = [
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},
    {"IS_ALPHA": True},
    {"LIKE_NUM": True},
]