Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Expect a ValueError when splitting with the given underscore attrs.

    ``underscore_attrs`` is supplied from outside this function and is passed
    to ``retokenizer.split`` under the ``"_"`` key.
    """
    # Register three extensions: one with a default, one getter, one method.
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)

    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    split_attrs = {"_": underscore_attrs}

    with pytest.raises(ValueError), doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            heads,
            attrs=split_attrs,
        )
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    """Check that extension attrs passed to ``merge`` are set on merged tokens.

    Covers a single merge (together with a ``lemma`` attribute) and two merges
    issued inside the same ``retokenize`` context.
    """
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)

    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(
            doc[0:2],
            attrs={"lemma": "hello world", "_": {"a": True, "b": "1"}},
        )
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"

    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    first_attrs = {"_": {"a": True, "b": "1"}}
    second_attrs = {"_": {"a": None, "b": "2"}}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=first_attrs)
        retokenizer.merge(doc[2:4], attrs=second_attrs)

    # After both merges the doc is checked token by token, in order.
    expected = [(True, "1"), (None, "2")]
    for index, (a_value, b_value) in enumerate(expected):
        assert doc[index]._.a is a_value
        assert doc[index]._.b == b_value
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Check that lexical attributes given to ``merge`` land on the lexeme too.

    If a user sets IS_STOP, for instance, the intended meaning is that
    "all tokens with that lexeme" count as stop words, so the resulting
    ambiguity is considered acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert all(not token.is_stop for token in doc)
    merge_attrs = {"lemma": "hello world", "is_stop": True}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0].is_stop

    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert all(not token.like_num for token in doc)
    assert all(not token.is_stop for token in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    # Each merged token carries only the attribute set in its own merge call.
    assert doc[0].like_num
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
def test_matcher_set_value_operator(en_vocab):
    """Check match counts for a pattern mixing an ``IN`` set with ``OP: "?"``."""
    matcher = Matcher(en_vocab)
    determiner = {"ORTH": {"IN": ["a", "the"]}, "OP": "?"}
    house = {"ORTH": "house"}
    matcher.add("DET_HOUSE", None, [determiner, house])

    # (words, expected number of matches)
    cases = (
        (["In", "a", "house"], 2),
        (["my", "house"], 1),
    )
    for words, expected_count in cases:
        doc = Doc(en_vocab, words=words)
        found = matcher(doc)
        assert len(found) == expected_count
def test_issue3328(en_vocab):
    """Check that two ``IN`` patterns under one key yield four ordered matches."""
    words = ["Hello", ",", "how", "are", "you", "doing", "?"]
    doc = Doc(en_vocab, words=words)

    matcher = Matcher(en_vocab)
    greeting_pattern = [{"LOWER": {"IN": ["hello", "how"]}}]
    activity_pattern = [{"LOWER": {"IN": ["you", "doing"]}}]
    matcher.add("TEST", None, greeting_pattern, activity_pattern)

    matches = matcher(doc)
    assert len(matches) == 4

    # Each match is a (match_id, start, end) triple; only the span text matters.
    matched_texts = [doc[begin:stop].text for _, begin, stop in matches]
    assert matched_texts == ["Hello", "how", "you", "doing"]
def test_sents_1_3(parser):
    """Preset sentence starts on tokens 1 and 3, parse, and count sentences.

    The first run leaves token 2 unset and expects at least three sentences;
    the second run explicitly sets token 2 to ``False`` and expects exactly
    three.
    """
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    for index in (1, 3):
        doc[index].sent_start = True
    doc = parser(doc)
    sentences = list(doc.sents)
    assert len(sentences) >= 3

    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    for index, is_start in ((1, True), (2, False), (3, True)):
        doc[index].sent_start = is_start
    doc = parser(doc)
    sentences = list(doc.sents)
    assert len(sentences) == 3
def test_spacy_training_sample_alignment(spacy_nlp_component):
    """Check that every training message gets a ``Doc`` at the same index.

    The middle message has empty text and is expected to yield a ``Doc`` with
    no tokens; the expected token texts for the other two are lowercase.
    """
    from spacy.tokens import Doc

    texts = ("I have a feeling", "", "I am the last message")
    messages = [Message.build(text=text, intent="feeling") for text in texts]
    td = TrainingData(training_examples=messages)

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)

    # First make sure all three positions hold a Doc ...
    for index in range(3):
        assert isinstance(attribute_docs["text"][index], Doc)

    # ... then compare their token texts position by position.
    expected_tokens = [
        ["i", "have", "a", "feeling"],
        [],
        ["i", "am", "the", "last", "message"],
    ]
    for index, tokens in enumerate(expected_tokens):
        assert [t.text for t in attribute_docs["text"][index]] == tokens
attrs=('has_entities', 'is_entity', 'entity_desc', 'entities', 'canonical')):
"""Initialise the pipeline component.
"""
self._has_entities, self._is_entity, self._entity_desc, self._entities, self.canonical = attrs
# Set up the KeywordProcessor
self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
self.keyword_processor.add_keywords_from_list(keywords_list)
self.keyword_processor.add_keywords_from_dict(keywords_dict)
if keywords_file:
self.keyword_processor.add_keyword_from_file(keywords_file)
self.label = label
# Register attribute on the Doc and Span
Doc.set_extension(self._has_entities, getter=self.has_entities, force=True)
Doc.set_extension(self._entities, getter=self.iter_entities, force=True)
Span.set_extension(self._has_entities, getter=self.has_entities, force=True)
Span.set_extension(self._entities, getter=self.iter_entities, force=True)
# Register attribute on the Token.
Token.set_extension(self._is_entity, default=False, force=True)
Token.set_extension(self._entity_desc, getter=self.get_entity_desc, force=True)
Token.set_extension(self.canonical, default=None, force=True)