n_keyterms = int(round(n_toks * n_keyterms))
if window_width < 2:
    raise ValueError("`window_width` must be >= 2")
window_width = min(n_toks, window_width)
min_term_freq = min(n_toks // 1000, 4)
if isinstance(ngrams, int):
    ngrams = (ngrams,)
# build full list of candidate terms;
# if inverse doc freqs are available, include nouns, adjectives, and verbs,
# otherwise just include nouns and adjectives
# (without IDF downweighting, verbs dominate the results in a bad way)
include_pos = {"NOUN", "PROPN", "ADJ", "VERB"} if idf else {"NOUN", "PROPN", "ADJ"}
terms = itertoolz.concat(
    extract.ngrams(
        doc,
        n,
        filter_stops=True,
        filter_punct=True,
        filter_nums=False,
        include_pos=include_pos,
        min_freq=min_term_freq,
    )
    for n in ngrams
)
# get normalized term strings, as desired,
# paired with positional index in document and length in a 3-tuple
if normalize == "lemma":
    terms = [(term.lemma_, term.start, len(term)) for term in terms]
elif normalize == "lower":
    terms = [(term.lower_, term.start, len(term)) for term in terms]
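# A quick standalone illustration of the candidate-term extraction used above.
# This is a sketch, not part of the original function: `nlp`, the example text,
# and the chosen parameter values are assumptions for demonstration only.
import spacy
import textacy.extract as extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("Graph-based keyterm extraction ranks candidate terms by co-occurrence.")
candidates = list(
    extract.ngrams(
        doc, 2,
        filter_stops=True, filter_punct=True, filter_nums=False,
        include_pos={"NOUN", "PROPN", "ADJ"}, min_freq=1,
    )
)
print(candidates)  # bigram Spans such as "keyterm extraction", "candidate terms"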
def test_default(self, spacy_doc):
    # TODO: figure out if this function no longer works, ugh
    # expected = {"I.M.F.": "International Monetary Fund"}
    expected = {"I.M.F.": ""}
    observed = extract.acronyms_and_definitions(spacy_doc)
    assert observed == expected
def test_determiner(self, spacy_doc):
    result = list(extract.noun_chunks(spacy_doc, drop_determiners=False))
    assert all(isinstance(span, Span) for span in result)
    assert any(span[0].pos_ == "DET" for span in result)
def test_patdict_bool_int(self, spacy_doc):
    matches = list(extract.matches(spacy_doc, [{"IS_DIGIT": True}]))[:5]
    assert matches
    assert all(span[0].is_digit is True for span in matches)
    matches = list(extract.matches(spacy_doc, [{"LENGTH": 5}]))[:5]
    assert matches
    assert all(len(span[0]) == 5 for span in matches)
def test_default(self, spacy_doc):
    result = list(extract.words(spacy_doc))
    assert all(isinstance(tok, Token) for tok in result)
    assert not any(tok.is_space for tok in result)
    (
        "IS_DIGIT:bool(True):? POS:NOUN:*",
        [{"IS_DIGIT": True, "OP": "?"}, {"POS": "NOUN", "OP": "*"}],
    ),
    (
        "LENGTH:int(5) DEP:nsubj:!",
        [{"LENGTH": 5}, {"DEP": "nsubj", "OP": "!"}],
    ),
    ("POS:DET :", [{"POS": "DET"}, {}]),
    (
        "IS_PUNCT:bool(False) : IS_PUNCT:bool(True)",
        [{"IS_PUNCT": False}, {}, {"IS_PUNCT": True}],
    ),
]
for patstr, pat in patstr_to_pats:
    assert extract._make_pattern_from_string(patstr) == pat
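# A minimal sketch of how these shorthand pattern strings are used in practice:
# extract.matches() (as exercised by the tests above) accepts such strings and
# converts them to spacy Matcher patterns internally. The pipeline name and
# example sentence below are assumptions for illustration, not part of the test.
import spacy
import textacy.extract as extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("The judges heard the arguments in the main courtroom.")
# "POS:DET POS:NOUN" = a determiner immediately followed by a noun
for span in extract.matches(doc, "POS:DET POS:NOUN"):
    print(span.text)  # e.g. "The judges", "the arguments"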
    merge_ncs (bool, optional): if True, merge noun chunks into single tokens

Yields:
    ``spacy.Doc``: doc processed from next text in ``texts``
"""
spacy_nlp = data.load_spacy_pipeline(
    lang=lang, entity=merge_nes, parser=merge_ncs)
for spacy_doc in spacy_nlp.pipe(texts, tag=True, parse=merge_ncs, entity=merge_nes,
                                n_threads=2, batch_size=1000):
    if merge_nes is True:
        spacy_utils.merge_spans(
            extract.named_entities(
                spacy_doc, bad_ne_types='numeric', drop_determiners=False))
    if merge_ncs is True:
        spacy_utils.merge_spans(
            extract.noun_chunks(
                spacy_doc, drop_determiners=False))
    yield spacy_doc
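# The enclosing generator's signature is not shown in this fragment. Assuming it
# is exposed as something like a hypothetical `texts_to_spacy_docs(texts, lang='en',
# merge_nes=False, merge_ncs=False)`, usage would look roughly like this sketch:
texts = [
    "The International Monetary Fund issued a new report.",
    "Global markets reacted quickly to the announcement.",
]
for doc in texts_to_spacy_docs(texts, lang='en', merge_nes=True, merge_ncs=True):
    # named entities and noun chunks have been merged into single tokens
    print([tok.text for tok in doc])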
def extras(corpus):
    print('Corpus: ', corpus)
    # find published docs
    for doc in corpus.get(published_match_func, limit=3):
        triples = textacy.extract.subject_verb_object_triples(doc)
        print('Published doc: ', doc, list(triples))
    # find doc with specific url
    url = 'http://www.eea.europa.eu/publications/C23I92-826-5409-5'
    for doc in corpus.get(url_match_func(url), limit=3):
        print('specific url:', doc)
    # get terms list
    for doc in corpus.get(url_match_func(url), limit=3):
        tlist = doc.to_terms_list(
            ngrams=1, named_entities=True, as_strings=True
        )
        terms = list(tlist)
        print(terms)
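# `published_match_func` and `url_match_func` are not defined in this fragment.
# Corpus.get() expects a callable that takes a doc and returns a bool, so they
# might look roughly like the sketch below; the metadata keys ('published', 'url')
# and the `doc.metadata` attribute are assumptions that may differ across versions.
def published_match_func(doc):
    # keep docs whose metadata records a publication date
    return doc.metadata.get('published') is not None

def url_match_func(url):
    # build a match function for one specific url
    def match_func(doc):
        return doc.metadata.get('url') == url
    return match_func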
"filter_stops", "filter_punct", "filter_nums",
"include_pos", "exclude_pos",
"min_freq",
}
ng_kwargs = {key: val for key, val in kwargs.items() if key in ng_kwargs}
for n in sorted(utils.to_collection(ngrams, int, set)):
# use a faster function for unigrams
if n == 1:
unigrams_ = extract.words(doc, **ng_kwargs)
else:
ngrams_.append(extract.ngrams(doc, n, **ng_kwargs))
ngrams_ = itertoolz.concat(ngrams_)
if entities is not None:
ent_kwargs = {"include_types", "exclude_types", "drop_determiners", "min_freq"}
ent_kwargs = {key: val for key, val in kwargs.items() if key in ent_kwargs}
entities_ = extract.entities(doc, **ent_kwargs)
if ngrams:
# use ngrams as-is
if entities is None:
terms = itertoolz.concatv(unigrams_, ngrams_)
# remove unigrams + ngrams that are duplicates of entities
else:
entities_ = tuple(entities_)
ent_idxs = {(ent.start, ent.end) for ent in entities_}
unigrams_ = (
ug
for ug in unigrams_
if (ug.i, ug.i + 1) not in ent_idxs
)
        ngrams_ = (
            ng
            for ng in ngrams_
            if (ng.start, ng.end) not in ent_idxs
        )
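# A standalone sketch of the dedup idea above: drop any ngram whose (start, end)
# token indices coincide with a named-entity span. The pipeline and example text
# are assumptions; only plain spaCy attributes (doc.ents, Span.start/.end) are used.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama visited Berlin in 2013.")
ent_idxs = {(ent.start, ent.end) for ent in doc.ents}
bigrams = [doc[i:i + 2] for i in range(len(doc) - 1)]
deduped = [ng for ng in bigrams if (ng.start, ng.end) not in ent_idxs]
print(deduped)  # the "Barack Obama" bigram is dropped; the others remain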
that match any pattern in ``patterns``

Args:
    doc (:class:`spacy.tokens.Doc`)
    patterns (str or List[str] or List[dict] or List[List[dict]]):
        One or multiple patterns to match against ``doc``
        using a :class:`spacy.matcher.Matcher`.

Yields:
    Tuple[:class:`spacy.tokens.Token`]: Next pattern-matching candidate,
    as a tuple of constituent Tokens.

See Also:
    :func:`textacy.extract.matches()`
"""
for match in extract.matches(doc, patterns, on_match=None):
    yield tuple(match)
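# The enclosing generator's name is not shown in this fragment; assuming a
# hypothetical wrapper `candidates_from_patterns(doc, patterns)` around it,
# usage would look roughly like this (the pattern and doc are illustrative):
patterns = [{"POS": "ADJ"}, {"POS": "NOUN"}]  # an adjective followed by a noun
for cand in candidates_from_patterns(doc, patterns):
    # each candidate is a tuple of Tokens, e.g. (monetary, fund)
    print(" ".join(tok.text for tok in cand))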