def test_pos(self, spacy_doc):
    result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
    result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
    assert all(tok.pos_ == "NOUN" for tok in result1)
    assert all(tok.pos_ == "NOUN" for tok in result2)
    result3 = list(extract.words(spacy_doc, exclude_pos={"NOUN"}))
    result4 = list(extract.words(spacy_doc, exclude_pos="NOUN"))
    assert not any(tok.pos_ == "NOUN" for tok in result3)
    assert not any(tok.pos_ == "NOUN" for tok in result4)
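For context, a minimal usage sketch of the include_pos / exclude_pos filters this test exercises; the en_core_web_sm pipeline and the example sentence are assumptions, not part of the test suite.

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # assumed model; any pipeline with a POS tagger works
doc = nlp("The quick brown fox jumps over the lazy dog.")
nouns = [tok.text for tok in extract.words(doc, include_pos={"NOUN"})]      # e.g. ["fox", "dog"]
not_nouns = [tok.text for tok in extract.words(doc, exclude_pos={"NOUN"})]  # everything not tagged NOUN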
def test_min_freq(self, spacy_doc):
    counts = collections.Counter()
    counts.update(tok.lower_ for tok in spacy_doc)
    result = list(extract.words(spacy_doc, min_freq=2))
    assert all(counts[tok.lower_] >= 2 for tok in result)
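And a similarly hedged sketch of the min_freq behavior checked here: only tokens whose lowercased form occurs at least min_freq times in the Doc are yielded. The sample text is invented for illustration.

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # assumed model
doc = nlp("Better data usually beats better models, and better features beat both.")
repeated = [tok.text for tok in extract.words(doc, min_freq=2)]
# expected to keep only the tokens lowercasing to "better", which appears three times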
if not (normalize in ("lemma", "lower") or callable(normalize) or normalize is None):
    raise ValueError(
        "normalize = {} is invalid; value must be one of {}".format(
            normalize, {"lemma", "lower", types.FunctionType, None},
        )
    )
if ngrams:
    unigrams_ = []
    ngrams_ = []
    ng_kwargs = {
        "filter_stops", "filter_punct", "filter_nums",
        "include_pos", "exclude_pos",
        "min_freq",
    }
    ng_kwargs = {key: val for key, val in kwargs.items() if key in ng_kwargs}
    for n in sorted(utils.to_collection(ngrams, int, set)):
        # use a faster function for unigrams
        if n == 1:
            unigrams_ = extract.words(doc, **ng_kwargs)
        else:
            ngrams_.append(extract.ngrams(doc, n, **ng_kwargs))
    ngrams_ = itertoolz.concat(ngrams_)
if entities is not None:
    ent_kwargs = {"include_types", "exclude_types", "drop_determiners", "min_freq"}
    ent_kwargs = {key: val for key, val in kwargs.items() if key in ent_kwargs}
    entities_ = extract.entities(doc, **ent_kwargs)
if ngrams:
    # use ngrams as-is
    if entities is None:
        terms = itertoolz.concatv(unigrams_, ngrams_)
    # remove unigrams + ngrams that are duplicates of entities
    else:
        entities_ = tuple(entities_)
        ent_idxs = {(ent.start, ent.end) for ent in entities_}
        unigrams_ = (
            ug for ug in unigrams_ if (ug.i, ug.i + 1) not in ent_idxs
        )
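A self-contained sketch of the same merge-and-dedup idea built on textacy's public extract functions; the example doc, the choice of bigrams, and the exact-span dedup rule mirror the fragment above but are assumptions about how the truncated code concludes.

import spacy
from cytoolz import itertoolz
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # assumed model with tagger + NER
doc = nlp("Apple designs consumer electronics in Cupertino, California.")

unigrams = list(extract.words(doc))
bigrams = list(extract.ngrams(doc, 2))
entities = tuple(extract.entities(doc, drop_determiners=True))

# drop any unigram/bigram covering exactly the same token span as a named entity
ent_idxs = {(ent.start, ent.end) for ent in entities}
unigrams = [ug for ug in unigrams if (ug.i, ug.i + 1) not in ent_idxs]
bigrams = [bg for bg in bigrams if (bg.start, bg.end) not in ent_idxs]

terms = list(itertoolz.concatv(unigrams, bigrams, entities))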
def __init__(self, doc):
    self.lang = doc.vocab.lang
    self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
    # get objs for basic count computations
    hyphenator = cache.load_hyphenator(lang=self.lang)
    words = tuple(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
    )
    syllables_per_word = tuple(
        len(hyphenator.positions(word.lower_)) + 1 for word in words
    )
    chars_per_word = tuple(len(word) for word in words)
    # compute basic counts needed for most readability stats
    self.n_words = len(words)
    self.n_unique_words = len({word.lower for word in words})
    self.n_chars = sum(chars_per_word)
    self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
    self.n_syllables = sum(syllables_per_word)
    self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
    self.n_polysyllable_words = sum(1 for spw in syllables_per_word if spw >= 3)
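These attributes are exactly the inputs that standard readability formulas need. As a worked example, Flesch-Kincaid grade level can be computed straight from the counts above; the helper below is a sketch, not part of the class, using the standard published coefficients.

def flesch_kincaid_grade(n_syllables, n_words, n_sents):
    # 0.39 * words-per-sentence + 11.8 * syllables-per-word - 15.59
    return 0.39 * (n_words / n_sents) + 11.8 * (n_syllables / n_words) - 15.59

# e.g. 100 words in 5 sentences with 130 syllables:
# 0.39 * 20 + 11.8 * 1.3 - 15.59 = 7.8 + 15.34 - 15.59 ≈ 7.55 (between 7th- and 8th-grade level)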
where larger values correspond to more similar documents.

References:
    - Ofir Pele and Michael Werman, "A linear time histogram metric for improved
      SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
    - Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
      in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
    - Kusner, Matt J., et al., "From word embeddings to document distances,"
      Proceedings of the 32nd International Conference on Machine Learning
      (ICML 2015), 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
"""
word_idxs = dict()
n = 0
word_vecs = []
for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
    if word.has_vector and word_idxs.setdefault(word.orth, n) == n:
        word_vecs.append(word.vector)
        n += 1
distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(
    np.double
)
distance_mat /= distance_mat.max()
vec1 = collections.Counter(
    word_idxs[word.orth] for word in extract.words(doc1) if word.has_vector
)
vec1 = np.array(
    [vec1[word_idx] for word_idx in range(len(word_idxs))]
).astype(np.double)
vec1 /= vec1.sum()  # normalize word counts
vec2 = collections.Counter(
    word_idxs[word.orth] for word in extract.words(doc2) if word.has_vector
)
vec2 = np.array(
    [vec2[word_idx] for word_idx in range(len(word_idxs))]
).astype(np.double)
vec2 /= vec2.sum()  # normalize word counts
return 1.0 - emd(vec1, vec2, distance_mat)
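A hedged usage sketch for this word mover's similarity. It assumes the fragment is exposed as textacy.similarity.word_movers (as in older textacy releases) and that the loaded pipeline ships word vectors; both are assumptions not shown in the fragment.

import spacy
import textacy.similarity

nlp = spacy.load("en_core_web_md")  # assumed: a model with word vectors, needed for has_vector
doc1 = nlp("The cat sat on the mat.")
doc2 = nlp("A kitten rested on the rug.")

# the fragment returns 1.0 - EMD over normalized word-count histograms,
# so the score lies in [0, 1], with larger values meaning more similar documents
score = textacy.similarity.word_movers(doc1, doc2, metric="cosine")
print(round(score, 3))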
Args:
    spacy_docs (iterable(``spacy.Doc``))
    lemmatize (bool, optional)
    filter_stops (bool, optional)
    filter_punct (bool, optional)
    filter_nums (bool, optional)
    good_pos_tags (set(str) or 'numeric', optional)
    bad_pos_tags (set(str) or 'numeric', optional)

Yields:
    list(str)
"""
for spacy_doc in spacy_docs:
    if lemmatize is True:
        yield [word.lemma_ for word in
               extract.words(spacy_doc,
                             filter_stops=filter_stops, filter_punct=filter_punct,
                             filter_nums=filter_nums,
                             good_pos_tags=good_pos_tags, bad_pos_tags=bad_pos_tags)]
    else:
        yield [word.orth_ for word in
               extract.words(spacy_doc,
                             filter_stops=filter_stops, filter_punct=filter_punct,
                             filter_nums=filter_nums,
                             good_pos_tags=good_pos_tags, bad_pos_tags=bad_pos_tags)]
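A self-contained sketch of the same pattern for producing one term list per document. The wrapper name terms_per_doc is hypothetical, and it swaps the older good_pos_tags / bad_pos_tags keywords seen above for the include_pos filter used by current extract.words.

import spacy
from textacy import extract

def terms_per_doc(spacy_docs, lemmatize=True, include_pos=None):
    # hypothetical wrapper mirroring the loop above
    for spacy_doc in spacy_docs:
        attr = "lemma_" if lemmatize else "orth_"
        yield [
            getattr(word, attr)
            for word in extract.words(spacy_doc, filter_nums=True, include_pos=include_pos)
        ]

nlp = spacy.load("en_core_web_sm")  # assumed model
docs = nlp.pipe(["Dogs love running in parks.", "The park was full of running dogs."])
for terms in terms_per_doc(docs, include_pos={"NOUN", "VERB"}):
    print(terms)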