from textacy import utils  # provides to_collection


def test_to_collection():
    in_outs = [
        [(1, int, list), [1]],
        [([1, 2], int, tuple), (1, 2)],
        [((1, 1.0), (int, float), set), {1, 1.0}],
    ]
    assert utils.to_collection(None, int, list) is None
    for in_, out_ in in_outs:
        assert utils.to_collection(*in_) == out_
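Reading off the assertions above: a single value of ``val_type`` gets wrapped into the requested collection type, an existing collection gets cast into it, and None passes through untouched. A minimal sketch of a helper with that behavior (illustrative only; textacy's actual implementation may differ in details such as error messages):

# illustrative sketch of the coercion behavior exercised by the test above
def to_collection(val, val_type, col_type):
    """Cast a single value or a collection of values into the given collection type."""
    if val is None:
        return None
    if isinstance(val, val_type):
        return col_type([val])
    elif isinstance(val, (tuple, list, set, frozenset)):
        if not all(isinstance(v, val_type) for v in val):
            raise TypeError("not all values are of type {}".format(val_type))
        return col_type(val)
    else:
        raise TypeError("val must be {} or a collection thereof".format(val_type))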
def identify_topn_langs(self, text, topn=3):
    """
    Identify the ``topn`` most probable languages identified in ``text``.

    Args:
        text (str)
        topn (int)

    Returns:
        List[Tuple[str, float]]: 2-letter language code and its probability
        for the ``topn`` most probable languages.
    """
    text_ = utils.to_collection(text[:self.max_text_len], str, list)
    if self._is_valid(text_[0]):
        lang_probs = sorted(
            zip(self.pipeline.classes_, self.pipeline.predict_proba(text_).flat),
            key=operator.itemgetter(1),
            reverse=True,
        )[:topn]
        return [(lang.item(), prob.item()) for lang, prob in lang_probs]
    else:
        return [("un", 1.0)]
Args:
    doc (:class:`spacy.tokens.Doc`)
    ns (int or Tuple[int]): One or more n values for which to generate n-grams.
        For example, ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
    include_pos (str or Set[str]): One or more POS tags with which to filter ngrams.
        If None, include tokens of all POS tags.

Yields:
    Tuple[:class:`spacy.tokens.Token`]: Next ngram candidate,
    as a tuple of constituent Tokens.

See Also:
    :func:`textacy.extract.ngrams()`
"""
ns = t_utils.to_collection(ns, int, tuple)
include_pos = t_utils.to_collection(include_pos, compat.unicode_, set)
ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
ngrams = (
    ngram
    for ngram in ngrams
    if not (ngram[0].is_stop or ngram[-1].is_stop)
    and not any(word.is_punct or word.is_space for word in ngram)
)
if include_pos:
    ngrams = (
        ngram
        for ngram in ngrams
        if all(word.pos_ in include_pos for word in ngram)
    )
for ngram in ngrams:
    yield ngram
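The itertoolz.sliding_window / itertoolz.concat combination is what produces the raw n-gram stream before the stop-word, punctuation, and POS filters are applied. A standalone illustration on a plain list of strings standing in for spaCy tokens (textacy gets itertoolz from cytoolz, so that import is assumed here):

from cytoolz import itertoolz

words = ["machine", "learning", "is", "fun"]
ns = (2, 3)
ngrams = list(itertoolz.concat(itertoolz.sliding_window(n, words) for n in ns))
# -> [('machine', 'learning'), ('learning', 'is'), ('is', 'fun'),
#     ('machine', 'learning', 'is'), ('learning', 'is', 'fun')]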
        through synonym substitution.
    num (int or float): If int, maximum number of words with available synonyms
        to substitute with a randomly selected synonym; if float, probability
        that a given word with synonyms will be substituted.
    pos (str or Set[str]): Part of speech tag(s) of words to be considered
        for augmentation. If None, all words with synonyms are considered.

Returns:
    List[:obj:`AugTok`]: New, augmented sequence of tokens.

Note:
    This transform requires :class:`textacy.resources.ConceptNet` to be downloaded
    to work properly, since this is the data source for word synonyms to be substituted.
"""
_validate_aug_toks(aug_toks)
pos = utils.to_collection(pos, str, set)
cand_idxs = [
    idx for idx, aug_tok in enumerate(aug_toks)
    if aug_tok.syns and (pos is None or aug_tok.pos in pos)
]
rand_idxs = set(_select_random_candidates(cand_idxs, num))
if not rand_idxs:
    return aug_toks[:]
new_aug_toks = []
for idx, aug_tok in enumerate(aug_toks):
    if idx in rand_idxs:
        new_aug_toks.append(
            aug_utils.AugTok(
                text=random.choice(aug_tok.syns),
                ws=aug_tok.ws,
                pos=aug_tok.pos,
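The truncated loop above rebuilds the token sequence, swapping in a random synonym at each selected index while preserving whitespace and POS. A self-contained sketch of that pattern using a simplified stand-in for AugTok (field set and helper name here are illustrative, not textacy's exact ones):

import collections
import random

SimpleTok = collections.namedtuple("SimpleTok", ["text", "ws", "pos", "syns"])

def substitute_synonyms(toks, idxs_to_swap):
    """Return a new token list with a random synonym swapped in at each chosen index."""
    new_toks = []
    for idx, tok in enumerate(toks):
        if idx in idxs_to_swap and tok.syns:
            new_toks.append(tok._replace(text=random.choice(tok.syns)))
        else:
            new_toks.append(tok)
    return new_toks

toks = [
    SimpleTok("quick", " ", "ADJ", ["fast", "speedy"]),
    SimpleTok("fox", " ", "NOUN", []),
]
print(substitute_synonyms(toks, {0}))  # e.g. [SimpleTok(text='fast', ...), SimpleTok(text='fox', ...)]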
Randomly delete words,
up to ``num`` times or with a probability of ``num``.

Args:
    aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
        through word deletion.
    num (int or float): If int, maximum number of words to delete;
        if float, probability that a given word will be deleted.
    pos (str or Set[str]): Part of speech tag(s) of words to be considered
        for augmentation. If None, all words are considered.

Returns:
    List[:class:`AugTok`]: New, augmented sequence of tokens.
"""
_validate_aug_toks(aug_toks)
pos = utils.to_collection(pos, str, set)
# bail out on very short sentences to avoid clobbering meaning
if len(aug_toks) < 3:
    return aug_toks[:]
cand_idxs = [
    idx for idx, aug_tok in enumerate(aug_toks)
    if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
]
rand_idxs = set(_select_random_candidates(cand_idxs, num))
if not rand_idxs:
    return aug_toks[:]
new_aug_toks = []
padded_triplets = itertoolz.sliding_window(3, [None] + aug_toks + [None])
for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
    if idx in rand_idxs:
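Both augmentation transforms above delegate the int-vs-float handling of ``num`` to ``_select_random_candidates``. A plausible sketch of a helper with those semantics (an assumption about the implementation, kept deliberately simple):

import random

def select_random_candidates(cand_idxs, num):
    """Pick up to ``num`` indexes if ``num`` is an int, or each index with probability ``num`` if a float."""
    if isinstance(num, int) and num >= 0:
        return random.sample(cand_idxs, min(num, len(cand_idxs)))
    elif isinstance(num, float) and 0.0 <= num <= 1.0:
        return [idx for idx in cand_idxs if random.random() < num]
    else:
        raise ValueError("num={} is invalid; must be an int >= 0 or a float in [0.0, 1.0]".format(num))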
topn (int or float): Number of top-ranked terms to return as key terms.
    If an integer, represents the absolute number; if a float, value
    must be in the interval (0.0, 1.0], which is converted to an int by
    ``int(round(len(candidates) * topn))``

Returns:
    List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
    their corresponding scores.

References:
    Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
    Aware Keyword Extraction. Information Sciences. 477.
    https://arxiv.org/abs/1811.10831v1
"""
# validate / transform args
include_pos = utils.to_collection(include_pos, compat.unicode_, set)
if isinstance(topn, float):
    if not 0.0 < topn <= 1.0:
        raise ValueError(
            "topn={} is invalid; "
            "must be an int, or a float between 0.0 and 1.0".format(topn)
        )
# bail out on empty docs
if not doc:
    return []
# build up a graph of good words, edges weighted by adjacent sentence co-occurrence
cooc_mat = collections.Counter()
n_sents = itertoolz.count(doc.sents)  # in case doc only has 1 sentence
for sent1, sent2 in itertoolz.sliding_window(min(2, n_sents), doc.sents):
    window_words = (
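The loop above slides a two-sentence window over the document and counts how often filtered words co-occur within each window; those counts become edge weights in the word graph. A rough, self-contained illustration on plain word lists instead of spaCy sentences (a sketch of the idea, not the library's exact filtering logic):

import collections
import itertools

from cytoolz import itertoolz

sents = [["cats", "sleep"], ["dogs", "sleep"], ["dogs", "bark"]]
cooc_mat = collections.Counter()
for sent1, sent2 in itertoolz.sliding_window(2, sents):
    window_words = set(sent1) | set(sent2)
    cooc_mat.update(
        tuple(sorted(pair)) for pair in itertools.combinations(window_words, 2)
    )
print(cooc_mat[("dogs", "sleep")])  # -> 2: "dogs" and "sleep" co-occur in both windows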
Returns:
    List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
    their corresponding SGRank scores

Raises:
    ValueError: if ``topn`` is a float but not in (0.0, 1.0] or
    ``window_size`` < 2

References:
    Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical
    Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction."
    Lexical and Computational Semantics (*SEM 2015) (2015): 117.
"""
# validate / transform args
ngrams = utils.to_collection(ngrams, int, tuple)
include_pos = utils.to_collection(include_pos, compat.unicode_, set)
if window_size < 2:
    raise ValueError("`window_size` must be >= 2")
if isinstance(topn, float):
    if not 0.0 < topn <= 1.0:
        raise ValueError(
            "`topn` must be an int, or a float between 0.0 and 1.0"
        )
n_toks = len(doc)
window_size = min(n_toks, window_size)
# bail out on (nearly) empty docs
if n_toks < 2:
    return []
candidates, candidate_counts = _get_candidates(doc, normalize, ngrams, include_pos)
# scale float topn based on total number of initial candidates
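A usage sketch for this keyterm extractor, assuming it is exposed as textacy.ke.sgrank with these keyword arguments (import path and argument names are inferred from the snippet and may differ across textacy versions):

# hedged usage sketch; requires a spaCy model to be installed, e.g. en_core_web_sm
import spacy
import textacy.ke

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Natural language processing lets computers analyze and extract key terms "
    "from large volumes of unstructured text."
)
print(textacy.ke.sgrank(doc, ngrams=(1, 2), window_size=50, topn=5))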
        are included. Both low and high values must be specified, but a null value
        for either is automatically replaced by the minimum or maximum
        valid values, respectively.
    min_len (int): Filter movie reviews by the length (number of characters)
        of their text content.
    limit (int): Yield no more than ``limit`` movie reviews that match all
        specified filters.

Yields:
    str: Text of the next movie review in dataset passing all filters.
    dict: Metadata of the next movie review in dataset passing all filters.

Raises:
    ValueError: If any filtering options are invalid.
"""
self._subset = utils.to_collection(subset, (str, bytes), tuple)
self._label = utils.to_collection(label, (str, bytes), tuple)
try:
    filters = self._get_filters(rating_range, min_len)
    for record in itertools.islice(self._filtered_iter(filters), limit):
        yield record.pop("text"), record
finally:
    self._subset = None
    self._label = None
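A usage sketch for this record iterator, assuming it is the records() method of the IMDB dataset in textacy.datasets (class and argument names are inferred from the snippet and may differ by version):

# hedged usage sketch; downloads the IMDB movie-review dataset on first use
import textacy.datasets

ds = textacy.datasets.IMDB()
ds.download()  # no-op if the data is already on disk
for text, meta in ds.records(subset="train", label="pos", min_len=200, limit=3):
    print(meta, text[:75], sep="\n")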