def test_records():
    for text, meta in DATASET.records(limit=3):
        assert isinstance(text, compat.unicode_)
        assert isinstance(meta, dict)


def test_texts():
    for text in DATASET.texts(limit=3):
        assert isinstance(text, compat.unicode_)
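# A rough illustration only: the tests above assume a DATASET object whose
# records(limit=...) method yields (text, metadata) pairs and whose
# texts(limit=...) method yields plain strings. The real dataset class isn't
# shown in this fragment, so the stand-in below is invented just to make that
# interface concrete.

class FakeDataset:
    def __init__(self, data):
        self.data = data  # list of (text, meta) tuples

    def records(self, limit=None):
        # yield (text, metadata) pairs, optionally capped at `limit`
        for text, meta in self.data[:limit]:
            yield text, meta

    def texts(self, limit=None):
        # yield only the text portion of each record
        for text, _ in self.data[:limit]:
            yield text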
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()
    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)
    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, compat.unicode_):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)
            )
        )
    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2
            for window in windows
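# The Counter above is cut off mid-expression, but the idea it implements —
# counting how often two terms co-occur inside a sliding window, then using the
# counts as edge weights — can be sketched on its own. This is a simplified
# illustration, not the library's exact code; the term list and window size are
# made up for the example.

import collections
import itertools

import networkx as nx
from cytoolz import itertoolz


def cooccurrence_graph(terms, window_size=2):
    """Build a graph whose edge weights count within-window co-occurrences."""
    windows = itertoolz.sliding_window(window_size, terms)
    cooc_counts = collections.Counter(
        pair
        for window in windows
        for pair in itertools.combinations(sorted(set(window)), 2)
    )
    graph = nx.Graph()
    graph.add_edges_from(
        (w1, w2, {"weight": count}) for (w1, w2), count in cooc_counts.items()
    )
    return graph


g = cooccurrence_graph(["natural", "language", "processing", "language"], window_size=2)
# g["language"]["natural"]["weight"] == 1; g["language"]["processing"]["weight"] == 2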
def get_filename_from_url(url):
    """
    Derive a filename from a URL's path.

    Args:
        url (str): URL from which to extract a filename.

    Returns:
        str: Filename in URL.
    """
    return os.path.basename(compat.urlparse(compat.url_unquote_plus(url)).path)
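# A quick usage sketch for the function above. Assuming compat.urlparse and
# compat.url_unquote_plus wrap the standard library's urllib.parse functions,
# the equivalent stand-alone logic looks like this:

import os
from urllib.parse import unquote_plus, urlparse


def filename_from_url(url):
    # unquote percent-escapes (and '+' as space), then take the last path segment
    return os.path.basename(urlparse(unquote_plus(url)).path)


print(filename_from_url("https://example.com/files/report%202019.pdf?dl=1"))
# -> "report 2019.pdf"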
    # validate / transform args
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )
    # bail out on empty docs
    if not doc:
        return []
    if position_bias is True:
        word_pos = collections.defaultdict(float)
        for word, norm_word in compat.zip_(doc, ke_utils.normalize_terms(doc, normalize)):
            word_pos[norm_word] += 1 / (word.i + 1)
        sum_word_pos = sum(word_pos.values())
        word_pos = {word: pos / sum_word_pos for word, pos in word_pos.items()}
    else:
        word_pos = None
    # build a graph from all words in doc, then score them
    graph = graph_base.build_graph_from_terms(
        [word for word in doc],
        normalize=normalize,
        window_size=window_size,
        edge_weighting=edge_weighting,
    )
    word_scores = graph_base.rank_nodes_by_pagerank(
        graph, weight="weight", personalization=word_pos)
    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
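# rank_nodes_by_pagerank() is not shown in this fragment, but the position-biased
# scoring it is asked to perform — PageRank with a `personalization` vector that
# favors words appearing early in the document — can be sketched directly with
# networkx. The graph and bias values below are toy data, not taken from the
# snippet above.

import networkx as nx

toy_graph = nx.Graph()
toy_graph.add_edge("speed", "light", weight=2.0)
toy_graph.add_edge("light", "vacuum", weight=1.0)

# bias ~ 1 / (1 + first position), normalized to sum to 1, as in the word_pos dict
bias = {"speed": 0.5, "light": 0.3, "vacuum": 0.2}

scores = nx.pagerank(toy_graph, weight="weight", personalization=bias)
# scores maps each node to a float in (0, 1); more heavily biased nodes tend to score higher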
    Compute weights for each word in terms of several components, then combine
    components into per-word scores.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        word_occ_vals (Dict[int, Dict[str, list]])
        word_freqs (Dict[int, int])
        stop_words (Set[str])

    Returns:
        Dict[int, float]
    """
    word_weights = collections.defaultdict(dict)
    # compute summary stats for word frequencies
    freqs_nsw = [freq for w_id, freq in word_freqs.items() if w_id not in stop_words]
    freq_max = max(word_freqs.values())
    freq_baseline = compat.mean_(freqs_nsw) + compat.stdev_(freqs_nsw)
    n_sents = itertoolz.count(doc.sents)
    for w_id, vals in word_occ_vals.items():
        freq = word_freqs[w_id]
        word_weights[w_id]["case"] = sum(vals["is_uc"]) / compat.log2_(1 + freq)
        word_weights[w_id]["pos"] = compat.log2_(compat.log2_(3 + compat.median_(vals["sent_idx"])))
        word_weights[w_id]["freq"] = freq / freq_baseline
        word_weights[w_id]["disp"] = len(set(vals["sent_idx"])) / n_sents
        n_unique_lc = len(set(vals["l_context"]))
        n_unique_rc = len(set(vals["r_context"]))
        try:
            wl = n_unique_lc / len(vals["l_context"])
        except ZeroDivisionError:
            wl = 0.0
        try:
            wr = n_unique_rc / len(vals["r_context"])
        except ZeroDivisionError:
            wr = 0.0
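# The per-word components above ("case", "pos", "freq", "disp", plus the left/right
# context-diversity ratios wl/wr) are easier to follow on concrete numbers. The toy
# values below are invented, and the standard-library math/statistics functions
# stand in for the compat.log2_/median_ helpers used in the snippet.

import math
import statistics

vals = {
    "is_uc": [True, False, False],     # word was uppercase in 1 of 3 occurrences
    "sent_idx": [0, 2, 2],             # sentence indices where it occurred
    "l_context": ["the", "a", "the"],  # words seen immediately to its left
    "r_context": ["limit", "limit"],   # words seen immediately to its right
}
freq = 3
freq_baseline = 2.5                    # mean + stdev of non-stopword frequencies
n_sents = 5

case = sum(vals["is_uc"]) / math.log2(1 + freq)                      # 0.5
pos = math.log2(math.log2(3 + statistics.median(vals["sent_idx"])))  # ~1.22
freq_component = freq / freq_baseline                                # 1.2
disp = len(set(vals["sent_idx"])) / n_sents                          # 0.4
wl = len(set(vals["l_context"])) / len(vals["l_context"])            # ~0.67
wr = len(set(vals["r_context"])) / len(vals["r_context"])            # 0.5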
        make_dirs (bool)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing
        delimiter (str): 1-character string used to separate fields in a row

    See Also:
        https://docs.python.org/3/library/csv.html#csv.writer

    Note:
        Here, CSV is used as a catch-all term for *any* delimited file
        format, and ``delimiter=','`` is merely the function's default value.
        Other common delimited formats are TSV (tab-separated-value, with
        ``delimiter='\\t'``) and PSV (pipe-separated-value, with ``delimiter='|'``).
    """
    with open_sesame(filepath, mode='wt', encoding=encoding, newline='') as f:
        csv_writer = compat.csv.writer(f, dialect=dialect, delimiter=delimiter)
        csv_writer.writerows(rows)
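# A usage sketch for the CSV-writing pattern above. The enclosing function's name
# and argument order aren't shown in this fragment, so the example uses the
# standard library directly: open a text file with newline='' and hand the rows
# to csv.writer().writerows(). The file name is made up.

import csv

rows = [
    ["title", "year"],
    ["Computing Machinery and Intelligence", 1950],
    ["A Mathematical Theory of Communication", 1948],
]

with open("papers.tsv", mode="wt", encoding="utf-8", newline="") as f:
    csv.writer(f, delimiter="\t").writerows(rows)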
            depending on how much RAM you have available.

    .. note:: Since chunking is done by character, chunk edges probably
       won't respect natural language segmentation, which means that every
       ``chunk_size`` characters, spaCy will probably get tripped up and
       make weird parsing errors.

    Returns:
        :class:`spacy.tokens.Doc`: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
        )
    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i : i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))
        i += chunk_size
    # now, initialize the doc from words and spaces
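# The fragment stops right before the doc is actually constructed. As a sketch of
# how the accumulated pieces could be stitched back together — assuming spaCy's
# Doc(vocab, words=..., spaces=...) constructor and Doc.from_array(), both of
# which exist in spaCy 2.x — the final step might look roughly like this
# (not the library's verbatim code):

import numpy as np
from spacy.tokens import Doc


def assemble_doc(vocab, words, spaces, cols, np_arrays):
    # build an un-annotated Doc from the tokens, then restore the per-token
    # attributes (POS, DEP, HEAD, ...) from the concatenated attribute arrays
    doc = Doc(vocab, words=words, spaces=spaces)
    doc.from_array(cols, np.concatenate(np_arrays, axis=0))
    return doc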
        filepath (str): /path/to/file on disk from which rows will be streamed
        encoding (str)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing; if 'infer', the
            first 1024 bytes of the file are analyzed, producing a best guess for
            the correct dialect
        delimiter (str): 1-character string used to separate fields in a row

    Yields:
        List[obj]: Next row, whose elements are strings and/or numbers.

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.reader
    """
    with open_sesame(filepath, mode='rt', encoding=encoding, newline='') as f:
        if dialect == 'infer':
            sniffer = compat.csv.Sniffer()
            # add pipes to the list of preferred delimiters, and put spaces last
            sniffer.preferred = [',', '\t', '|', ';', ':', ' ']
            dialect = sniffer.sniff(f.read(1024))
            f.seek(0)
        for row in compat.csv.reader(f, dialect=dialect, delimiter=delimiter):
            yield row
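# Usage sketch for the streaming reader above. The fragment's enclosing function
# name isn't shown, so this example reproduces the same dialect-sniffing pattern
# with the standard library's csv module on a hypothetical "papers.tsv" file.

import csv

with open("papers.tsv", mode="rt", encoding="utf-8", newline="") as f:
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(f.read(1024))  # guess delimiter/quoting from a sample
    f.seek(0)                              # rewind so the reader starts at row 0
    for row in csv.reader(f, dialect=dialect):
        print(row)                         # each row is a list of strings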