# imports for the textacy modules exercised below
from textacy import compat, preprocessing, text_utils


def test_plaintext_functionality(text):
    preprocessed_text = preprocessing.normalize_whitespace(text)
    # chain punctuation removal onto the already-normalized text, not the raw input
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(
        text, keyword, window_width=35, print_only=False
    )
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
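
# For reference, a tiny hedged illustration of the keyword-in-context call
# exercised above: with print_only=False it yields (pre, keyword, post)
# windows around each match (exact return type may vary by textacy version).
sample = "In America, the idea of America is itself contested."
for pre, kw, post in text_utils.keyword_in_context(
    sample, "America", window_width=35, print_only=False
):
    print("{!r} | {!r} | {!r}".format(pre, kw, post))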
def test_lang(self, doc):
    lang = doc._.lang
    assert isinstance(lang, compat.unicode_)
    assert lang == doc.vocab.lang
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
        their corresponding scores.

    References:
        Campos, Mangaravite, Pasquali, Jorge, Nunes, and Jatowt. (2018).
        A Text Feature Based Automatic Keyword Extraction Method for Single Documents.
        Advances in Information Retrieval. ECIR 2018.
        Lecture Notes in Computer Science, vol 10772, pp. 684-691.
    """
    # validate / transform args
    ngrams = utils.to_collection(ngrams, int, tuple)
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    stop_words = set()
    seen_candidates = set()
    # compute key values on a per-word basis
    word_occ_vals = _get_per_word_occurrence_values(doc, normalize, stop_words, window_size)
    # doc doesn't have any words...
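
# Hedged usage sketch for the YAKE keyterm extractor documented above. The
# entry point and defaults are assumed to be textacy's ``textacy.ke.yake``
# (textacy ~0.9); adjust the import if your version exposes it elsewhere.
import spacy
import textacy.ke

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Keyword extraction automatically identifies the terms that best "
    "describe the subject of a document."
)
# topn as an int takes that many terms; a float in (0.0, 1.0] takes that
# fraction of the candidates via int(round(len(candidates) * topn)).
top_terms = textacy.ke.yake(doc, ngrams=(1, 2), topn=5)
print(top_terms)  # list of (term, score) pairs; actual values depend on the text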
def _parse_content(self, content, parser):
    wikicode = parser.parse(content)
    parsed_page = {'sections': []}

    wikilinks = [unicode_(wc.title) for wc in wikicode.ifilter_wikilinks()]
    parsed_page['categories'] = [wc for wc in wikilinks if wc.startswith('Category:')]
    parsed_page['wiki_links'] = [wc for wc in wikilinks
                                 if not wc.startswith('Category:') and
                                 not wc.startswith('File:') and
                                 not wc.startswith('Image:')]
    parsed_page['ext_links'] = [
        unicode_(wc.url) for wc in wikicode.ifilter_external_links()]

    def _filter_tags(obj):
        return obj.tag == 'ref' or obj.tag == 'table'

    bad_section_titles = {'external links', 'notes', 'references'}
    section_idx = 0
    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
        headings = section.filter_headings()
        sec = {'idx': section_idx}
        # lead section, or a normal section with a single heading
        if section_idx == 0 or len(headings) == 1:
            try:
                sec_title = unicode_(headings[0].title)
                if sec_title.lower() in bad_section_titles:
                    continue
                sec['title'] = sec_title
                sec['level'] = int(headings[0].level)
            except IndexError:
                if section_idx == 0:
                    sec['level'] = 1
            # strip out references, tables, and file/image links
            for obj in section.ifilter_tags(matches=_filter_tags, recursive=True):
                try:
                    section.remove(obj)
                except Exception:
                    continue
            for obj in section.ifilter_wikilinks(recursive=True):
                try:
                    obj_title = unicode_(obj.title)
                    if obj_title.startswith('File:') or obj_title.startswith('Image:'):
                        section.remove(obj)
                except Exception:
                    pass
            sec['text'] = unicode_(section.strip_code(normalize=True, collapse=True)).strip()
            if sec.get('title'):
                sec['text'] = re.sub(r'^' + re.escape(sec['title']) + r'\s*', '', sec['text'])
            parsed_page['sections'].append(sec)
            section_idx += 1

        # dammit! the parser has failed us; let's handle it as best we can
        elif len(headings) > 1:
            titles = [unicode_(h.title).strip() for h in headings]
            levels = [int(h.level) for h in headings]
            sub_sections = [
                unicode_(ss) for ss in
                re.split(r'\s*' + '|'.join(re.escape(unicode_(h)) for h in headings) + r'\s*', unicode_(section))]
            # re.split leaves an empty string result up front :shrug:
            if sub_sections[0] == '':
                del sub_sections[0]
            if len(headings) != len(sub_sections):
                LOGGER.warning(
                    '# headings = %s, but # sections = %s',
                    len(headings), len(sub_sections))
            for i, sub_section in enumerate(sub_sections):
                try:
                    if titles[i].lower() in bad_section_titles:
                        continue
                    parsed_page['sections'].append({
                        'title': titles[i], 'level': levels[i], 'idx': section_idx,
                        'text': strip_markup(sub_section)})
                    section_idx += 1
                except IndexError:
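
# For orientation, a small, self-contained sketch of how the mwparserfromhell
# calls used above behave on a toy page. The wikitext below is made up for
# illustration; output formatting is approximate.
import mwparserfromhell

toy_wikitext = (
    "Lead paragraph with a [[Category:Example]] link.\n"
    "== History ==\n"
    "Some history, with an [[internal link]] and a <ref>footnote</ref>.\n"
    "== References ==\n"
    "* ignored\n"
)
toy_code = mwparserfromhell.parse(toy_wikitext)
for toy_section in toy_code.get_sections(flat=True, include_lead=True, include_headings=True):
    toy_headings = toy_section.filter_headings()
    title = str(toy_headings[0].title).strip() if toy_headings else "<lead>"
    # strip_code() drops the wiki markup, keeping the readable text
    print(title, "->", toy_section.strip_code(normalize=True, collapse=True).strip()[:60])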
            an already-instantiated spaCy language pipeline.
        chunk_size (int): Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). For best
            performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunks' edges probably
               won't respect natural language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        :class:`spacy.tokens.Doc`: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
        )
    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i : i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
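
# Hedged usage sketch for the chunked-document helper documented above. The
# import path (textacy.spacier.utils.make_doc_from_text_chunks) is an
# assumption based on this excerpt; adjust it to match your textacy version.
import spacy
from textacy.spacier.utils import make_doc_from_text_chunks

nlp = spacy.load("en_core_web_sm")
very_long_text = "This is one short sentence. " * 100000  # stand-in for a huge document

# Process the text in ~100k-character chunks, then stitch the per-chunk tokens
# and annotations back into a single spacy.tokens.Doc.
big_doc = make_doc_from_text_chunks(text=very_long_text, lang=nlp, chunk_size=100000)
print(len(big_doc))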
def slow_parse(include_headings, parser, record):
    wikicode = parser.parse(record["text"])
    parsed_record = record.copy()

    cat_link = MAPPING_CAT[self.lang]
    # catch category links errantly marked up in lowercase
    lc_cat_link = cat_link.lower()
    wikilinks = [
        compat.unicode_(wc.title)
        for wc in wikicode.ifilter_wikilinks()
    ]
    categories = [
        wc
        for wc in wikilinks
        if wc.startswith(cat_link) or wc.startswith(lc_cat_link)
    ]
    parsed_record["categories"] = categories
    parsed_record["wiki_links"] = [
        wc
        for wc in wikilinks
        if wc not in categories
        and not wc.startswith("File:")
        and not wc.startswith("Image:")
    ]
    parsed_record["ext_links"] = [
        compat.unicode_(wc.url) for wc in wikicode.ifilter_external_links()
    ]
    parsed_record["text"] = strip_markup_slow(
        wikicode, include_headings, parser,
    )
    return parsed_record