import justext

def guess_language(html):
    # Score every juSText stoplist by how many of its stopwords occur in the
    # page, then return the language with the most hits.
    hits = dict()
    htmlset = set(str(html).split(" "))
    for lang in justext.get_stoplists():
        hits[lang] = len(set(justext.get_stoplist(lang)).intersection(htmlset))
    return max(hits, key=hits.get)
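# Usage sketch for guess_language; the URL and the use of requests are
# assumptions for illustration, not part of the original snippet.
import requests

page_html = requests.get("https://example.com/article").text
print(guess_language(page_html))  # e.g. "English"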
# logger.debug("Skipping %s", article['uuid'])
return {'article': processed, 'from_store': True}
article['title'] = item.get('title', None)
href = item.get('link', None)
article['href'] = strip_google(href)
article['source'] = derive_source(article['href'])
article['collected'] = now_time()
article['published'] = item.get('published', None)
article['summary'] = item.get('summary', None)
page_content = get_page_content(article['href'])
if not page_content:
logger.debug("No content found: %s" % article['href'])
return {'article': None, 'from_store': True}
paragraphs = justext.justext(page_content,
justext.get_stoplist("English"),
no_headings=True,
max_heading_distance=150,
length_high=140,
max_link_density=0.4,
stopwords_low=0.2,
stopwords_high=0.3)
text_content = list()
for paragraph in paragraphs:
if paragraph.is_boilerplate:
continue
text_content.append(paragraph.text)
text_content = '\n'.join(text_content)
tokens = get_tokens(text_content)
article['word_count'] = len(tokens)
article['read_time'] = round(float(article['word_count'])/250, 2)
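# The snippet above leaves get_tokens undefined. A minimal sketch under the
# assumption that a token is any run of word characters (the original
# tokenizer may differ):
import re

def get_tokens(text):
    # Split on word characters; punctuation and whitespace are discarded.
    return re.findall(r"\w+", text)

# Example: a 1,000-word article yields read_time = 1000 / 250 = 4.0 minutes.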
# Reconstructed def line: the snippet began mid-docstring, and the function
# name is an assumption based on what the body does.
def remove_boilerplate(page_str, lang, relaxed=False):
    """Strip boilerplate from an HTML page with juSText.

    NOTE: quality is dependent on correct language detection.

    :param page_str: str HTML page source.
    :param lang: str Google Translate language code.
    :param relaxed: boolean If True, the span between the first and last
        good/near-good paragraph is returned; short and bad segments in
        between are kept.
    :return: list List of non-boilerplate segments/paragraphs.
    """
    if lang not in GTRANS_JUSTEXT_LANG_MAP:
        # raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang)
        return []
    jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang]
    paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang))
    if relaxed:
        good_indexes = [i for i, p in enumerate(paragraphs)
                        if p.class_type in ('near-good', 'good')]
        if len(good_indexes) == 0:
            return []
        return [paragraph.text
                for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]]
    else:
        return [paragraph.text for paragraph in paragraphs
                if paragraph.class_type in ('near-good', 'good', 'short')]
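# Usage sketch. GTRANS_JUSTEXT_LANG_MAP is assumed to map Google Translate
# codes to juSText stoplist names; the three entries below are illustrative,
# not the original table.
GTRANS_JUSTEXT_LANG_MAP = {"en": "English", "de": "German", "cs": "Czech"}

# In relaxed mode everything between the first and last good/near-good
# paragraph is kept, including short and bad segments in between.
segments = remove_boilerplate(page_html, lang="en", relaxed=True)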
def extract(self):
    # Method of an extractor class (elided in the snippet); data_instance
    # holds the fetched page and its declared encoding.
    html = self.data_instance.get_raw_html()
    html = html.encode(self.data_instance.raw_encoding, 'ignore')
    paragraphs = justext.justext(html, justext.get_stoplist('English'),
                                 encoding=self.data_instance.raw_encoding)
    good_paragraphs = []
    for para in paragraphs:
        # justext 2.x yields Paragraph objects; the old dict-style
        # para['class'] / para['text'] access no longer works.
        if para.class_type == 'good':
            paragraph_text = para.text
            # Paragraph.text is always str in Python 3, so the old
            # str/unicode branching is unnecessary.
            assert isinstance(paragraph_text, str)
            good_paragraphs.append(paragraph_text)
    return '\n\n'.join(good_paragraphs)
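# extract() only needs an object exposing get_raw_html() and raw_encoding.
# A minimal stand-in for experimentation; the class and attribute names here
# are assumptions, not the original data_instance implementation:
class SimpleDataInstance:
    raw_encoding = 'utf-8'

    def __init__(self, raw_html):
        self.raw_html = raw_html

    def get_raw_html(self):
        return self.raw_html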