from goose3 import Goose

def crawl_url(url):
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class': 'soup'})
    # Fetch the raw response ourselves so the status code and headers are kept
    r = g.fetcher.fetch_obj(url)
    html = r.content.decode('utf-8').strip()
    # Make src urls absolute
    html = abs_src(html, r.url)
    page = g.extract(raw_html=html)
    infos = page.infos
    infos['status'] = r.status_code
    infos['headers'] = r.headers
    infos['link_hash'] = page.link_hash
    infos['final_url'] = r.url  # URL after redirects
    infos['domain'] = get_hostname(r.url)
    infos['links'] = LinksExtractor(g.config, page).extract(url)
    infos['meta']['robots'] = RobotsExtractor(g.config, page).extract()
    infos['content'] = ' '.join(page.cleaned_text.split())
    return infos
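Most of that metadata is also reachable through the Article properties goose3 exposes directly, without the project helpers (cfg, abs_src, get_hostname, the extractor classes). A minimal sketch, with a placeholder URL:

from goose3 import Goose

with Goose({'parser_class': 'soup'}) as g:
    article = g.extract(url='https://example.com/some-article')
    infos = article.infos                   # dict of extracted metadata
    infos['final_url'] = article.final_url  # URL after redirects
    infos['link_hash'] = article.link_hash
    infos['links'] = article.links          # links found in the article body
    infos['content'] = ' '.join(article.cleaned_text.split())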
def summary(url):
    g = Goose()
    article = g.extract(url)
    title = article.title
    publish_date = "None"
    headlines = []
    if title is None:
        title = url
    # top_image.src raises when no top image was found, so fall back gracefully
    try:
        image = article.top_image.src
    except Exception:
        if len(article.images) > 0:
            image = article.images[0]
        else:
            image = "http://www.sahalnews.com/wp-content/uploads/2014/12/news-update-.jpg"
    # summarize() and n_bullets are defined elsewhere in the project
    for bullets in summarize(url, title, article.cleaned_text, n_bullets):
        headlines.append(bullets)
    if len(headlines) == 0:
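The same image fallback can be written as an explicit guard instead of a try/except, since top_image is an image object (with .src) or None. A minimal sketch; pick_image and DEFAULT_IMAGE are hypothetical names, and article.images is assumed to behave as in the snippet above:

DEFAULT_IMAGE = "https://example.com/placeholder.jpg"  # hypothetical fallback

def pick_image(article):
    # goose3 sets top_image to an image object or None; guard before reading .src
    if article.top_image is not None:
        return article.top_image.src
    # fall back to the first extracted image, as in the snippet above
    if len(article.images) > 0:
        return article.images[0]
    return DEFAULT_IMAGE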
# Tail of a NewsPlease-first extractor that falls back to goose3
    else:
        raise TextUnavailable("No text is available from NewsPlease for analysis. Try Goose3.")
    # Pick the date: prefer date_publish, then date_modify, then date_download
    if artigo.date_publish is not None:
        data = str(artigo.date_publish)
    elif artigo.date_modify is not None and artigo.date_modify != "None":
        data = str(artigo.date_modify)
    else:
        data = str(artigo.date_download)
    objeto = ArticleObject(fixcharset(artigo.title), url, None, data, artigo.authors,
                           artigo.source_domain, text)
    return objeto
except Exception:
    from goose3 import Goose
    g = Goose(
        {'strict': False, 'use_meta_language': True,
         'target_language': Config().values()['language'].replace("-", "_"),
         'parser_class': 'lxml', 'enable_image_fetching': False, 'http_timeout': 1})
    artigo = g.extract(url=url)
    if artigo.cleaned_text:
        text = fixcharset(artigo.cleaned_text)
    elif artigo.meta_description:
        text = fixcharset(artigo.meta_description)
    else:
        raise TextUnavailable("There is not enough text for analysis.")
    objeto = ArticleObject(fixcharset(artigo.title), url, None,
                           artigo.publish_date, artigo.authors, artigo.domain, text)
    return objeto
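Stripped of the project wrappers (fixcharset, ArticleObject, Config), the goose3 half of that fallback reduces to a few lines. A minimal sketch with a hypothetical extract_text helper and a more forgiving timeout:

from goose3 import Goose

def extract_text(url):
    with Goose({'enable_image_fetching': False, 'http_timeout': 10}) as g:
        article = g.extract(url=url)
    # Prefer the full article body; fall back to the meta description
    if article.cleaned_text:
        return article.cleaned_text
    if article.meta_description:
        return article.meta_description
    raise ValueError("No usable text found at " + url)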
def _extract_content(self, html):
    # Parse HTML that was fetched elsewhere; skip image fetching for speed
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
def __init__(self):
    # Metadata fields filled in during extraction
    self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None
    # Reusable goose3 instance with image fetching disabled
    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)
    self.tree = None
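Goose accepts the same options either as a plain dict or through a Configuration object, so the setup above is interchangeable with the one-liner form. A minimal sketch (the URL is a placeholder):

from goose3 import Goose, Configuration

# Dict form: options passed straight to the constructor
g1 = Goose({'enable_image_fetching': False})

# Configuration form: build the object first, then hand it over
config = Configuration()
config.enable_image_fetching = False
g2 = Goose(config=config)

article = g2.extract(url='https://example.com/some-article')
print(article.title)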