How to use the goose3.Goose class in goose3

To help you get started, we’ve selected a few goose3 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jroakes / tech-seo-crawler / lib / crawler.py View on Github external
def crawl_url(url):
    """Fetch ``url`` with Goose and collect page details in the article's ``infos`` dict.

    The page is fetched through the Goose fetcher, its body is decoded as
    UTF-8, ``src`` URLs are made absolute with ``abs_src``, and the resulting
    HTML is passed to ``Goose.extract`` as ``raw_html``. The extracted
    ``page.infos`` dict is then extended with the response status code,
    response headers, link hash, final URL, hostname, the results of
    ``LinksExtractor`` and ``RobotsExtractor``, and the cleaned text with
    runs of whitespace collapsed to single spaces.

    NOTE(review): this excerpt ends without returning ``infos``; the full
    function presumably continues -- confirm against the original project.
    """
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class':'soup'})
    r = g.fetcher.fetch_obj(url)
    html = r.content.decode('utf-8').strip()
    # Make src urls absolute
    html = abs_src(html, r.url)
    page = g.extract(raw_html=html)
    infos = page.infos

    # The response URL is the single source for 'final_url'; an earlier
    # assignment from page.final_url was always overwritten by it.
    infos['final_url']       = r.url
    infos['status']          = r.status_code
    infos['headers']         = r.headers
    infos['link_hash']       = page.link_hash
    infos['domain']          = get_hostname(r.url)
    infos['links']           = LinksExtractor(g.config, page).extract(url)
    infos['meta']['robots']  = RobotsExtractor(g.config, page).extract()
    # Collapse every run of whitespace in the extracted text to one space.
    infos['content']         = ' '.join(page.cleaned_text.split())
github team-anything / Briefly / App / subscribe.py View on Github external
def summary(url):
    g = Goose()
    article = g.extract(url)
    title = article.title
    publish_date = "None"
    headlines = []
    if title == None :
        title = url
    try:
        image = article.top_image.src
    except Exception:
        if len(article.images)>0:
            image = article.images[0]
        else:
            image = "http://www.sahalnews.com/wp-content/uploads/2014/12/news-update-.jpg"
    for bullets in summarize(url,title,article.cleaned_text,n_bullets):
        headlines.append(bullets)
    if len(headlines)==0:
github LuChang-CS / news-crawler / article / bbc_article.py View on Github external
def _extract_content(self, html):
    """Return the cleaned article text that Goose extracts from ``html``.

    A Goose instance is created with image fetching disabled and ``html``
    is handed to ``Goose.extract`` as ``raw_html``.
    """
    extractor = Goose({'enable_image_fetching': False})
    return extractor.extract(raw_html=html).cleaned_text
github verifiqueme / core / jano / controllers / ArticleExtractor.py View on Github external
else:
                raise TextUnavailable("Não existem textos disponíveis no NewsPlease para análise. Tente com Goose3")
            # Set the date: prefer the publication date, then the modification
            # date, and fall back to the download date.
            if artigo.date_publish is not None:
                data = str(artigo.date_publish)
            # Compare against the "None" placeholder string by value; an
            # identity test (`is not "None"`) against a literal is not a
            # reliable way to detect an equal string.
            elif artigo.date_modify is not None and artigo.date_modify != "None":
                data = str(artigo.date_modify)
            else:
                data = str(artigo.date_download)

            objeto = ArticleObject(fixcharset(artigo.title), url, None, data, artigo.authors,
                                   artigo.source_domain, text)
            return objeto
        except Exception:
            from goose3 import Goose
            g = Goose(
                {'strict': False, 'use_meta_language': True,
                 'target_language': Config().values()['language'].replace("-", "_"),
                 'parser_class': 'lxml', 'enable_image_fetching': False, 'http_timeout': 1})
            artigo = g.extract(url=url)
            if artigo.cleaned_text:
                text = fixcharset(artigo.cleaned_text)
            elif artigo.meta_description:
                text = fixcharset(artigo.meta_description)
            else:
                raise TextUnavailable("Não existem textos suficientes para análise.")

            objeto = ArticleObject(fixcharset(artigo.title), url, None,
                                   artigo.publish_date, artigo.authors, artigo.domain, text)
            return objeto
github LuChang-CS / news-crawler / article / nytimes_article.py View on Github external
def _extract_content(self, html):
    """Extract the article body from raw ``html`` and return its cleaned text.

    Uses a Goose instance configured with ``enable_image_fetching`` set to
    ``False``.
    """
    goose_config = {'enable_image_fetching': False}
    parsed = Goose(goose_config).extract(raw_html=html)
    return parsed.cleaned_text
github LuChang-CS / news-crawler / article / reuters_article.py View on Github external
def _extract_content(self, html):
    """Run Goose over the given raw ``html`` and return the cleaned text.

    Image fetching is switched off in the Goose configuration.
    """
    goose = Goose({'enable_image_fetching': False})
    result = goose.extract(raw_html=html)
    text = result.cleaned_text
    return text
github fanmatics / metadoc / metadoc / extract / extractor.py View on Github external
self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None

    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)

    self.tree = None

goose3

HTML Content / Article Extractor, web scraping for Python3

Apache-2.0
Latest version published 10 months ago

Package Health Score

71 / 100
Full package analysis