How to use the goose3.extractors.BaseExtractor class in goose3

To help you get started, we’ve selected a few goose3 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jroakes / tech-seo-crawler / lib / crawler.py View on Github external
class LinksExtractor(BaseExtractor):
    """Collect the ``<a>`` elements of the article's raw document as dicts."""

    def extract(self, url):
        """Return one ``{'href', 'text', 'rel'}`` dict per anchor element.

        Args:
            url: Passed as the second argument to ``get_canonical_url``
                alongside each anchor's raw ``href`` attribute (presumably
                the page URL used to resolve the link -- confirm against
                that helper, which is defined outside this snippet).

        Returns:
            A list of dicts, in the order the parser returns the anchors.
            ``'text'`` and ``'rel'`` fall back to ``''`` when the parser
            gives a falsy value; ``'href'`` is whatever
            ``get_canonical_url`` returns.
        """
        parser = self.parser
        anchors = parser.getElementsByTag(self.article._raw_doc, 'a')

        # Each record is a three-key dict literal and is therefore always
        # truthy, so every anchor is kept; a per-item ``if record:`` check
        # would never filter anything out.
        return [
            {
                'href': get_canonical_url(parser.getAttribute(anchor, 'href'), url),
                'text': parser.getText(anchor) or '',
                'rel': parser.getAttribute(anchor, 'rel') or '',
            }
            for anchor in anchors
        ]


class RobotsExtractor(BaseExtractor):
    """Gather robots directives from the article's raw document."""

    def extract(self):
        """Return the robots directives as a flat list of strings.

        Elements are looked up through the parser with tag ``'meta'``,
        attribute ``'name'`` and value ``'robots'``. Each element's
        ``content`` attribute is split on commas, and every piece is
        stripped and lowercased. Elements whose ``content`` is missing or
        empty contribute nothing.
        """
        meta_nodes = self.parser.getElementsByTag(
            self.article._raw_doc, tag='meta', attr='name', value='robots'
        )

        directives = []
        for node in meta_nodes:
            content = self.parser.getAttribute(node, 'content')
            if not content:
                # Nothing to split for a missing or empty content attribute.
                continue
            directives += [piece.strip().lower() for piece in content.split(',')]
        return directives



def crawl_url(url):
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class':'soup'})
github jroakes / tech-seo-crawler / lib / crawler.py View on Github external
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


from goose3 import Goose
from goose3.extractors import BaseExtractor
from lib.utils import *

import config as cfg



class LinksExtractor(BaseExtractor):
    """Collect the ``<a>`` elements of the article's raw document as dicts."""

    def extract(self, url):
        """Return one ``{'href', 'text', 'rel'}`` dict per anchor element.

        Args:
            url: Passed as the second argument to ``get_canonical_url``
                alongside each anchor's raw ``href`` attribute (presumably
                the page URL used to resolve the link -- confirm against
                that helper in ``lib.utils``).

        Returns:
            A list of dicts, in the order the parser returns the anchors.
            ``'text'`` and ``'rel'`` fall back to ``''`` when the parser
            gives a falsy value; ``'href'`` is whatever
            ``get_canonical_url`` returns.
        """
        parser = self.parser
        anchors = parser.getElementsByTag(self.article._raw_doc, 'a')

        # Each record is a three-key dict literal and is therefore always
        # truthy, so every anchor is kept; a per-item ``if record:`` check
        # would never filter anything out.
        return [
            {
                'href': get_canonical_url(parser.getAttribute(anchor, 'href'), url),
                'text': parser.getText(anchor) or '',
                'rel': parser.getAttribute(anchor, 'rel') or '',
            }
            for anchor in anchors
        ]


class RobotsExtractor(BaseExtractor):

    def extract(self):

goose3

HTML Content / Article Extractor, web scraping for Python3

Apache-2.0
Latest version published 11 months ago

Package Health Score

74 / 100
Full package analysis