How to use the hepcrawl.spiders.base_spider.BaseSpider class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def parsed_node_without_link():
    """Call parse_node function without a direct link"""
    spider = base_spider.BaseSpider()
    body = """
    
    
        
        
    
    """
    response = fake_response_from_string(text=body)
    node = get_node(spider, 'OAI-PMH:record', text=body)
    response.meta["record"] = node.extract()

    with requests_mock.Mocker() as mock:
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def parsed_node():
    """Run ``BaseSpider.parse_node`` on the first record of an inline body.

    Before parsing, a HEAD response for the PDF URL is registered with
    ``responses`` (status 200, content type ``application/pdf``).
    """
    pdf_url = "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    responses.add(
        responses.HEAD,
        pdf_url,
        status=200,
        content_type='application/pdf',
    )
    base = base_spider.BaseSpider()
    xml_body = """
    
    
        
        
    
    """
    fake_response = fake_response_from_string(text=xml_body)
    records = get_node(base, 'OAI-PMH:record', text=xml_body)
    first_record = records[0]
    fake_response.meta["record"] = first_record.extract()

    # NOTE(review): the parsed item is bound but never returned in this
    # excerpt; the listing may be truncated -- confirm against the full module.
    parsed_item = base.parse_node(fake_response, first_record)
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def record():
    """Build an item from the first node of ``base/test_1.xml``.

    The first node matching the spider's ``itertag`` is extracted into the
    response meta together with a fixed handle URL, then passed through
    ``BaseSpider.build_item``. Returns the ``record`` attribute of the
    built item.
    """
    base = base_spider.BaseSpider()
    xml_response = fake_response_from_file('base/test_1.xml')

    xml_selector = Selector(xml_response, type='xml')
    base._register_namespaces(xml_selector)
    first_node = xml_selector.xpath('.//%s' % base.itertag)[0]

    # build_item reads its inputs from the response meta.
    xml_response.meta["record"] = first_node.extract()
    xml_response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]

    item = base.build_item(xml_response)
    assert item
    assert item.record

    return item.record
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def urls():
    """Return ``BaseSpider.get_urls_in_record`` for the first test record.

    The record is the first node matching the spider's ``itertag`` in
    ``base/test_1.xml``.
    """
    base = base_spider.BaseSpider()
    xml_selector = Selector(
        fake_response_from_file('base/test_1.xml'),
        type='xml',
    )
    base._register_namespaces(xml_selector)
    first_node = xml_selector.xpath('.//%s' % base.itertag)[0]
    return base.get_urls_in_record(first_node)
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def splash():
    """Run ``BaseSpider.scrape_for_pdf`` on the splash-page fixture.

    The first record of ``base/test_1.xml`` is attached to the splash
    response meta, and a HEAD request to the PDF URL is mocked to answer
    with a ``text/html`` content type.
    """
    base = base_spider.BaseSpider()
    splash_page = fake_response_from_file('base/test_1_splash.htm')
    xml_response = fake_response_from_file('base/test_1.xml')

    xml_selector = Selector(xml_response, type='xml')
    base._register_namespaces(xml_selector)
    first_node = xml_selector.xpath('.//%s' % base.itertag)[0]
    splash_page.meta["record"] = first_node.extract()

    pdf_url = 'http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf'
    html_headers = {
        'Content-Type': 'text/html',
    }
    with requests_mock.Mocker() as mocker:
        mocker.head(pdf_url, headers=html_headers)

        # NOTE(review): the scraped item is bound but never returned in this
        # excerpt; the listing may be truncated -- confirm against the module.
        parsed_item = base.scrape_for_pdf(splash_page)
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def parsed_node_missing_scheme():
    """Call parse_node function with a link missing a http identifier."""
    spider = base_spider.BaseSpider()
    body = """
    
    
        
        
    
    """
    response = fake_response_from_string(text=body)
    node = get_node(spider, 'OAI-PMH:record', text=body)
    response.meta["record"] = node.extract_first()

    with requests_mock.Mocker() as mock:
github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def direct_links():
    """Return ``BaseSpider.find_direct_links`` for a single handle URL."""
    handle_urls = ["http://hdl.handle.net/1885/10005"]
    return base_spider.BaseSpider().find_direct_links(handle_urls)
github inspirehep / hepcrawl / hepcrawl / spiders / base_spider.py View on Github external
def __init__(self, source_file=None, *args, **kwargs):
        """Construct BASE spider.

        Extra positional and keyword arguments are forwarded unchanged to the
        parent class constructor; ``source_file`` is then stored on the
        instance as ``self.source_file``.

        :param source_file: value kept on the instance (defaults to ``None``).
            NOTE(review): presumably the location of a local file to parse
            instead of harvesting -- confirm against the rest of the spider.
        """
        super(BaseSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file