# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): orphaned fragment — the enclosing class header for this
# `chunksize = 1` override is missing from the visible source.
chunksize = 1
# Re-run the pickle FIFO disk-queue suite with chunksize=2.
class ChunkSize2PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 2
# Re-run the pickle FIFO disk-queue suite with chunksize=3.
class ChunkSize3PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 3
# Re-run the pickle FIFO disk-queue suite with chunksize=4.
class ChunkSize4PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 4
class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest):
    """LIFO disk-queue tests backed by the marshal serializer."""

    def queue(self):
        # Build a marshal-serialized LIFO queue on the shared test path.
        return MarshalLifoDiskQueue(self.qpath)

    def test_serialize(self):
        # Marshal round-trips basic types; LIFO pops in reverse push order.
        q = self.queue()
        items = ['a', 123, {'a': 'dict'}]
        for item in items:
            q.push(item)
        for expected in reversed(items):
            self.assertEqual(q.pop(), expected)

    # Reuse the shared regression test for objects marshal cannot serialize.
    test_nonserializable_object = nonserializable_object_test
from scrapy.squeues import MarshalFifoDiskQueue, MarshalLifoDiskQueue, PickleFifoDiskQueue, PickleLifoDiskQueue
from scrapy.item import Item, Field
from scrapy.http import Request
from scrapy.loader import ItemLoader
# Minimal one-field item used by the serialization round-trip tests.
class TestItem(Item):
name = Field()
def _test_procesor(x):
    """Output processor used by TestLoader: returns its input added to itself.

    For strings this concatenates the value with itself; for numbers it
    doubles them. (Name typo preserved — it is referenced by TestLoader.)
    """
    doubled = x + x
    return doubled
# Loader producing TestItem instances; applies _test_procesor to the
# 'name' field on output.
class TestLoader(ItemLoader):
default_item_class = TestItem
name_out = staticmethod(_test_procesor)
class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest):
    """FIFO disk-queue tests backed by the marshal serializer."""

    # Large chunk size: the whole test run fits in a single chunk file.
    chunksize = 100000

    def queue(self):
        return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize)

    def test_serialize(self):
        # Marshal round-trips basic types; FIFO preserves push order.
        q = self.queue()
        items = ['a', 123, {'a': 'dict'}]
        for item in items:
            q.push(item)
        for expected in items:
            self.assertEqual(q.pop(), expected)
def test_nonserializable_object(self):
# NOTE(review): this method is truncated/garbled — `r2` is never defined
# in the visible source, so this assertion cannot run as written; restore
# the body from the upstream test file.
assert r2.meta['request'] is r2
# Re-run the pickle FIFO disk-queue suite with chunksize=1.
class ChunkSize1PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 1
# Re-run the pickle FIFO disk-queue suite with chunksize=2.
# NOTE(review): duplicate of a class defined earlier in this file.
class ChunkSize2PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 2
# Re-run the pickle FIFO disk-queue suite with chunksize=3.
# NOTE(review): duplicate of a class defined earlier in this file.
class ChunkSize3PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 3
# Re-run the pickle FIFO disk-queue suite with chunksize=4.
# NOTE(review): duplicate of a class defined earlier in this file.
class ChunkSize4PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 4
# LIFO disk-queue tests using the marshal serializer.
# NOTE(review): duplicate of a class defined earlier in this file; at import
# time the later definition would win.
class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest):
def queue(self):
return MarshalLifoDiskQueue(self.qpath)
def test_serialize(self):
# LIFO: items pop in reverse push order.
q = self.queue()
q.push('a')
q.push(123)
q.push({'a': 'dict'})
self.assertEqual(q.pop(), {'a': 'dict'})
self.assertEqual(q.pop(), 123)
self.assertEqual(q.pop(), 'a')
def test_nonserializable_object(self):
# NOTE(review): the body below is garbled — an `except` with no matching
# `try`, a duplicated twisted-import block, and an `else` branch with no
# `if`/`try`. It needs to be restored from the upstream test file rather
# than patched in place.
# Trigger Twisted bug #7989
import twisted.persisted.styles # NOQA
except Exception:
# Trigger Twisted bug #7989
import twisted.persisted.styles # NOQA
self.assertRaises(ValueError, q.push, lambda x: x)
else:
# Use a different unpickleable object
class A(object): pass
a = A()
a.__reduce__ = a.__reduce_ex__ = None
self.assertRaises(ValueError, q.push, a)
# Selectors should fail (lxml.html.HtmlElement objects can't be pickled)
sel = Selector(text='<p>some text</p>')
self.assertRaises(ValueError, q.push, sel)
class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest):
    """FIFO disk-queue tests using the marshal-based serializer."""

    # Large chunk size so every pushed item lands in one chunk file.
    chunksize = 100000

    def queue(self):
        return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize)

    def test_serialize(self):
        """Marshal round-trips str, int and dict; FIFO preserves push order."""
        q = self.queue()
        for value in ('a', 123, {'a': 'dict'}):
            q.push(value)
        self.assertEqual(q.pop(), 'a')
        self.assertEqual(q.pop(), 123)
        self.assertEqual(q.pop(), {'a': 'dict'})

    # Shared regression test for objects marshal cannot serialize.
    test_nonserializable_object = nonserializable_object_test
import time
import psycopg2
import logging
import queuelib
from queuelib import QueueService
import urlparse
import requests
# Queue consumer that looks up categories for blocked URLs.
class CategoryImporter(QueueService):
# External categorisation API endpoint.
CATEGORIFY_API = 'https://categorify.org/api'
QUEUE_NAME = 'category'
def setup_bindings(self):
# Declare a durable 'category' queue and bind it to both the org- and
# public-URL routing keys on the 'org.blocked' exchange; keep one HTTP
# session for reuse across messages.
self.ch.queue_declare("category", durable=True, auto_delete=False)
self.ch.queue_bind("category", "org.blocked", "url.org")
self.ch.queue_bind("category", "org.blocked", "url.public")
self.session = requests.session()
def process_message(self, data):
# Extract the host from the message's URL and strip a leading 'www.'.
url = data['url']
parsed_url = urlparse.urlparse(url)
domain = parsed_url.netloc.lower()
if domain.startswith('www.'):
domain = domain.split('.', 1)[-1]
# NOTE(review): method appears truncated here — the computed `domain`
# is never used in the visible source.
def open(self, spider):
# Bind this scheduler to `spider` and build its queues.
self.spider = spider
# In-memory priority queue built from the configured memory-queue factory.
self.mqs = PriorityQueue(self._newmq)
# Disk-backed queue only when a queue directory is configured.
self.dqs = self._dq() if self.dqdir else None
# NOTE(review): `self.df` looks like the dupefilter — confirm with the
# enclosing class (not visible here).
return self.df.open()
def _dq(self):
    """Build the on-disk priority queue, resuming saved priorities if any.

    If ``active.json`` exists in the queue directory its contents are used
    as the starting priorities; a non-empty resumed queue is logged.
    """
    prios = ()
    activef = join(self.dqdir, 'active.json')
    if exists(activef):
        with open(activef) as f:
            prios = json.load(f)
    q = PriorityQueue(self._newdq, startprios=prios)
    if q:
        log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                spider=self.spider, queuesize=len(q))
    return q
# NOTE(review): fragment — this code starts mid-function (likely the body of
# a push() method: `slot`, `obj` and `priority` are undefined at this point
# in the visible source). Lazily create the per-slot queue, then push.
if slot not in self.pqueues:
self.pqueues[slot] = self.pqfactory()
queue = self.pqueues[slot]
queue.push(obj, priority)
def close(self):
    """Close every per-slot queue and return their states keyed by slot.

    The internal slot→queue mapping is emptied afterwards.
    """
    active = {}
    for slot, queue in self.pqueues.items():
        active[slot] = queue.close()
    self.pqueues.clear()
    return active
def __len__(self):
    """Total number of queued items across all slots (0 when no slots)."""
    if not self.pqueues:
        return 0
    return sum(len(q) for q in self.pqueues.values())
class ScrapyPriorityQueue(PriorityQueue):
"""
PriorityQueue which works with scrapy.Request instances and
can optionally convert them to/from dicts before/after putting to a queue.
"""
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
# serialize=True converts requests to dicts before queueing (see push()).
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
self.serialize = serialize
self.spider = crawler.spider
@classmethod
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
# Alternate constructor following Scrapy's from_crawler convention.
return cls(crawler, qfactory, startprios, serialize)
def push(self, request, priority=0):
if self.serialize:
request = request_to_dict(request, self.spider)
# NOTE(review): push() appears truncated — the (possibly serialized)
# request is never handed to the parent queue in the visible source.
def __init__(self, settings):
# Keep the settings object and resolve the configured memory-queue class.
self.settings = settings
self.mq_class = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
# NOTE(review): `self.priority` is not defined anywhere in the visible
# source — confirm it exists on this class before __init__ runs.
self.mqs = PriorityQueue(self.priority)
self.status = ScheduleStatus()
def main():
    """CLI entry point: parse arguments, set up logging, run the probe.

    An optional positional ``queue`` argument rewrites the '.org' suffix of
    the probe's queue name before the probe is started.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--verbose', '-v', action='store_true')
    ap.add_argument('queue', nargs='?')
    opts = ap.parse_args()

    queuelib.setup_logging()
    if opts.queue:
        CloudflareProbe.QUEUE_NAME = CloudflareProbe.QUEUE_NAME.replace('.org', '.' + opts.queue)

    probe = CloudflareProbe()
    logging.info("Listening on: %s", probe.QUEUE_NAME)
    probe.run()