Skip to content

Commit 3b5a394

Browse files
committed
crawl_manager: default errback to None
1 parent 4fe6615 commit 3b5a394

File tree

4 files changed

+34
-20
lines changed

4 files changed

+34
-20
lines changed

docs/source/api.rst

+12-5
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,20 @@ callback
9797
- optional
9898

9999
Must exist as a method of the scheduled spider; it does not need to contain the string "self".
100-
If not passed or not found on spider default callback `parse`_ will be used.
100+
If not passed, the default Scrapy callback `parse`_ will be used. If there is no spider method
101+
with the name specified by the callback argument, or if the callback is not callable, the API will return a 400 HTTP error.
102+
103+
Example request with callback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&callback=parse_page``
101104

102105
errback
103106
- type: string
104107
- optional
105108

106109
Scrapy errback for the request made from the spider. It must exist as a method of the
107-
scheduled spider, otherwise exception will be raised. String does not need to contain 'self'.
110+
scheduled spider, otherwise the API will return a 400 HTTP error. The string does not need to contain 'self'.
111+
Defaults to None; can be adjusted with the `DEFAULT_ERRBACK_NAME`_ setting.
112+
113+
Example request with errback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&errback=my_errback``
108114

109115
max_requests
110116
- type: integer
@@ -520,11 +526,12 @@ Default: ``utf-8``.
520526
DEFAULT_ERRBACK_NAME
521527
~~~~~~~~~~~~~~~~~~~~
522528

523-
Default: ``"parse"``
529+
Default: ``None``
524530

525-
The name of the default errback_.
531+
String with the name of the default errback_.
526532

527-
Use an empty string or ``None`` to unset the errback altogether.
533+
Use this setting to set the default errback for Scrapy spider requests made from ScrapyRT.
534+
The errback must exist as a method of the spider and must be callable; otherwise a 400 HTTP error will be returned.
528535

529536
.. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing
530537

scrapyrt/conf/default_settings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,4 @@
3333

3434
TWISTED_REACTOR = None
3535

36-
DEFAULT_ERRBACK_NAME = 'parse'
36+
DEFAULT_ERRBACK_NAME = None

scrapyrt/core.py

+19-12
Original file line numberDiff line numberDiff line change
@@ -177,22 +177,29 @@ def spider_idle(self, spider):
177177
callback = getattr(self.crawler.spider, self.callback_name)
178178
assert callable(callback), 'Invalid callback'
179179
self.request = self.request.replace(callback=callback)
180-
181-
180+
except (AssertionError, AttributeError):
181+
msg = f"Invalid spider callback {self.callback_name}, callback not callable or not a method of a spider {self.spider_name}"
182+
self.user_error = Error(400, message=msg)
183+
try:
182184
if self.errback_name:
183185
errback = getattr(self.crawler.spider, self.errback_name)
184186
assert callable(errback), 'Invalid errback'
185187
self.request = self.request.replace(errback=errback)
186-
modify_request = getattr(
187-
self.crawler.spider, "modify_realtime_request", None)
188-
if callable(modify_request):
189-
self.request = modify_request(self.request)
190-
spider.crawler.engine.crawl(self.request)
191-
self._request_scheduled = True
192-
except Exception as e:
193-
self.user_error = Error(400, message=traceback.format_exc())
194-
else:
195-
raise DontCloseSpider
188+
except (AssertionError, AttributeError):
189+
msg = f"Invalid spider errback {self.errback_name}, errback not callable or not a method of a spider {self.spider_name}"
190+
self.user_error = Error(400, message=msg)
191+
if self.user_error:
192+
log.msg(self.user_error.message, level=log.ERROR)
193+
return
194+
195+
modify_request = getattr(
196+
self.crawler.spider, "modify_realtime_request", None)
197+
if callable(modify_request):
198+
self.request = modify_request(self.request)
199+
200+
spider.crawler.engine.crawl(self.request)
201+
self._request_scheduled = True
202+
raise DontCloseSpider
196203

197204
def handle_scheduling(self, request, spider):
198205
"""Handler of request_scheduled signal.

tests/test_crawl_manager.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def test_raise_error_if_not_callable(self):
113113
self.spider.parse_something = None
114114
self._call_spider_idle()
115115
self.assertIsNotNone(self.crawl_manager.user_error)
116-
msg = "Invalid callback"
116+
msg = "Invalid spider callback parse_something"
117117
assert re.search(msg, self.crawl_manager.user_error.message)
118118
self.assertFalse(self.crawler.engine.crawl.called)
119119

@@ -153,7 +153,7 @@ def test_pass_wrong_spider_errback(self):
153153
assert mng.request.errback is None
154154

155155
self.assertIsNotNone(mng.user_error)
156-
msg = "has no attribute 'handle_error'"
156+
msg = "Invalid spider errback"
157157
assert re.search(msg, mng.user_error.message)
158158

159159
def test_pass_good_spider_errback(self):

0 commit comments

Comments
 (0)