Revision: 59323
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 1, 2012 07:15 by scrapy
Initial Code
# This snippet was taken from the old wiki.
#
# You can avoid downloading pages that exceed a certain size by overriding the Scrapy HTTP client factory with the following (undocumented) setting:
#
# DOWNLOADER_HTTPCLIENTFACTORY = 'myproject.downloader.LimitSizeHTTPClientFactory'
#
# Largest Content-Length value, in bytes, accepted before a response is treated as oversized.
MAX_RESPONSE_SIZE = 1048576 # 1 MiB (1024 * 1024 bytes)
from scrapy.core.downloader.webclient import ScrapyHTTPClientFactory, ScrapyHTTPPageGetter
class LimitSizePageGetter(ScrapyHTTPPageGetter):
    """Page getter that gives up on responses announcing an oversized body.

    Every header is first handed to ``ScrapyHTTPPageGetter.handleHeader``.
    When the header is ``Content-Length`` and its value is greater than
    ``MAX_RESPONSE_SIZE``, ``connectionLost('oversized')`` is invoked on this
    protocol. Only the declared length is inspected, so a response that
    sends no ``Content-Length`` header is never limited by this class.
    """

    def handleHeader(self, key, value):
        """Record the header, then enforce the size limit on Content-Length.

        :param key: header name, as text or as bytes.
        :param value: raw header value; parsed with ``int()`` when the
            header is ``Content-Length``.
        """
        ScrapyHTTPPageGetter.handleHeader(self, key, value)

        # The header name may be delivered as bytes (e.g. under Python 3),
        # in which case comparing it to a text literal would never match and
        # the limit would silently be skipped. Normalise to text first.
        header_name = key.lower()
        if isinstance(header_name, bytes):
            header_name = header_name.decode('latin-1')
        if header_name != 'content-length':
            return

        try:
            declared_size = int(value)
        except (TypeError, ValueError):
            # Unparseable length: there is nothing to compare against, so
            # do not let the size check itself raise from the header callback.
            return

        if declared_size > MAX_RESPONSE_SIZE:
            # NOTE(review): this calls the connectionLost callback directly
            # with a plain string as the reason; whether that also closes the
            # transport depends on the base class -- confirm against the
            # Scrapy/Twisted version in use.
            self.connectionLost('oversized')
class LimitSizeHTTPClientFactory(ScrapyHTTPClientFactory):
    """HTTP client factory whose ``protocol`` is ``LimitSizePageGetter``.

    Intended to be referenced by dotted path from the
    ``DOWNLOADER_HTTPCLIENTFACTORY`` setting.
    """

    # Protocol class attribute overridden so the size-limiting getter is used.
    protocol = LimitSizePageGetter
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Sep 16, 2011
Initial URL
Initial Description
Initial Title
Avoid downloading pages which exceed a certain size
Initial Tags
download
Initial Language
Python