Return to Snippet

Revision: 59323
at September 1, 2012 07:15 by scrapy


Initial Code
# This snippet was taken from the old wiki.
# 
# To avoid downloading pages whose declared size exceeds a limit, override the Scrapy HTTP client factory with the following (undocumented) setting:
# 
#     DOWNLOADER_HTTPCLIENTFACTORY = 'myproject.downloader.LimitSizeHTTPClientFactory'
# 

MAX_RESPONSE_SIZE = 1024 * 1024  # size limit in bytes (1 MiB), compared against Content-Length

from scrapy.core.downloader.webclient import ScrapyHTTPClientFactory, ScrapyHTTPPageGetter

class LimitSizePageGetter(ScrapyHTTPPageGetter):
    """Page getter that gives up on responses declaring an oversized body.

    Every header is first passed to ``ScrapyHTTPPageGetter.handleHeader``.
    When a ``Content-Length`` header announces more than
    ``MAX_RESPONSE_SIZE`` bytes, ``connectionLost('oversized')`` is called
    on this protocol. Responses without a ``Content-Length`` header are not
    limited by this class.
    """

    def handleHeader(self, key, value):
        """Process one response header and enforce the size limit.

        key: header name; matched case-insensitively against
            ``content-length``.
        value: raw header value; for ``Content-Length`` it is parsed as an
            integer number of bytes.
        """
        ScrapyHTTPPageGetter.handleHeader(self, key, value)
        if key.lower() != 'content-length':
            return
        try:
            declared_size = int(value)
        except ValueError:
            # A malformed Content-Length cannot be compared with the limit;
            # skip the check instead of raising out of the header callback.
            return
        if declared_size > MAX_RESPONSE_SIZE:
            # NOTE(review): the reason passed here is a plain string, while
            # Twisted normally supplies a Failure -- confirm that downstream
            # handlers cope with it.
            self.connectionLost('oversized')

class LimitSizeHTTPClientFactory(ScrapyHTTPClientFactory):
    """HTTP client factory whose ``protocol`` is ``LimitSizePageGetter``."""

    protocol = LimitSizePageGetter

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date  : Sep 16, 2011

Initial URL

                                

Initial Description

                                

Initial Title
Avoid downloading pages which exceed a certain size

Initial Tags
download

Initial Language
Python