Revision: 59313
at September 1, 2012 07:14 by scrapy
Initial Code
# This is a piece of SpiderMiddleware to control the crawl path of a CrawlSpider-like spider. It works by dropping
# some links extracted from certain pages. The idea is to get a bit more fine-grained control than using
# LinkExtractors alone because with this middleware you can set different rules to be applied on different pages.
#
# You do this with a series of regular expressions which define the path that the spider is allowed to take.
# Specifically, you put a list of tuples in your settings.py file of the form:
# (fromUrlPattern, [allowedToURLs], [denyToURLs]).
#
# The allow / deny mechanism works similarly to the one Scrapy uses in LinkExtractors. Each tuple says something along
# the lines of:
#
# If a link was gathered at a page matching the regex in fromUrlPattern
# Keep it as long as the link's target URL matches at least one pattern in [allowedToURLs]
# Unless the link's target URL also matches at least one pattern in [denyToURLs]
#
# Each 'from' page is handled by the first tuple whose fromUrlPattern matches its URL.
# If no tuple matches the URL for the 'from' page, CrawlPathMiddleware ignores that page and doesn't drop any of
# its extracted links.
#
# If you leave [allowedToURLs] as either '' or [], it allows all URLs. This is the same as passing [r'.*']. This is
# useful if you want to have a deny rule without an allow rule.
#
# If you leave [allowedToURLs] as None, it doesn't allow any URLs. This is the same as passing something like [r'a^']
# and is useful if you want to designate a certain page as a dead end.
#
# If you leave [denyToURLs] as either '', [], or None, it doesn't deny any URLs. This is the same as passing something
# like [r'a^'] and is useful if you want to have an allow rule without a deny rule.
#
# You can also provide a string containing a single regex instead of a list of regexes for [allowedToURLs] or
# [denyToURLs]. For example, [r'.*my_regex.*'] and r'.*my_regex.*' do the same thing for [allowedToURLs] and
# [denyToURLs].
#
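# For instance, a made-up tuple like
#
#     (r'.*/category/.*', [r'.*/item/.*'], [r'.*/item/sold-out/.*'])
#
# would mean: from a page such as http://example.com/category/books, the spider may follow
# http://example.com/item/1 (matches an allow pattern and no deny pattern), but not
# http://example.com/item/sold-out/2 (matches a deny pattern) and not
# http://example.com/about (matches no allow pattern). These patterns and URLs are only
# illustrative; the real ones live in PATH_TUPLES in settings.py.
#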
# See the settings.py code for examples.
=======================
===== settings.py =====
=======================
SPIDER_MIDDLEWARES = {
    'path.to.crawlpath_module.CrawlPathMiddleware': 550,
}
# regular expression patterns for defining url types in shorthand
type1 = r'.*this_regex_uniquely_selects_urls_from_page_type_1.*'
type2 = r'.*this_regex_uniquely_selects_urls_from_page_type_2.*'
type3 = r'.*this_regex_uniquely_selects_urls_from_page_type_3.*'
type4 = r'.*this_regex_uniquely_selects_urls_from_page_type_4.*'
# list of crawl paths for CrawlPathMiddleware
# each crawl path is defined by a tuple comprised of three elements:
# fromUrlPattern: a single regular expression (regex) defining pages where this crawl path applies
# that is, fromUrlPattern identifies candidate 'from' pages
# [allowPatterns]: regex or list of regexes defining urls the spider can crawl to from this
# 'from' page
# [denyPatterns]: regex or list of regexes defining urls the spider can't crawl to from this
# 'from' page
PATH_TUPLES = [
    # these should take the form:
    # (fromUrlPattern, [allowPatterns], [denyPatterns])

    # type1 pages can only go to type2 or type3 pages
    (type1, [type2, type3], ''),

    # type2 pages can only go to type2 pages which don't match either of the bad patterns
    (type2, type2, [r'.*bad_pattern_1.*', r'.*bad_pattern_2.*']),

    # type3 pages can go anywhere except type1 pages
    (type3, '', type1),

    # type4 pages can't go anywhere
    (type4, None, ''),
]
# If you set PATH_DEBUG to True, CrawlPathMiddleware will log information about
# which links were allowed / denied and why
PATH_DEBUG = True
# This setting controls how wide the printed URLs should be in the logged DEBUG statements.
# This number is only about making the log pretty and readable. Adjust it as you like.
PATH_DEBUG_URL_LENGTH = 95
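============================================
===== example_spider.py (illustration) =====
============================================
# A minimal sketch of the kind of CrawlSpider this middleware is meant to sit in front of.
# The spider name, domain, start URL, and callback are placeholders, and the imports use the
# scrapy.contrib paths that were current when this snippet was written. The spider's own rule
# can stay broad; CrawlPathMiddleware narrows the crawl path per page via PATH_TUPLES.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/']

    # one broad rule: extract and follow every link; CrawlPathMiddleware prunes them via PATH_TUPLES
    rules = (
        Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # parse the page and return items as usual
        return []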
========================
===== crawlpath.py =====
========================
# This is a piece of SpiderMiddleware to control the crawl path of a CrawlSpider-like spider. It works by dropping
# some links extracted from certain pages. The idea is to get a bit more fine-grained control than using
# LinkExtractors alone because with this middleware you can set different rules to be applied on different pages.
#
# You do this with a series of regular expressions which define the path that the spider is allowed to take.
# Specifically, you put a list of tuples in your settings.py file of the form:
# (fromUrlPattern, [allowedToURLs], [denyToURLs]).
#
# The allow / deny mechanism works similarly to the one Scrapy uses in LinkExtractors. Each tuple says something along
# the lines of:
#
# If a link was gathered at a page matching the regex in fromUrlPattern
# Keep it as long as the link's target URL matches at least one pattern in [allowedToURLs]
# Unless the link's target URL also matches at least one pattern in [denyToURLs]
#
# Each 'from' page is handled by the first tuple whose fromUrlPattern matches its URL.
# If no tuple matches the URL for the 'from' page, CrawlPathMiddleware ignores that page and doesn't drop any of
# its extracted links.
#
# If you leave [allowedToURLs] as either '' or [], it allows all URLs. This is the same as passing [r'.*']. This is
# useful if you want to have a deny rule without an allow rule.
#
# If you leave [allowedToURLs] as None, it doesn't allow any URLs. This is the same as passing something like [r'a^']
# and is useful if you want to designate a certain page as a dead end.
#
# If you leave [denyToURLs] as either '', [], or None, it doesn't deny any URLs. This is the same as passing something
# like [r'a^'] and is useful if you want to have an allow rule without a deny rule.
#
# You can also provide a string containing a single regex instead of a list of regexes for [allowedToURLs] or
# [denyToURLs]. For example, [r'.*my_regex.*'] and r'.*my_regex.*' do the same thing for [allowedToURLs] and
# [denyToURLs].
#
# See the settings.py code for examples.
from scrapy.http.request import Request
import re
from scrapy import log
class CrawlPathMiddleware(object):
    """SpiderMiddleware to shape the crawl path of a CrawlSpider-like spider using PATH_TUPLES defined in settings.py"""

    def __init__(self, path_tuples, debug, debug_url_length):
        self.DEBUG = debug
        self.DEBUG_URL_LENGTH = debug_url_length
        self.FIRST_MSG_LEN = len("Didn't match 'allow' patterns for") + 2  # width of the first log column; not exposed as a setting

        # path_tuples gets split up into three separate variables:
        #   self.fromPats is a list of patterns to match fromUrls
        #   self.allowPatGroups is a list of lists of allow patterns
        #   self.denyPatGroups is a list of lists of deny patterns
        self.fromPats = [re.compile(t[0]) for t in path_tuples]  # patterns to match against each fromURL
        allowPatGroups = [t[1] for t in path_tuples]              # allow patterns to match against each toURL
        denyPatGroups = [t[2] for t in path_tuples]               # deny patterns to match against each toURL

        # process and compile allow patterns
        self.allowPatGroups = []
        for pat_group in allowPatGroups:
            if pat_group == '' or pat_group == []:
                # blank allowPats ==> allow everything
                # '.*' matches any URL, so re.match always returns a truthy match object here
                pat_group = '.*'
            elif pat_group is None:
                # None allowPats ==> match nothing
                pat_group = 'a^'

            # compile all patterns in the group
            if isinstance(pat_group, (str, unicode)):
                pats_compiled = [re.compile(pat_group)]
            else:
                pats_compiled = [re.compile(pat) for pat in pat_group]
            self.allowPatGroups.append(pats_compiled)

        # process and compile deny patterns
        self.denyPatGroups = []
        for pat_group in denyPatGroups:
            # blank or None denyPats ==> deny nothing
            if pat_group == '' or pat_group == [] or pat_group is None:
                # 'a^' compiles without a problem but can never match: it looks for an 'a' before the start of the string
                pat_group = r'a^'

            # compile all patterns in the group
            if isinstance(pat_group, (str, unicode)):
                pats_compiled = [re.compile(pat_group)]
            else:
                pats_compiled = [re.compile(pat) for pat in pat_group]
            self.denyPatGroups.append(pats_compiled)

    @classmethod
    def from_settings(cls, settings):
        path_tuples = settings.getlist('PATH_TUPLES')
        debug = settings.getbool('PATH_DEBUG', default=False)
        debug_url_length = settings.getint('PATH_DEBUG_URL_LENGTH', default=90)
        return cls(path_tuples, debug, debug_url_length)

    def _firstIndex(self, myIterable):
        """find the index of the first element in myIterable which evaluates to True"""
        for (counter, option) in enumerate(myIterable):
            if option:
                return counter
        return None

    def log(self, message, spider, level=log.DEBUG):
        """Log the given messages at the given log level. Stolen from BaseSpider."""
        # prepend the name of this class to message
        message = '[' + self.__class__.__name__ + '] ' + message
        log.msg(message, spider=spider, level=level)

    def process_spider_output(self, response, result, spider):
        fromUrl = response.url

        # figure out which tuple should handle links from this fromUrl
        fromMatches = [re.match(p, fromUrl) for p in self.fromPats]
        tupleIndex = self._firstIndex(fromMatches)

        if tupleIndex is None:
            # fromUrl didn't match any pattern in fromPats. don't change anything.
            if self.DEBUG:
                self.log('No matching fromUrl pattern for'.ljust(self.FIRST_MSG_LEN) +
                         fromUrl.ljust(self.DEBUG_URL_LENGTH), spider=spider)
            for r in result:
                yield r
        else:
            # get the allow and deny patterns from the proper tuple
            allowPats = self.allowPatGroups[tupleIndex]
            denyPats = self.denyPatGroups[tupleIndex]

            # check each result element against the allow and deny patterns for the appropriate tuple
            for r in result:
                if isinstance(r, Request):
                    toUrl = r.url
                    allowMatches = [re.match(p, toUrl) for p in allowPats]
                    if any(allowMatches):
                        denyMatches = [re.match(p, toUrl) for p in denyPats]
                        if not any(denyMatches):
                            # toUrl matched an allow pattern and no deny patterns. allow it to pass.
                            if self.DEBUG:
                                self.log('All ok for'.ljust(self.FIRST_MSG_LEN) +
                                         fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                         toUrl, spider=spider)
                            yield r
                        else:
                            # toUrl matched a deny pattern. drop it.
                            if self.DEBUG:
                                self.log('Matched deny for'.ljust(self.FIRST_MSG_LEN) +
                                         fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                         toUrl, spider=spider)
                            yield None
                    else:
                        # toUrl didn't match any of the allow patterns. drop it.
                        if self.DEBUG:
                            self.log("Didn't match 'allow' patterns for".ljust(self.FIRST_MSG_LEN) +
                                     fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                     toUrl, spider=spider)
                        yield None
                else:
                    # r is an Item. allow it to pass.
                    yield r
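
# A quick, optional sanity check of the middleware in isolation. The URLs and patterns below are
# hypothetical and exist only to illustrate the allow / deny behavior; with debug off, no spider
# object is needed for logging, so None is passed in its place. This assumes the Python 2 /
# old-Scrapy environment the rest of this snippet targets.
if __name__ == '__main__':
    from scrapy.http import Response

    path_tuples = [(r'.*/category/.*', [r'.*/item/.*'], [r'.*/item/sold-out/.*'])]
    mw = CrawlPathMiddleware(path_tuples, debug=False, debug_url_length=90)

    fake_response = Response(url='http://www.example.com/category/books')
    extracted = [Request('http://www.example.com/item/1'),            # allowed
                 Request('http://www.example.com/item/sold-out/2'),   # dropped: matches a deny pattern
                 Request('http://www.example.com/about')]             # dropped: matches no allow pattern
    kept = [r for r in mw.process_spider_output(fake_response, extracted, spider=None)
            if r is not None]
    print [r.url for r in kept]    # only the /item/1 request should survive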
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kevinbache
# date : May 11, 2012
Initial URL
Initial Description
Initial Title
CrawlPathMiddleware: easily control the crawl path of a spider
Initial Tags
Initial Language
Python