Revision: 59339
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 1, 2012 07:15 by scrapy
Initial Code
# A downloader middleware automatically to redirect pages containing a rel=canonical in their contents to the canonical url (if the page itself is not the canonical one), from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.utils.url import url_is_from_spider from scrapy.http import HtmlResponse from scrapy import log class RelCanonicalMiddleware(object): _extractor = SgmlLinkExtractor(restrict_xpaths=['//head/link[@rel="canonical"]'], tags=['link'], attrs=['href']) def process_response(self, request, response, spider): if isinstance(response, HtmlResponse) and response.body and getattr(spider, 'follow_canonical_links', False): rel_canonical = self._extractor.extract_links(response) if rel_canonical: rel_canonical = rel_canonical[0].url if rel_canonical != request.url and url_is_from_spider(rel_canonical, spider): log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (rel_canonical, request), level=log.DEBUG, spider=spider) return request.replace(url=rel_canonical, callback=lambda r: r if r.status == 200 else response) return response # Snippet imported from snippets.scrapy.org (which no longer works) # author: pablo # date : Aug 27, 2010
Initial URL
Initial Description
Initial Title
Downloader middleware to redirect to rel=canonical urls
Initial Tags
Initial Language
Python