Self-contained script to crawl a site [updated: scrapy 13.0dev]


/ Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # This script shows how to crawl a site without setting up a complete project.
  2. #
  3. # Note: the `crawler.start()` can't be called more than once due to Twisted's reactor limitation.
  4.  
  5. #!/usr/bin/env python
  6. # -*- coding: utf-8 -*-
  7. # author: Rolando Espinoza La fuente
  8. #
  9. # Changelog:
  10. # 24/07/2011 - updated to work with scrapy 13.0dev
  11. # 25/08/2010 - initial version. works with scrapy 0.9
  12.  
  13. from scrapy.contrib.loader import XPathItemLoader
  14. from scrapy.item import Item, Field
  15. from scrapy.selector import HtmlXPathSelector
  16. from scrapy.spider import BaseSpider
  17.  
  18.  
  19. class QuestionItem(Item):
  20. """Our SO Question Item"""
  21. title = Field()
  22. summary = Field()
  23. tags = Field()
  24.  
  25. user = Field()
  26. posted = Field()
  27.  
  28. votes = Field()
  29. answers = Field()
  30. views = Field()
  31.  
  32.  
  33. class MySpider(BaseSpider):
  34. """Our ad-hoc spider"""
  35. name = "myspider"
  36. start_urls = ["http://stackoverflow.com/"]
  37.  
  38. question_list_xpath = '//div[@id="content"]//div[contains(@class, "question-summary")]'
  39.  
  40. def parse(self, response):
  41. hxs = HtmlXPathSelector(response)
  42.  
  43. for qxs in hxs.select(self.question_list_xpath):
  44. loader = XPathItemLoader(QuestionItem(), selector=qxs)
  45. loader.add_xpath('title', './/h3/a/text()')
  46. loader.add_xpath('summary', './/h3/a/@title')
  47. loader.add_xpath('tags', './/a[@rel="tag"]/text()')
  48. loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
  49. loader.add_xpath('posted', './/div[@class="started"]/a[1]/span/@title')
  50. loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
  51. loader.add_xpath('answers', './/div[contains(@class, "answered")]/div[1]/text()')
  52. loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')
  53.  
  54. yield loader.load_item()
  55.  
  56.  
  57. def main():
  58. """Setups item signal and run the spider"""
  59. # set up signal to catch items scraped
  60. from scrapy import signals
  61. from scrapy.xlib.pydispatch import dispatcher
  62.  
  63. def catch_item(sender, item, **kwargs):
  64. print "Got:", item
  65.  
  66. dispatcher.connect(catch_item, signal=signals.item_passed)
  67.  
  68. # shut off log
  69. from scrapy.conf import settings
  70. settings.overrides['LOG_ENABLED'] = False
  71.  
  72. # set up crawler
  73. from scrapy.crawler import CrawlerProcess
  74.  
  75. crawler = CrawlerProcess(settings)
  76. crawler.install()
  77. crawler.configure()
  78.  
  79. # schedule spider
  80. crawler.crawl(MySpider())
  81.  
  82. # start engine scrapy/twisted
  83. print "STARTING ENGINE"
  84. crawler.start()
  85. print "ENGINE STOPPED"
  86.  
  87.  
  88. if __name__ == '__main__':
  89. main()
  90.  
  91. # Snippet imported from snippets.scrapy.org (which no longer works)
  92. # author: darkrho
  93. # date : Aug 25, 2010
  94.  

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.