Using Scrapy from a script


Published in: Python

# This snippet runs Scrapy spiders from a script, independent of scrapyd
# and the scrapy command-line tool.
#
# The multiprocessing library works around a limitation in Twisted: a
# reactor (and hence a Scrapy crawl) cannot be restarted within the same
# process, so each crawl is run in a fresh child process.
#
# [Here](http://groups.google.com/group/scrapy-users/browse_thread/thread/f332fc5b749d401a)
# is the mailing-list discussion for this snippet. A rough equivalent
# using the modern Scrapy API is sketched after the snippet.

#!/usr/bin/python
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')  # Must be set before any Scrapy imports

from scrapy import log, signals, project
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue

class CrawlerScript(object):

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        # Collect every scraped item via the item_passed signal.
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        # Runs in a child process, so the Twisted reactor starts fresh each time.
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider))
        p.start()
        p.join()
        # Block until the child process delivers the scraped items.
        return queue.get(True)

# Usage
if __name__ == "__main__":
    log.start()

    # This example runs spider1 once, then spider2 three times.
    items = list()
    crawler = CrawlerScript()
    items.append(crawler.crawl('spider1'))
    for i in range(3):
        items.append(crawler.crawl('spider2'))
    print items

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: joehillen
# date: Oct 24, 2010
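The snippet above targets the 2010-era Scrapy API (scrapy.conf, scrapy.xlib.pydispatch, crawler.queue), which has since been removed. As a minimal sketch, the same "run a spider from a script" idea in modern Scrapy (1.0+) might look like the following; it assumes you are inside a Scrapy project and that 'spider1' is a registered spider name, neither of which comes from the original snippet.

# Minimal sketch with the modern Scrapy API (1.0+).
# Assumes a Scrapy project on the path with a spider named 'spider1'.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('spider1')  # accepts a spider name or a Spider subclass
process.start()           # blocks until the crawl finishes

Note that CrawlerProcess.start() can still only be called once per process, since the underlying Twisted reactor cannot be restarted; for repeated sequential crawls, the multiprocessing approach above remains relevant.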
