Text-file Spider Queue


/ Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # Description
  2. # -----------
  3. #
  4. # This is a Spider Queue that uses a plain text file as backend, storing one spider name per line.
  5. #
  6. # Usage example
  7. # -------------
  8. #
  9. # First start scrapy in server mode:
  10. #
  11. # scrapy runserver --set SPIDER_QUEUE_CLASS=path.to.TextFileSpiderQueue
  12. #
  13. # Then add spiders to crawl from the shell with:
  14. #
  15. # $ echo myspider >>queue.txt
  16. #
  17. # It also works with the Scrapy `queue` command:
  18. #
  19. # $ scrapy queue add myspider
  20. # Added: name=myspider args={}
  21. #
  22. # Limitations
  23. # -----------
  24. #
  25. # * It doesn't support spider arguments
  26. #
  27. # * It's not concurrency safe.
  28.  
  29. import os
  30.  
  31. from zope.interface import implements
  32.  
  33. from scrapy.interfaces import ISpiderQueue
  34.  
  35. class TextFileSpiderQueue(object):
  36.  
  37. implements(ISpiderQueue)
  38.  
  39. FILE = 'queue.txt'
  40.  
  41. @classmethod
  42. def from_settings(cls, settings):
  43. return cls()
  44.  
  45. def add(self, name, **spider_args):
  46. with open(self.FILE, 'a') as f:
  47. f.write(name + os.linesep)
  48.  
  49. def pop(self):
  50. msgs = list(open(self.FILE)) if os.path.exists(self.FILE) else []
  51. if not msgs:
  52. return
  53. with open(self.FILE, 'w') as f:
  54. f.writelines(msgs[1:])
  55. return {'name': msgs[0].strip()}
  56.  
  57. def count(self):
  58. return len(list(open(self.FILE)))
  59.  
  60. def list(self):
  61. return [{'name': x.strip()} for x in open(self.FILE)]
  62.  
  63. def clear(self):
  64. os.remove(self.FILE)
  65.  
  66. # Snippet imported from snippets.scrapy.org (which no longer works)
  67. # author: pablo
  68. # date : Sep 05, 2010
  69.  

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.