Return to Snippet

Revision: 7562
at July 29, 2008 21:57 by panquetofobia


Updated Code
#!/usr/bin/python
"""
@name Music Downloader
@description This script downloads all mp3 links from the given RSS/Atom feeds.
I wrote this to learn Python, so I'm sure there are better ways to do this.
It works on Mac OS X Leopard and requires wget.
__Add your feed URIs at the bottom__

@author [email protected]



------------------------------------------------------------------------------

Usage: $ python downloader.py [-t]
Options:
      -t  Shows the links retrieved from all feeds. No download takes place.

TODO:
- logging
- file size restriction support
- multithread support
- links as swf player param
"""
import sqlite3
import feedparser
import urllib
import urlparse
import os
import sys
import re

class Downloader:
    """Collect mp3 links from RSS/Atom feeds and download them with wget.

    Constructing an instance does all the work: it opens (and, if needed,
    initialises) the ``downloader.db`` SQLite database in the current
    directory, parses every feed, gathers the mp3 links and then either
    lists them (``-t`` as first command-line argument) or downloads them.

    Attributes:
        userAgent: User-Agent string handed to wget.
        conn: sqlite3 connection to ``downloader.db``.
        downloadDir: directory checked for already existing local files
            (the working directory at construction time).
        links: URLs collected from all feeds, in discovery order.
        feedData: result of ``feedparser.parse`` for the feed currently
            being processed.
    """

    # Anchors with an absolute http/https target inside Atom entry HTML.
    _RE_LINKS = re.compile(r'<a[^>]+href="(https?:[^"]+)"[^>]*>')
    # A link counts as an mp3 when its lower-cased form ends in "mp3".
    _RE_MP3 = re.compile(r'mp3$')

    def __init__(self, feeds):
        """Parse ``feeds`` and list or download the mp3 links they contain.

        Args:
            feeds: list of feed URLs (RSS or Atom).

        Raises:
            TypeError: if ``feeds`` is not a list.
        """
        if not isinstance(feeds, list):
            raise TypeError('A list of blogs is needed')
        self.userAgent = 'Mozilla/5.0 (X11; U; Linux i686; en-US;rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)'
        self.conn = sqlite3.connect('downloader.db')
        self.downloadDir = os.getcwd()
        c = self.conn.cursor()
        # Create the bookkeeping table on first run only.
        c.execute('CREATE TABLE IF NOT EXISTS downloaded(id INTEGER PRIMARY KEY, remoteFile TEXT, datetime DATE_TIME)')
        self.conn.commit()
        self.links = []
        for feed in feeds:
            self.feedData = feedparser.parse(feed)
            self.addLinks(feed)
        # "-t" as the first argument means "test": show links, download
        # nothing.  Anything else (including no argument) downloads; the
        # previous try/except silently did nothing for unknown arguments.
        if sys.argv[1:2] == ['-t']:
            for link in self.links:
                print(link)
        else:
            print('%i tracks will be downloaded' % len(self.links))
            print('')
            self.downloadFiles()

    def addLinks(self, feed=''):
        """Append the mp3 links of ``self.feedData`` to ``self.links``.

        Args:
            feed: URL of the feed, used only for the progress message.
        """
        version = self.feedData.version
        print('Adding links in ' + version + ' feed in ' + feed)
        if version.startswith('atom'):
            self.__addAtomLinks()
        elif version.startswith('rss'):
            self.__addRssLinks()

    def __addAtomLinks(self):
        """Collect mp3 links found in anchors of the Atom entries' HTML."""
        parts = []
        for entry in self.feedData.entries:
            # Entries without a content element are skipped instead of
            # raising AttributeError.
            contents = getattr(entry, 'content', None)
            if not contents:
                continue
            parts.append(contents[0].get('value') or '')
        html = ''.join(parts)
        for link in self._RE_LINKS.findall(html):
            if self._RE_MP3.search(link.lower()):
                self.links.append(link)

    def __addRssLinks(self):
        """Collect the URLs of all ``audio/mpeg`` enclosures of RSS entries."""
        for entry in self.feedData.entries:
            for enclosure in getattr(entry, 'enclosures', None) or []:
                # An enclosure may lack a type attribute; treat that as
                # "not audio" rather than crashing.
                if getattr(enclosure, 'type', None) == 'audio/mpeg':
                    self.links.append(enclosure.href)

    def isDownloaded(self, remoteFile):
        """Return True if ``remoteFile`` is recorded in the database.

        Args:
            remoteFile: file name as it appears in the URL (still quoted).
        """
        c = self.conn.cursor()
        sql = "SELECT id FROM downloaded WHERE remoteFile = ?"
        return c.execute(sql, (remoteFile, )).fetchone() is not None

    def downloadFiles(self):
        """Download every link in ``self.links`` that is not yet present.

        A link is skipped when its file name is already recorded in the
        database or a file of that name exists in ``self.downloadDir``.
        Successful downloads are recorded; failed ones are not, so they
        are retried on the next run.
        """
        # Local imports keep the block self-contained and let it resolve
        # the URL helpers under both Python 2 and Python 3.
        import subprocess
        try:
            from urlparse import urlsplit
            from urllib import unquote
        except ImportError:
            from urllib.parse import urlsplit, unquote

        count = 0
        c = self.conn.cursor()
        for link in self.links:
            remoteFile = urlsplit(link)[2].split('/')[-1]
            localFileName = unquote(remoteFile)
            localFile = self.downloadDir + "/" + localFileName
            if self.isDownloaded(remoteFile) or os.path.exists(localFile):
                self.skippingMessage(localFileName)
                continue
            print('')
            print('Downloading ' + localFileName + ' ... ')
            # Argument list instead of a shell string: links come from
            # remote feeds and must never be interpreted by a shell.
            # "--" stops wget from reading a hostile link as an option.
            command = [
                'wget', '--tries=2', '--timeout=10',
                '--user-agent=' + self.userAgent, '--', link,
            ]
            try:
                status = subprocess.call(command)
            except OSError:
                # wget could not be started (e.g. it is not installed).
                status = -1
            if status != 0:
                print('------------------------[ F A I L ]--------------------------')
                continue
            # Single-quoted 'now': double quotes denote identifiers in SQL.
            sql = "INSERT INTO downloaded (remoteFile, datetime) VALUES(?, DATETIME('now'))"
            c.execute(sql, (remoteFile, ))
            self.conn.commit()
            count += 1
            print('------------------------[ O K %i ]--------------------------' % count)

    def skippingMessage(self, remoteFile):
        """Print the notice shown when ``remoteFile`` is not downloaded again."""
        print('File ' + remoteFile + ' alredy exists')
        print('------------------------[  S K I P ]--------------------------')
       
if __name__ == '__main__':
    # Feeds to scan for mp3 links; add your own feed URIs to this list.
    feeds = [
        'http://oldbluebus.blogspot.com/atom.xml',
        'http://ravensingstheblues.blogspot.com/atom.xml',
    ]
    # Constructing the Downloader parses the feeds and lists or downloads
    # the links it finds.
    Downloader(feeds)

Revision: 7561
at July 29, 2008 21:54 by panquetofobia


Initial Code
#!/usr/bin/python
"""
@name Music Downloader
@description This script downloads all mp3 links from the given RSS/Atom feeds.
I wrote this to learn Python, so I'm sure there are better ways to do this.
It works on Mac OS X Leopard.
__Add your feed URIs at the bottom__

@author [email protected]



------------------------------------------------------------------------------

Usage: $ python downloader.py [-t]
Options:
      -t  Shows the links retrieved from all feeds. No download takes place.

TODO:
- logging
- file size restriction support
- multithread support
- links as swf player param
"""
import sqlite3
import feedparser
import urllib
import urlparse
import os
import sys
import re

class Downloader:
    """Collect mp3 links from RSS/Atom feeds and download them with wget.

    Constructing an instance does all the work: it opens (and, if needed,
    initialises) the ``downloader.db`` SQLite database in the current
    directory, parses every feed, gathers the mp3 links and then either
    lists them (``-t`` as first command-line argument) or downloads them.

    Attributes:
        userAgent: User-Agent string handed to wget.
        conn: sqlite3 connection to ``downloader.db``.
        downloadDir: directory checked for already existing local files
            (the working directory at construction time).
        links: URLs collected from all feeds, in discovery order.
        feedData: result of ``feedparser.parse`` for the feed currently
            being processed.
    """

    # Anchors with an absolute http/https target inside Atom entry HTML.
    _RE_LINKS = re.compile(r'<a[^>]+href="(https?:[^"]+)"[^>]*>')
    # A link counts as an mp3 when its lower-cased form ends in "mp3".
    _RE_MP3 = re.compile(r'mp3$')

    def __init__(self, feeds):
        """Parse ``feeds`` and list or download the mp3 links they contain.

        Args:
            feeds: list of feed URLs (RSS or Atom).

        Raises:
            TypeError: if ``feeds`` is not a list.
        """
        if not isinstance(feeds, list):
            raise TypeError('A list of blogs is needed')
        self.userAgent = 'Mozilla/5.0 (X11; U; Linux i686; en-US;rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)'
        self.conn = sqlite3.connect('downloader.db')
        self.downloadDir = os.getcwd()
        c = self.conn.cursor()
        # Create the bookkeeping table on first run only.
        c.execute('CREATE TABLE IF NOT EXISTS downloaded(id INTEGER PRIMARY KEY, remoteFile TEXT, datetime DATE_TIME)')
        self.conn.commit()
        self.links = []
        for feed in feeds:
            self.feedData = feedparser.parse(feed)
            self.addLinks(feed)
        # "-t" as the first argument means "test": show links, download
        # nothing.  Anything else (including no argument) downloads; the
        # previous try/except silently did nothing for unknown arguments.
        if sys.argv[1:2] == ['-t']:
            for link in self.links:
                print(link)
        else:
            print('%i tracks will be downloaded' % len(self.links))
            print('')
            self.downloadFiles()

    def addLinks(self, feed=''):
        """Append the mp3 links of ``self.feedData`` to ``self.links``.

        Args:
            feed: URL of the feed, used only for the progress message.
        """
        version = self.feedData.version
        print('Adding links in ' + version + ' feed in ' + feed)
        if version.startswith('atom'):
            self.__addAtomLinks()
        elif version.startswith('rss'):
            self.__addRssLinks()

    def __addAtomLinks(self):
        """Collect mp3 links found in anchors of the Atom entries' HTML."""
        parts = []
        for entry in self.feedData.entries:
            # Entries without a content element are skipped instead of
            # raising AttributeError.
            contents = getattr(entry, 'content', None)
            if not contents:
                continue
            parts.append(contents[0].get('value') or '')
        html = ''.join(parts)
        for link in self._RE_LINKS.findall(html):
            if self._RE_MP3.search(link.lower()):
                self.links.append(link)

    def __addRssLinks(self):
        """Collect the URLs of all ``audio/mpeg`` enclosures of RSS entries."""
        for entry in self.feedData.entries:
            for enclosure in getattr(entry, 'enclosures', None) or []:
                # An enclosure may lack a type attribute; treat that as
                # "not audio" rather than crashing.
                if getattr(enclosure, 'type', None) == 'audio/mpeg':
                    self.links.append(enclosure.href)

    def isDownloaded(self, remoteFile):
        """Return True if ``remoteFile`` is recorded in the database.

        Args:
            remoteFile: file name as it appears in the URL (still quoted).
        """
        c = self.conn.cursor()
        sql = "SELECT id FROM downloaded WHERE remoteFile = ?"
        return c.execute(sql, (remoteFile, )).fetchone() is not None

    def downloadFiles(self):
        """Download every link in ``self.links`` that is not yet present.

        A link is skipped when its file name is already recorded in the
        database or a file of that name exists in ``self.downloadDir``.
        Successful downloads are recorded; failed ones are not, so they
        are retried on the next run.
        """
        # Local imports keep the block self-contained and let it resolve
        # the URL helpers under both Python 2 and Python 3.
        import subprocess
        try:
            from urlparse import urlsplit
            from urllib import unquote
        except ImportError:
            from urllib.parse import urlsplit, unquote

        count = 0
        c = self.conn.cursor()
        for link in self.links:
            remoteFile = urlsplit(link)[2].split('/')[-1]
            localFileName = unquote(remoteFile)
            localFile = self.downloadDir + "/" + localFileName
            if self.isDownloaded(remoteFile) or os.path.exists(localFile):
                self.skippingMessage(localFileName)
                continue
            print('')
            print('Downloading ' + localFileName + ' ... ')
            # Argument list instead of a shell string: links come from
            # remote feeds and must never be interpreted by a shell.
            # "--" stops wget from reading a hostile link as an option.
            command = [
                'wget', '--tries=2', '--timeout=10',
                '--user-agent=' + self.userAgent, '--', link,
            ]
            try:
                status = subprocess.call(command)
            except OSError:
                # wget could not be started (e.g. it is not installed).
                status = -1
            if status != 0:
                print('------------------------[ F A I L ]--------------------------')
                continue
            # Single-quoted 'now': double quotes denote identifiers in SQL.
            sql = "INSERT INTO downloaded (remoteFile, datetime) VALUES(?, DATETIME('now'))"
            c.execute(sql, (remoteFile, ))
            self.conn.commit()
            count += 1
            print('------------------------[ O K %i ]--------------------------' % count)

    def skippingMessage(self, remoteFile):
        """Print the notice shown when ``remoteFile`` is not downloaded again."""
        print('File ' + remoteFile + ' alredy exists')
        print('------------------------[  S K I P ]--------------------------')
       
if __name__ == '__main__':
    # Feeds to scan for mp3 links; add your own feed URIs to this list.
    feeds = [
        'http://oldbluebus.blogspot.com/atom.xml',
        'http://ravensingstheblues.blogspot.com/atom.xml',
    ]
    # Constructing the Downloader parses the feeds and lists or downloads
    # the links it finds.
    Downloader(feeds)

Initial URL

                                

Initial Description
This script downloads all mp3 links from the given RSS/Atom feeds.
I wrote this to learn Python, so I'm sure there are better ways to do this.
Depends on wget.

Initial Title
Mp3 downloader from feeds

Initial Tags
python

Initial Language
Python