Return to Snippet

Revision: 1835
at November 9, 2006 23:57 by whitetiger


Updated Code
import re,sgmllib,sys,urllib

class Parser(sgmllib.SGMLParser):

    def start_a(self, attr):

        regx = re.compile('[Hh][Tt][Tt][Pp].*\....$')
        href = [v for a, v in attr if a == 'href']
        try:
            if regx.match(href[0]): print href[0]
        except:
            pass

if __name__ == '__main__':

    # Fetch the page whose URL is given as the first command-line argument
    # and let Parser print the links it finds.  Any failure (missing
    # argument, network error, parse error) is reported on one line.
    try:

        fd = urllib.urlopen(sys.argv[1])

        try:
            parser = Parser()
            parser.feed(fd.read())
            parser.close()
        finally:
            # Close the connection even when reading or parsing fails.
            fd.close()

    except Exception as error:

        print('Errore: ' + str(error))

Revision: 1834
at November 9, 2006 23:55 by whitetiger


Initial Code
import re,sgmllib,sys,urllib

class Parser(sgmllib.SGMLParser):

    def start_a(self, attr):

        regx = re.compile('[Hh][Tt][Tt][Pp].*\....$')
        href = [v for a, v in attr if a == 'href']
        try:
            if regx.match(href[0]): print href[0]
        except:
            pass

if __name__ == '__main__':

    # Fetch the page whose URL is given as the first command-line argument
    # and let Parser print the links it finds.  Any failure (missing
    # argument, network error, parse error) is reported on one line.
    try:

        fd = urllib.urlopen(sys.argv[1])

        try:
            parser = Parser()
            parser.feed(fd.read())
            parser.close()
        finally:
            # Close the connection even when reading or parsing fails.
            fd.close()

    except Exception as error:

        print('Errore: ' + str(error))

Initial URL

                                

Initial Description

                                

Initial Title
Python - Stampa i links di una pagina HTML

Initial Tags
regex, python

Initial Language
Python