Return to Snippet

Revision: 3991
at October 14, 2007 23:49 by iblis


Initial Code
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use LWP::Simple;
use HTML::Parser;
#
# Grab all links from local or remote html file
# perl html munging
#
# Options: -a prints only absolute URLs, -r prints only relative URLs
# (the two are mutually exclusive)

# Parse the command line: optional -a / -r switches followed by a single
# argument naming the HTML source (a local file or a URL).
#
my %opts;
getopts('ar', \%opts);
my $arg = shift;

# Stop with a usage message when the source argument is missing or when
# both -a and -r were supplied (at most one of them is allowed).
my $both_filters = $opts{a} && $opts{r};
if (!defined $arg || $both_filters) {
	die "Usage: $0 [-a | -r] filename [| URL]\n";
}

# Load the HTML into $page.  An argument that begins with an http:// or
# https:// scheme (matched case-insensitively) is fetched with
# LWP::Simple's get(); anything else is treated as a local file path and
# read in one go.
#
my $page;
if ($arg =~ m!^https?://!i) {
	$page = get($arg);
	# get() signals failure by returning undef and is not documented to
	# report the reason through $!, so test with defined() -- an empty but
	# successfully fetched page is not an error -- and keep $! out of the
	# message.
	defined $page
		or die "Couldn't get $arg\n";
}
else {
	# Lexical handle with three-argument open instead of the global
	# bareword FH.
	open my $fh, "<", $arg
		or die "Couldn't open $arg: $!\n";
	$page = do { local $/; <$fh> };	# $/ undefined => slurp the whole file
	close $fh;
}

# Build the HTML parser (version 3 API).  Every start tag is passed to
# start() with the tag name and the tag's attributes; the hrefs that
# start() keeps are accumulated in @links.
#
my @links;
my %parser_args = (
	api_version => 3,
	start_h     => [ \&start, "tagname, attr" ],
);
my $parser = HTML::Parser->new(%parser_args);
# Start-tag handler registered with HTML::Parser.
#
# Arguments (per the "tagname, attr" argspec used at registration):
#   $tag  - name of the tag just opened
#   $attr - hash ref of that tag's attributes
#
# Appends the href of each <a> tag to the file-level @links, subject to
# the file-level %opts: with -r, hrefs starting with http:// or https://
# are skipped; with -a, only those are kept.  The return value is not
# meaningful.
sub start {
	my ($tag, $attr) = @_;
	return unless $tag eq 'a' and defined $attr->{href};

	my $href = $attr->{href};
	# An href counts as absolute only when it *starts* with an http(s)
	# scheme.  The same anchored test serves both switches, so a relative
	# link that merely embeds a URL (e.g. "go?to=http://x/") is treated as
	# relative under -a as well as under -r.
	my $is_absolute = $href =~ m!^https?://!i;

	return if $is_absolute and $opts{r};	# exclude absolute url when -r
	return if not $is_absolute and $opts{a};	# exclude relative url when -a
	push @links, $href;
	return;
}
# Feed the whole document to the parser, then signal end of input with
# eof() so the parser can finish any pending work.
$parser->parse($page);
$parser->eof;

# output: one collected link per line
#
# A plain loop rather than map: map builds a result list, which is wasted
# when only the printing side effect is wanted.
for my $link (@links) {
	print "$link\n";
}




			

Initial URL


Initial Description


Initial Title
Grab all links from local or remote html file

Initial Tags
html

Initial Language
Perl