Return to Snippet

Revision: 11183
at January 27, 2009 12:00 by iblis


Initial Code
#!/usr/bin/env perl
#
# grabcode.pl
# Download code between <pre> tags from remote HTML pages
# Takes a list of urls as argument

use strict; use warnings;

use WWW::Mechanize;
use HTML::TreeBuilder::XPath;
use Encode;

# URLs to process, taken verbatim from the command line.
my @urls = @ARGV;

# autocheck => 0: WWW::Mechanize otherwise dies on a failed request, which
# would abort the whole batch instead of skipping the offending URL.
my $browser = WWW::Mechanize->new( autocheck => 0 );
$browser->agent_alias('Linux Mozilla');
#$browser->credentials('uname', 'passwd');

# Fetch each page in turn and print the text of every <pre> element to
# STDOUT as UTF-8, one newline after each element. Pages that cannot be
# fetched are reported on STDERR and skipped.
foreach my $url (@urls) {

	my $response = $browser->get($url);
	if ( !$response->is_success() ) {
		# Method calls are not interpolated inside double-quoted strings,
		# so the status line has to be concatenated explicitly.
		warn "Skipping $url:\n" . $response->status_line() . "\n";
		next;
	}
	my $page = $browser->content();

	my $tree = HTML::TreeBuilder::XPath->new;
	# parse_content() is parse() followed by eof(); without the eof() the
	# parser is never told the document is complete.
	$tree->parse_content($page);

	# In list context findnodes() returns the matching nodes directly.
	foreach my $node ( $tree->findnodes('//pre') ) {
		print encode( "utf8", $node->as_text() );
		print "\n";
	}

	# Explicitly release the tree so memory is reclaimed between pages.
	$tree->delete;
}

Initial URL


Initial Description
Won't work with Google Code pages: they are JavaScript-powered.

Initial Title
Batch download code between <pre> tags from remote HTML pages

Initial Tags
html, download, web

Initial Language
Perl