#!/usr/bin/perl -w
#
# Recursive HTML link checker.
#
# Starting from the URL given with --host, every page whose URL contains the
# base URL is fetched and parsed for links; each such "local" link is followed
# recursively.  Links that do not contain the base URL are only fetched once
# to verify that they resolve (unless --noext is given).  Every request that
# does not succeed is reported on STDOUT.
#
# Options:
#   --host|-h     base URL to start from (required)
#   --timeout|-t  LWP request timeout in seconds
#   --agent|-a    User-Agent string to send
#   --exclude|-x  space-separated list of patterns; matching URLs are skipped
#   --depth|-md   maximum recursion depth
#   --debug|-d    verbosity level (0 = only failures)
#   --ext/--noext check (default) or skip external links

use strict;
use warnings;

use Getopt::Long;

require HTTP::Request;
require HTTP::Response;
require HTML::LinkExtor;
require LWP::UserAgent;

sub Get_local;
sub Do_links;
sub Done_url;
sub Do_check;

$main::debug = 0;

GetOptions(
    "ext!"      => \$main::extern,
    "host=s"    => \$main::baseurl,
    "h=s"       => \$main::baseurl,
    "timeout=i" => \$main::timeout,
    "t=i"       => \$main::timeout,
    "agent=s"   => \$main::uagent,
    "a=s"       => \$main::uagent,
    "exclude=s" => \$main::exclude,
    "x=s"       => \$main::exclude,
    "depth=i"   => \$main::depth,
    "md=i"      => \$main::depth,
    "debug=i"   => \$main::debug,
    "d=i"       => \$main::debug
);

%main::done     = ();    # URLs that have already been handled
@main::urlstack = ();    # local pages currently being processed, innermost first
@main::xurl     = ();    # exclusion patterns taken from --exclude

unless ($main::baseurl) {
    die $0." [--host|-h] \n".
        "[[--timeout|-t] ] [[--agent|-a] ]\n".
        "[[--exclude|-x] ]\n".
        "[[--depth|-md] ]".
        "[[--debug|-d] ]\n";
}

if ($main::exclude) {
    @main::xurl = split / /, $main::exclude;
}

Get_local($main::baseurl);

exit(0);

# Get_local($request)
#
# Fetches a local page and, when the response is HTML, parses it and hands
# every extracted link to Do_links (which may recurse back into this sub).
# The URL is kept on @main::urlstack for the duration of the call so that the
# current depth and the page being processed are known.
sub Get_local {
    my $request = shift;
    my ($response, $cont);

    return if Done_url($request);

    if ($main::debug > 0) {
        print "Checking local link $request\n";
    }

    unshift @main::urlstack, $request;

    if ($main::debug > 2) {
        for my $entry (@main::urlstack) {
            print " $entry";
        }
        print "\n";
    }

    Do_check($request, \$response, \$cont);

    if ($response->content_type =~ /text\/html/) {
        # The second constructor argument is the base URL, so HTML::LinkExtor
        # hands absolute URLs to the callback.
        my $parser = HTML::LinkExtor->new(\&Do_links, $request);
        $parser->parse($cont);
    }

    shift @main::urlstack;
    return;
}

# Done_url($url)
#
# Returns 1 when $url must not be processed: the maximum depth has been
# exceeded, the URL matches one of the exclusion patterns, or it was already
# seen.  Otherwise records the URL as seen and returns 0.
sub Done_url {
    my $url = shift;

    if (defined($main::depth) && (scalar @main::urlstack > $main::depth)) {
        if ($main::debug > 1) {
            print "\n*** Reached Maximum Depth $main::depth ***\n";
            print "Ignoring $url.\n";
        }
        return(1);
    }

    # A plain array test replaces defined(@array), which is a fatal compile
    # error on Perl 5.22 and later.
    if (@main::xurl) {
        for my $pattern (@main::xurl) {
            # Exclusion entries are deliberately applied as regular expressions.
            if ($url =~ /$pattern/) {
                if ($main::debug > 1) {
                    print "\n$url matched $pattern. \nIgnoring.\n\n";
                }
                return(1);
            }
        }
    }

    return(1) if $main::done{$url};

    $main::done{$url} = 1;
    return(0);
}

# Do_remote($url)
#
# Verifies that an external link resolves; the response is not parsed.
sub Do_remote {
    my $url = shift;

    return if Done_url($url);

    if ($main::debug > 0) {
        print "Resolving external link $url\n";
    }
    Do_check($url);
    return;
}

# Do_check($url, \$data, \$content)
#
# Issues a GET for $url and prints a report line when the request does not
# succeed.  Both references are optional: when given, $$data receives the
# HTTP::Response object and $$content receives the response body.
sub Do_check {
    my $url     = shift;
    my $data    = shift;
    my $content = shift;

    my $ua = LWP::UserAgent->new;
    $ua->timeout($main::timeout) if $main::timeout;
    $ua->agent($main::uagent)    if $main::uagent;

    my $response = $ua->request(HTTP::Request->new(GET => $url));

    unless ($response->is_success) {
        # NOTE(review): Get_local pushes the URL onto the stack before calling
        # this sub, so for local pages the "in" part names the page itself
        # rather than the page that linked to it.
        printf "** GET %s\n in %s returned\n %s\n",
            $url, $main::urlstack[0], $response->status_line;
    }

    $$data    = $response          if $data;
    $$content = $response->content if $content;
    return;
}

# Do_links($tag, %links)
#
# HTML::LinkExtor callback: receives the tag name followed by the link
# attribute name/value pairs found on that tag.  Every "href" attribute, and
# the "src" attribute of <img> tags, is checked; URLs containing a fragment
# marker or a news:/mailto:/telnet: scheme are skipped.  URLs containing the
# base URL are crawled recursively, all others are checked as external links
# unless --noext was given.
sub Do_links {
    my ($tag, %links) = @_;

    # Walk every attribute in a stable order instead of looking only at
    # whichever pair happens to come first out of the hash.
    for my $attr (sort keys %links) {
        next unless $attr eq 'href' || ($tag eq 'img' && $attr eq 'src');

        my $url = "$links{$attr}";    # stringify a possible URI object
        next if $url =~ /(#|news:|mailto:|telnet:)/;

        # \Q...\E keeps '.', '?', '+' etc. in the base URL from acting as
        # regex metacharacters.
        if ($url =~ /\Q$main::baseurl\E/) {
            Get_local($url);
        }
        elsif ((!(defined $main::extern)) || ($main::extern == 1)) {
            Do_remote($url);
        }
    }
    return;
}