#!/usr/bin/perl -w

# link_check3.plx

# This is a third version of an HTML link checker.
# Beginning with a URL (required as a command line argument),
# it spiders out the entire site (or as much of it as it can
# reach via links followed recursively from the starting page),
# checking all HREF and SRC attributes to make sure they work
# using GET and HEAD requests from LWP::UserAgent. It then
# reports on the bad links.

use strict;
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;
use URI::URL;         # required by HTML::LinkExtor, when invoked with base

my $from_addr  = 'your@address.here'; # email address for user_agent
my $agent_name = 'link_check3.plx';   # name that robot will report
my $delay      = 1;                   # number of seconds between requests
my $timeout    = 5;                   # number of seconds to timeout a request
my $max_pages  = 1000;                # maximum number of pages to process
my $webify     = 1;                   # produce Web-ready output?
my $debug      = 1;                   # produce debugging output on STDERR?

my %bad_links;    # A "hash of arrays" with keys consisting of URLs
                  # under $start_base, values consisting of lists of
                  # bad links on those pages.
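# For illustration only (hypothetical URLs), an entry in %bad_links
# might end up looking like:
#
#   'http://www.example.com/staff.html' =>
#       [ 'http://www.example.com/old_logo.gif',
#         'http://elsewhere.example.org/gone.html' ]
#
# i.e. each key is a page that was processed, and its value is a
# reference to the list of bad links found on that page.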

my %good;         # A hash mapping URLs to
                  # 0 or 1 (for bad or good). Caches the results of
                  # previous checks so they needn't be repeated for
                  # subsequent pages.

my @queue;        # An array containing a list of URLs
                  # (under $start_url) to be checked.

my $total_pages;  # Contains the count of pages processed so far.

# Configuration ends. Script proper begins.

my $last_request = 0;  # Time of the last request, for $delay

# first, construct the user agent

my $ua = LWP::UserAgent->new;
$ua->agent("$agent_name " . $ua->agent);
$ua->from($from_addr);
$ua->timeout($timeout);  # set timeout interval

# now process the command-line argument

my $start_url = shift or die "Usage: $0 http://start.url.com/\n";

my($success, $type, $actual) = &check_url($start_url);
unless ($success and $type eq 'text/html') {
    die "The start_url isn't reachable, or isn't an HTML file.\n";
}
$good{$start_url} = 1;
push @queue, $start_url;

my $start_base = $start_url;
$start_base =~ s{/[^/]*$}{/};  # trim everything after last '/'
my $escaped_start_base = quotemeta $start_base;
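# For example (hypothetical URL), a $start_url of
#
#   http://www.example.com/staff/index.html
#
# yields a $start_base of http://www.example.com/staff/ ; quotemeta
# backslash-escapes any regex metacharacters in it so the base can be
# matched literally in the pattern matches below.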

while (@queue) {

    ++$total_pages;
    if ($total_pages > $max_pages) {
        warn "stopped checking after reaching $max_pages pages.\n";
        --$total_pages;  # kludge so the count is correct in report
        last;
    }

    my $page = shift @queue;
    &process_page($page);  # possibly adding new entries to @queue
}

# print the report

my $time = localtime;

if ($webify) {

    # print an HTML version of the report

    print <<EndOfText;
<HTML>
<HEAD>
<TITLE>$start_url $0 report</TITLE>
</HEAD>
<BODY>
<H1>$start_url $0 report</H1>
<P>Report created at $time</P>
<HR>
EndOfText

    foreach my $file (sort keys %bad_links) {
        print "<H2>$file</H2>\n";
        foreach my $target (sort @{ $bad_links{$file} }) {
            print " $target<BR>\n";
        }
        print "\n<HR>\n\n";
    }
    print "</BODY></HTML>\n";

} else {

    # just print a plain-text version of the report

    print "$start_url $0 report\n";
    print "Report created at $time\n\n";

    foreach my $file (sort keys %bad_links) {
        print "$file:\n";
        foreach my $target (sort @{ $bad_links{$file} }) {
            print " $target\n";
        }
        print "\n";
    }
}

# script proper ends. subroutines follow.

sub check_url {
    # Check that this URL is valid, using the HEAD (and GET, if
    # HEAD fails) methods. Returns a 3-element array:
    # ($success, $type, $actual).

    my $url = shift;

    if ($debug) { warn " checking $url...\n"; }

    unless (defined $url) {
        return ('', '', '');
    }

    sleep 1 while (time - $last_request) < $delay;
    $last_request = time;
    my $response = $ua->request(HTTP::Request->new('HEAD', $url));
    my $success  = $response->is_success;

    unless ($success) {
        # try a GET request; some hosts don't like HEAD
        sleep 1 while (time - $last_request) < $delay;
        $last_request = time;
        $response = $ua->request(HTTP::Request->new('GET', $url));
        $success  = $response->is_success;
    }

    if ($debug) {
        if ($success) {
            warn " ...good.\n";
        } else {
            warn " ...bad.\n";
        }
    }

    my $type = $response->header('Content-Type');
    my $actual;
    if ($success) {
        $actual = $response->base;  # redirected?
    }
    return ($success, $type, $actual);
}
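# A hypothetical call to &check_url, for illustration:
#
#   my($success, $type, $actual) = &check_url('http://www.example.com/foo.html');
#
# $success is true if the HEAD (or fallback GET) request succeeded,
# $type is the reported Content-Type (e.g. 'text/html'), and $actual
# is the response's base URL, which may differ from the URL requested
# if the server redirected us.
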
sub process_page {
    # Invoked with a single argument of a page under
    # the $start_base that needs to be processed. That page
    # will be (1) retrieved via GET, (2) parsed for any links it
    # contains, and (3) have those links checked for validity themselves
    # (with any that fail being written to %bad_links, and any that
    # point to valid, unchecked HTML files under $start_base
    # being added to @queue). The subroutine has no return value.

    my $page = shift;
    return unless defined $page;

    if ($debug) { warn "processing $page for links\n"; }

    sleep 1 while (time - $last_request) < $delay;
    $last_request = time;
    my $response = $ua->request(HTTP::Request->new('GET', $page));

    unless ($response->is_success
            and $response->header('Content-Type') eq 'text/html') {
        # strange, since it passed these tests
        # via HEAD request in order to get into @queue
        $good{$page} = 0;
        return;
    }

    my $base = $response->base;
    unless ($base =~ /$escaped_start_base/o) {
        # looks like we were redirected away from $start_base
        return;
    }

    my $parser = HTML::LinkExtor->new(undef, $base);
    $parser->parse($response->content);
    my @links = $parser->links;
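    # Each element returned by HTML::LinkExtor's links() method is an
    # array reference of the form [$tag, $attr => $url, ...]; for
    # example (hypothetical URL):
    #
    #   [ 'img', 'src' => 'http://www.example.com/images/logo.gif' ]
    #
    # Because the parser was given $base above, relative links have
    # already been converted to absolute URLs.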

    foreach my $linkarray (@links) {
        my ($tag, %links) = @{$linkarray};
        if ($tag =~ /^(a|img|frame)$/) {
            TARGET: while (my($attr, $target) = each %links) {
                if ($attr =~ /^(href|src|lowsrc)$/) {
                    # these $target entries are the ones we're
                    # interested in.

                    next TARGET unless $target =~ /^(?:https?|ftp):/;
                    $target =~ s/#.*//;  # lose trailing #targets

                    if (exists $good{$target}) {
                        # have already seen this before
                        if ($good{$target}) {
                            # already known to be good
                            next;
                        } else {
                            # already known to be bad
                            push @{ $bad_links{$base} }, $target;
                        }
                    } else {
                        # haven't seen this one before
                        my($success, $type, $actual) = &check_url($target);
                        unless ($success) {
                            $good{$target} = 0;
                            push @{ $bad_links{$base} }, $target;
                            next TARGET;
                        }
                        $good{$target} = 1;
                        if (defined $type and $type eq 'text/html'
                            and defined $actual
                            and $actual =~ /$escaped_start_base/o) {
                            push @queue, $target;
                        }
                    }
                }
            }
        }
    }
}
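# A typical invocation (hypothetical URL), saving the report to a file:
#
#   ./link_check3.plx http://www.example.com/ > report.html
#
# With $webify set to 0, the same command writes a plain-text report
# to STDOUT instead; debugging progress messages go to STDERR while
# $debug is set to 1.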