#!/usr/bin/perl -w

# link_check2.plx
#
# This is a modified HTML link checker. It descends recursively
# from $start_dir, processing all .htm or .html files to extract
# HREF and SRC attributes, then checks all that point to a local
# file to confirm that the file actually exists, and optionally
# uses LWP::Simple to do a HEAD check on remote ones for the
# same purpose. It then reports on the bad links.

use strict;
use File::Find;
use LWP::Simple;

# note: the first four configuration variables should *not*
# have a trailing slash (/)

my $start_dir    = '/w1/s/socalsail/expo'; # where to begin looking
my $hostname     = 'www.socalsail.com';    # this site's hostname
my $web_root     = '/w1/s/socalsail';      # path to www doc root
my $web_path     = '/expo';                # Web path to $start_dir
my $webify       = 1;                      # produce Web-ready output?
my $check_remote = 1;                      # check offsite links?

my %bad_links;
# a "hash of lists" with keys consisting of filenames,
# values consisting of lists of bad links in those files

my %good;
# A hash mapping absolute filenames (or remote URLs) to
# 0 or 1 (for good or bad). Used to cache the results of
# previous checks.

find(\&process, $start_dir); # this loads up the above hashes

# BUG FIX: $time was interpolated into both report formats below but was
# never declared or assigned (a strict-mode failure). Capture the
# report-generation timestamp here.
my $time = localtime;

if ($webify) {
    # print an HTML version of the report
    # NOTE(review): the HTML tags in this heredoc and in the prints below
    # were stripped from the damaged source; they have been reconstructed
    # from the surviving text layout — confirm against the original output.
    print <<"EndOfText";
<HTML><HEAD><TITLE>$hostname$web_path link_check report</TITLE></HEAD>
<BODY>
<H1>$hostname$web_path link_check report</H1>

<P>Report created at $time</P>

EndOfText

    foreach my $file (sort keys %bad_links) {
        my $pretty_file = $file;
        my $escaped_web_root = quotemeta $web_root;

        # show site-relative paths, and link each file to itself
        $pretty_file =~ s/$escaped_web_root//o;
        $pretty_file = qq{<H2><A HREF="$pretty_file">$pretty_file</A></H2>\n<UL>\n};
        print $pretty_file;

        foreach my $target (sort @{ $bad_links{$file} }) {
            $target =~ s/$escaped_web_root//o;
            print "<LI>$target\n";
        }
        print "\n</UL>\n\n";
    }
    print "</BODY></HTML>\n";

} else {
    # just print a plain-text version of the report
    print "$hostname$web_path link_check report\n";
    print "Report created at $time\n\n";
    foreach my $file (sort keys %bad_links) {
        print "$file:\n";
        foreach my $target (sort @{ $bad_links{$file} }) {
            print "  $target\n";
        }
        print "\n";
    }
}

sub process {
    # This is invoked by File::Find's find function for each
    # file it recursively finds. It extracts a list of HREF
    # and SRC attributes from an HTML file, checks them for
    # "badness", then stores the bad ones (keyed by filename)
    # in the %bad_links "hash of lists".

    return unless /\.html?$/;
    my $file = $File::Find::name;
    # warn "processing $file...\n";

    # 3-arg open with a lexical handle (the original used an unchecked-mode
    # 2-arg open on a bareword handle)
    my $in;
    unless (open $in, '<', $file) {
        warn "can't open $file for reading: $!, continuing...\n";
        return;
    }

    # slurp the whole file at once, courtesy of the undef'd $/
    my $data = do { local $/; <$in> };
    close $in;

    return unless $data; # don't care about empty files

    # pull out every double-quoted HREF= or SRC= attribute value
    my @targets = ($data =~ /(?:href|src)\s*=\s*"([^"]+)"/gi);
    @targets = convert($File::Find::dir, @targets);

    foreach my $target (@targets) {
        next if $target =~ /\n/;

        if (exists $good{$target}) {
            # we've already seen this one
            if ($good{$target}) {
                # already known to be good
                next;
            } else {
                # already known to be bad
                push @{ $bad_links{$file} }, $target;
            }

        } elsif ($target =~ /^http:/) {
            # a remote link we haven't seen before; a successful
            # HEAD request means the link is good
            if (head($target)) {
                $good{$target} = 1;
            } else {
                push @{ $bad_links{$file} }, $target;
                $good{$target} = 0;
            }

        } else {
            # a local link we haven't seen before; it's good if
            # the file exists on disk
            if (-e $target) {
                $good{$target} = 1;
            } else {
                push @{ $bad_links{$file} }, $target;
                $good{$target} = 0;
            }
        }
    }
}

sub convert {
    # This accepts the directory name of a file from
    # which a list of URLs was extracted (in the first argument)
    # and a list of URLs extracted from that file (in the
    # rest of the arguments). It returns a list of all the URLs
    # that did not point outside the local site absolutized into
    # local filesystem pathnames. Optionally, if $check_remote
    # is set to a true value, it also returns any links beginning
    # 'http:' that *do* point outside the local site, left in their
    # original form.

    my($dir, @urls) = @_;
    my @return_urls;
    my $escaped_hostname = quotemeta $hostname;

    foreach (@urls) {
        next if /^(ftp|mailto|https|news):/i; # skip these schemes

        if (/^http:/i) {
            # URL starts with 'http:'
            if (/^http:\/\/$escaped_hostname/io) {
                # local link; convert to local filename
                # in preparation for further conversion below
                s/^http:\/\/$escaped_hostname//io;
            } else {
                # remote link
                push @return_urls, $_ if $check_remote;
                next;
            }
        }

        if (/^\//) {
            # URL starts with '/': anchor it at the doc root
            $_ = $web_root . $_;
        } else {
            # URL is a relative path: anchor it at the source
            # file's directory
            $_ = $dir . '/' . $_;
        }

        s/#.*//;  # trim trailing #targets
        s/\?.*//; # trim trailing ?arguments
        push @return_urls, $_;
    }
    return @return_urls;
}