#!/usr/bin/perl -w

# link_check.plx

# this is a first version of an HTML link checker.
# it descends recursively from $start_dir, processing
# all .htm or .html files to extract HREF and SRC
# attributes, then checks all that point to a local
# file to confirm that the file actually exists.

use strict;
use File::Find;

# configuration section:

# note: the first four configuration variables should *not*
# have a trailing slash (/)

my $start_dir = '/w1/s/socalsail/expo'; # where to begin looking
my $hostname  = 'www.socalsail.com';    # this site's hostname
my $web_root  = '/w1/s/socalsail';      # path to www doc root
my $web_path  = '/expo';                # Web path to $start_dir
my $webify    = 1;                      # produce Web-ready output?

# end of configuration section

my %bad_links; # A "hash of arrays" whose keys are the full paths of
               # pages under $start_dir, and whose values are lists
               # of the bad links found on those pages.

my %good;      # A hash mapping checked filesystem paths to 1 (good)
               # or 0 (bad). Used to cache the results of previous
               # checks so they needn't be repeated for subsequent
               # pages.

find(\&process, $start_dir); # this loads up the above hashes
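
# For illustration only (the paths below are hypothetical, not real
# output): after find() returns, %bad_links has a shape like
#
#     '/w1/s/socalsail/expo/index.html' => [
#         '/w1/s/socalsail/expo/images/missing.gif',
#         '/w1/s/socalsail/expo/old_page.html',
#     ],
#
# that is, each key is the full path of a page containing at least one
# bad link, and each value is a reference to an array of the bad
# targets found on that page. You can inspect it with the core
# Data::Dumper module (use Data::Dumper; print Dumper(\%bad_links);).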

my $time = localtime;

if ($webify) {

    # print an HTML version of the report

    print <<"EndOfText";
<HTML>
<HEAD>
<TITLE>$hostname$web_path link_check report</TITLE>
</HEAD>
<BODY>
<H1>$hostname$web_path link_check report</H1>
<P>Report created at $time</P>
<HR>
EndOfText

    foreach my $file (sort keys %bad_links) {

        my $pretty_file = $file;
        my $escaped_web_root = quotemeta $web_root;
        $pretty_file =~ s/$escaped_web_root//o;
        $pretty_file =
            "<H3><A HREF=\"$pretty_file\">$pretty_file</A></H3>\n";
        print $pretty_file;

        foreach my $target (sort @{ $bad_links{$file} }) {
            $target =~ s/$escaped_web_root//o;
            print "$target<BR>\n";
        }
        print "\n<HR>\n\n";
    }

    print "</BODY></HTML>\n";

} else {

    # just print a plain-text version of the report

    print "$hostname$web_path link_check report\n";
    print "Report created at $time\n\n";

    foreach my $file (sort keys %bad_links) {
        print "$file:\n";
        foreach my $target (sort @{ $bad_links{$file} }) {
            print "    $target\n";
        }
        print "\n";
    }
}

sub process {
    # this is invoked by File::Find's find function for each
    # file it recursively finds. it extracts a list of HREF
    # and SRC attributes from an HTML file, converts those
    # to local filesystem paths using the convert subroutine,
    # checks them for "badness", then stores the bad ones in
    # the %bad_links "hash of arrays".

    return unless /\.html?$/;

    my $file = $File::Find::name;

    unless (open IN, $file) {
        warn "can't open $file for reading: $!, continuing...\n";
        return;
    }
    local $/;
    my $data = <IN>; # slurp it all at once, courtesy of the undef in $/
    close IN;

    return unless $data;

    my @targets = ($data =~ /(?:href|src)\s*=\s*"([^"]+)"/gi);

    @targets = &convert($File::Find::dir, @targets);

    foreach my $target (@targets) {

        if (exists $good{$target}) {
            # we've already seen this one
            if ($good{$target}) {
                # already known to be good
                next;
            } else {
                # already known to be bad
                push @{ $bad_links{$file} }, $target;
            }
        } else {
            # haven't seen this one yet
            if (-e $target) {
                $good{$target} = 1;
            } else {
                $good{$target} = 0;
                push @{ $bad_links{$file} }, $target;
            }
        }
    }
}

sub convert {
    # This accepts the directory of the file from which a list of
    # URLs was extracted (in the first argument) and the URLs
    # themselves (in the rest of the arguments). It returns, as
    # local filesystem paths, those URLs that point within the
    # local site and are not ftp:, mailto:, https:, or news: URLs.

    my($dir, @urls) = @_;
    my @return_urls;
    my $escaped_hostname = quotemeta $hostname;

    foreach (@urls) {
        next if /^(ftp|mailto|https|news):/i;
        if (/^http:/i) {
            # URL starts with 'http:'
            next unless /^http:\/\/$escaped_hostname/io;
            s/^http:\/\/$escaped_hostname//io;
        }
        if (/^\//) {
            # URL starts with '/'
            $_ = $web_root . $_;
        } else {
            # URL is a relative path
            $_ = $dir . '/' . $_;
        }
        s/#.*//;  # trim trailing #targets
        s/\?.*//; # trim trailing ?arguments
        push @return_urls, $_;
    }
    @return_urls;
}
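
__END__

The lines after __END__ are documentation, not code. They sketch, for a
few hypothetical URLs found in a page in /w1/s/socalsail/expo, what the
convert subroutine produces given the configuration values above:

    /expo/index.html                            -> /w1/s/socalsail/expo/index.html
    images/logo.gif                             -> /w1/s/socalsail/expo/images/logo.gif
    http://www.socalsail.com/expo/faq.html#top  -> /w1/s/socalsail/expo/faq.html
    http://elsewhere.example.com/page.html      -> dropped (points outside this site)
    mailto:webmaster@socalsail.com              -> dropped (non-HTTP scheme)

Each resulting path is then tested with -e in the process subroutine;
anything that does not exist is recorded in %bad_links under the page
on which it appeared.

One way to run the script (an assumption, not part of the original) is
to redirect its webified output to a file under the web root, for
example:

    perl link_check.plx > /w1/s/socalsail/expo/link_check_report.html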