#!/usr/local/bin/perl -w # Copyright 1999 by John Halleck # All rights reserved require 5.0; # not a prayer with perl 4 use strict; use lib qw(/home/nahaj/lib); use paths qw(&makepathrel &makepathabs); use urls qw(&makeurlabs &makeurlrel); use urlparse qw(&parseuri); use vars qw($progversion %links %names %backlinks %linkexists $title $doingtitle $baseurl $nextchar $urlbase $fail %exists @values $item $newbase $inversetag ); # Walk around examining files for links. my $progversion = "HTML LINK directory 1.7 February 25th, 2000"; # Get configuration info for dir vs. html mapping. use configdirsvshtml qw($thissite &homehtmldir &urlfromdirectory &directoryfromurl); use treewalk qw(&doit &bustname $needspace $wantabs $wanthelp $wantdots $wantfull $wantbig $wantsize $wantperms $wantdirs $wantnogrf $maxdepth $nodirmessage ); # -------------------------- On with the show. -------------------------------- # ------------------ $wantabs = 0; # Want fully qualified URL's only. %links = (); %names = (); %backlinks = (); %linkexists = (); $title = ''; $doingtitle = 0; $baseurl = ''; # --- Link processor; # (This section is a kludge, and not really correct... # # Someday it needs to be replaced by something # # more reasonable) $nextchar = ''; sub nextchar { if (eof THISFILE) { $nextchar = ''; return '' } $nextchar = getc THISFILE; $nextchar = '' if !defined $nextchar; return $nextchar; } sub skipwhite { while ($nextchar ne '' && $nextchar =~ m/^\s$/) { &nextchar() } } sub getquoted { my $quote = shift; my $result = ''; &nextchar(); while ($nextchar ne '' && $nextchar ne $quote) { if ($nextchar eq "\n") { $result .= '\\n' } elsif ($nextchar eq "\r") { $result .= '\\r' } elsif ($nextchar eq "\b") { $result .= '\\b' } elsif ($nextchar eq "\f") { $result .= '\\f' } else { $result .= $nextchar } if ($nextchar eq "\n") { last; } &nextchar(); } &nextchar() if $nextchar eq $quote || $nextchar eq "\n";; return $result; } sub gettoken { my $token = ''; &skipwhite(); if ($nextchar eq '\'' || $nextchar eq '"') { return &getquoted($nextchar); } while ( $nextchar ne '' && $nextchar ne '=' && $nextchar ne '>' && $nextchar !~ m/^\s/) { $token .= $nextchar; &nextchar(); } &skipwhite(); if ($token eq '' && $nextchar eq '=') { &nextchar(); &skipwhite(); return '='} return lc $token; } sub getpair { my $value = ''; my $name = &gettoken(); if ($nextchar eq '=') { &nextchar(); $value = &gettoken(); } return ($name, $value); } sub skiprestoftag { my $junk; while (&gettoken() ne '') {} if ($nextchar eq '>') {&nextchar()} } # ============== Deal with URL's. # --- Scan a file looking for HTML links. sub scanforlinks { my $depth = shift; my $directory = shift; my $name = shift; my $newbase; if ($depth == 0) {$newbase = $urlbase} else { $newbase = &urlfromdirectory($directory) } my ($scheme, $authority, $path, $query, $fragment) = &parseuri ($newbase); $scheme = 'http' if !defined $scheme; $authority = $thissite if !defined $authority; $linkexists{$newbase} = 1; while ($nextchar ne '') { while ($doingtitle && $nextchar ne '' && $nextchar ne '<') { $title .= $nextchar; &nextchar(); } while ($nextchar ne '<' && $nextchar ne '') { &nextchar() } if ($nextchar eq '<') { $inversetag = 0; &nextchar(); &skipwhite(); if ($nextchar eq '/') { $inversetag = 1 ; &nextchar(); &skipwhite();} elsif ($nextchar eq '!') { &skiprestoftag(); next; } my ($tagname, $value) = &getpair(); # Shouldn't actually have a value. if ($tagname eq 'title') { $doingtitle = !$inversetag; &skiprestoftag(); next; } # Parsing: my $attributename = $tagname; while ($attributename ne '') { ($attributename, $value) = &getpair(); if ($attributename eq 'name' && $tagname eq 'a') { $value = '#'. $value } my $absvalue = $value; if (($attributename eq 'name' && $tagname eq 'a') || $attributename eq 'href' || $attributename eq 'src') { $absvalue = &makeurlabs($value, $scheme, $authority, $path, $query, $fragment); if (!defined $absvalue || $absvalue eq '') { $absvalue = $value } my $referringlink = $absvalue; if ($attributename eq 'name' && $tagname eq 'a') { $linkexists{$referringlink} = 1 if defined $value && $value ne ''; $names{$value} = 1; } elsif ($attributename eq 'href' && $tagname eq 'base') { $referringlink = $newbase; $newbase = $absvalue; ($value, $scheme, $authority, $path, $query, $fragment) = &parseuri ($newbase); $scheme = 'http' if !defined $scheme; $authority = $thissite if !defined $authority; } if (! defined $links{$absvalue}) { $backlinks{$absvalue} = '' if !defined $backlinks{$absvalue}; $backlinks{$absvalue} .= ' - ' . $newbase . "\n"; } $links{$absvalue} = 1; } } # endwhile } } return $newbase; } # --- Global initialization and finalization sub init { # Any global initialization. return; } sub fini { # Any global finishing code. return; } # ----- What to do for each argument the user gives. sub arginit { # Any per argument initialization my $argument = shift; $baseurl = &urlfromdirectory($argument); %links = (); %backlinks = (); # no need to clear out "linkexists" since this argument # may overlap last one. return; } sub argfini{ # Any per argument finish code. my $argument = shift; my $url; if (%backlinks) { print "\n----------- Back Links -------------------------\n"; foreach $url (sort(keys %backlinks)) { print $url; next if !defined $url || $url eq ''; my ($scheme, $authority, $path, $query, $fragment) = &parseuri ($url); # MISTAKE? if (!defined $scheme || $scheme eq '' || $scheme !~ m/^https?$/) { print "\n"; next } if (!defined $linkexists{$url}) { # The link isn't one we saw, should we have? my $actual = &directoryfromurl ($url); if (defined $actual && $actual ne '') { if (!-e $actual && !-e $actual . '/index.html' && !-e $actual . '/.index.html') { print " *** url doesn't exist? ($actual) ***\a"; $linkexists{$url} = 1; } else { if (-e $actual && -d $actual && $actual !~ m:/$:) { print " *** Directory URL's should end in \"/\"***\a" } if (defined $fragment && $fragment ne '') { # And the full fragmented url doesn't exist print " *** Search tag doesn't seem to exist ***\a"; } } } } print "\n"; print $backlinks{$url}, "\n"; } print "-----------------------------------------\n"; } return; } # --- Do we want to see this file? sub displayfile { # Return 0 to make this file invisible. my $fullname = shift; my $inode = shift; $inode = '' if !defined $inode; return 1; } # --- Do we want to see this directory? sub wantdir { # Return 0 to force cull of tree here. return 1; } # --- Process a directory. sub enterdirectory { my $indent = shift; my $dirname = shift; my $url = ''; if (!-e $dirname) { return } if (-d $dirname) { $url = &urlfromdirectory($dirname); if ($url ne '') { if ($url =~ m;^(.*/)\.?index\.html?$;) { $url = $1 } print $indent, "Directory URL: ", $url; print " *** No default index.html file\a" if !-e "${dirname}index.html"; print "\n"; $needspace = 1; } } return; } # --- Process a directory. sub exitdirectory { my $indent = shift; my $dirname = shift; my ($dir, $file) = &main::bustname ($dirname); return } # --- Process a file. sub processfile { my $depth = shift; my $indent = shift; my $fullname = shift; my $url = ''; my $baseurl = ''; $needspace = 0; %links = (); %names = (); if (!-f $fullname) { return 0 } if (-d $fullname) { return 0 } if ($fullname !~ m/\.(html|htm)$/i) { return 0 } $title = ''; $doingtitle = 0; if (open (THISFILE, $fullname)) { &nextchar(); $newbase = &scanforlinks ($depth, $fullname); if ($title ne '') { $title =~ s/\n/\\n/g; $title =~ s/^\s/\\s/; $title =~ s/\s$/\\s/; print $indent, "Title: ", $title, "\n"; $needspace = 1; } $url = &urlfromdirectory($fullname); # print "Directory URL $url formed from directory $fullname\n"; if (defined $url && $url ne '') { if ($url =~ m;^(.*/)index\.html$;) { $url = $1 } print $indent, "URL: ", $url, "\n"; } $newbase = $url if !defined $newbase; if (defined $url && $url ne '' && $url ne $newbase && $newbase ne "$url/index.html") { print $indent, "BASE URL: ", $newbase, "\n"; } if (%names) { foreach $item (sort (keys (%names))) { print $indent, 'Name Anchor: ', $item, "\n"; } } if (%links) { my ($scheme, $authority, $path, $query, $fragment) = &parseuri($newbase); my $item; my $finalitem; my $filedir; $url = &urlfromdirectory($fullname); my %rmap = (); if ($wantabs) { @values = sort (keys (%links)); } else { my $rel; foreach $item (keys (%links)) { $rel = &makeurlrel($item, $scheme, $authority, $path, $query, $fragment); if ($rel eq './' or $rel eq '.' or $rel eq '/' or $rel =~ m/^(\.\.\/)+$/) { $rel = $item; # print "DEBUG: Path \"$item\" becomes $rel\n"; } $rmap {$rel} = $item; } @values = sort (keys (%rmap)); } my $abs; foreach $item (sort (@values)) { if ($wantabs) { $abs = $item } else { $abs = $rmap{$item} } print $indent, 'HREF: ', $item; if (!$exists{$abs}) { if ($item =~ m/\\n/) { print " *** URL contains new line character\a" } if ($item =~ m/\\r/) { print " *** URL contains a carriage return\a" } if ($item =~ m/\\b/) { print " *** URL contains a back space character\a" } if ($item =~ m/\\f/) { print " *** URL contains a form feed character\a" } $filedir = &directoryfromurl ($item); if (defined $filedir && $filedir ne '') { if (!-e $filedir) { print " *** Doesn't exist ($filedir)\a" } else { $exists{$abs} = 1 } # We really need to do the other checks here. } } print "\n"; } } $needspace = 1; close (THISFILE) or die "Unable to close $fullname ($!)\n"; } return; } # ---- Deal with directories. sub processdir { my $indent = shift; my $fullname = shift; $needspace = 0; # print "$indent + DEBUG: processfile called with $fullname\n"; return; } # ---- Deal with program optons. $fail = 0; sub processuseropt { # No options specific to this package. my $opt = shift; # print "DEBUG: Process user options called with $opt\n"; if ($opt eq 'base') { $urlbase = shift @ARGV; if (!defined $urlbase) { warn "??? No name given for URL base\n"; $fail ++; } else { if ($urlbase !~ m;/$;) {$urlbase .= '/'} return 1; } } elsif ($opt eq 'abs') { $wantabs = 1; return 1; } return 0; # return 1 if you processed the option. } sub printuseropt { # Ditto # print "DEBUG: printuseroption called\n"; warn " -base xxxx Fully qualified URL base to use.\n"; warn " -abs Always do absolute instead of relative URL's\n"; return; } # --- What to do if the user gives nothing. sub defaultfile { # what to do if the user specified nothing. # print "DEBUG: Default file called... $homehtmldir\n"; return &homehtmldir(); } # --- Fix up file name. sub mungename { my $given = shift; return $given; } # -------------------- &doit ($progversion); exit 0;