#! D:\perl\bin\perl.exe
###############################################################################
###############################################################################
#
# To extract image (IMG) alternative (ALT) text from an HTML page to a plain
# text file for easy editing.
#
# Version 1
#
# by Andrew Hardwick, http://duramecho.com, 2002/4/2
#
# Released under GNU Public Licence.
#
###############################################################################
###############################################################################
#
# How To Use
#
# Run from a command line with the source file path (relative to the current
# working directory) as the argument.
# Output will be to a file with the same name as the source file with
# 'AltText.txt' appended (directly, with no separating dot).
# Output format for each image is the image path, as used in the HTML IMG
# SRC tag, followed by a line break followed by the ALT text followed by two
# line breaks.
# If an image is found with no ALT text then it is given the ALT text
# 'NO_ALT_TEXT' in the output file.
#
###############################################################################
#
# Known Deficiencies
#
# The output format cannot cope with double line breaks in ALT text because
# that is used as a record separator.
# If the same image file is accessed by two different paths (e.g. relative &
# absolute) then it will be counted as two separate images.
# If the same image file is found with different ALT texts, only the last
# one is used.
#
###############################################################################
###############################################################################

# Include libraries

use Cwd;        # To find current directory
use strict;     # Disable automatic variables
use warnings;   # Report uninitialised values and other likely mistakes

###############################################################################
# Main routine
###############################################################################

# Reads the HTML file named by the first command-line argument (taken relative
# to the current working directory), collects the ALT text of every IMG element
# keyed by its SRC path, lists each SRC found on standard output, and writes
# the collected pairs to '<source path>AltText.txt'.
# Dies if no argument is given or if either file cannot be opened or written.
{
    # Refuse to run without a source file rather than trying to open the
    # current directory itself.
    die("Usage: $0 <html-file>\n")
        unless(@ARGV && defined($ARGV[0]) && length($ARGV[0]));

    # Get data from HTML file
    my $From=cwd().'/'.$ARGV[0];
    open(my $HtmlFile,'<',$From) or die("Cannot open $From to read: $!");
    # Undefining the record separator makes the read return the whole file.
    my $Html=do { local $/; <$HtmlFile> };
    $Html='' unless(defined($Html));
    close($HtmlFile);

    # Find IMG elements
    print "Images found:\n";
    my %AltTexts;
    while($Html=~/(<IMG\b[^>]*>)/ig)
    {
        my $ImgElement=$1;

        # Find SRC attribute. The lookbehind stops the pattern from matching
        # the tail of a longer attribute name such as LOWSRC or DATA-SRC.
        unless($ImgElement=~/(?<![\w\-])SRC\s*=\s*(["'])(.*?)\1/is)
        {
            # Without a quoted SRC there is no key to file the ALT text
            # under, so report the element and move on.
            warn("Skipping IMG element with no quoted SRC: $ImgElement\n");
            next;
        }
        my $Src=$2;
        print " $Src\n";

        # Find ALT attribute & store text. The lookbehind prevents a false
        # match on attribute names that merely end in ALT (e.g. DEFAULT).
        if($ImgElement=~/(?<![\w\-])ALT\s*=\s*(["'])(.*?)\1/is)
        {
            # A later occurrence of the same SRC overwrites an earlier one.
            $AltTexts{$Src}=$2;
        }
        elsif(!exists($AltTexts{$Src}))
        {
            # No ALT found so give it default text if none already found
            $AltTexts{$Src}='NO_ALT_TEXT';
        }
    }

    # Write extracted data to a file. The suffix is appended to the source
    # path exactly as given, with no separator added.
    my $To=cwd().'/'.$ARGV[0].'AltText.txt';
    open(my $AltTextFile,'>',$To) or die("Cannot open $To to write: $!");
    foreach my $SrcPath (sort keys %AltTexts)
    {
        # One record per image: path, line break, ALT text, blank line.
        print {$AltTextFile} "$SrcPath\n$AltTexts{$SrcPath}\n\n";
    }
    # Buffered write errors only surface at close, so check it.
    close($AltTextFile) or die("Cannot finish writing $To: $!");
}

###############################################################################