#! D:\perl\bin\perl.exe

###############################################################################
###############################################################################
#
# To extract image (IMG) alternative (ALT) text from an HTML page to a plain
#  text file for easy editing.
#
#			Version 1
#
#	by Andrew Hardwick, http://duramecho.com, 2002/4/2
#
#	Released under GNU Public Licence.
#
###############################################################################
###############################################################################
#
# How To Use
#
# Run from a command line with the source file path (relative to the current
#  working directory) as the argument.
# Output will be to a file with the same name as the source file with
#  '.AltText.txt' appended.
# Output format for each image is the image path, as used in the HTML IMG
#  SRC tag, followed by a line break followed by the ALT text followed by two
#  line breaks.
# If an image is found with no ALT text then it is given the ALT text
#  'NO_ALT_TEXT' in the output file.
#
###############################################################################
#
# Known Deficiencies
#
# The output format cannot cope with double line breaks in ALT text because
#  that is used as a record separator.
# If the same image file is accessed by two different paths (e.g. relative &
#  absolute) then it will be counted as two separate images.
# If the same image file is found with different ALT texts, only the last
#  one is used.
#
###############################################################################
###############################################################################

# Include libraries
use Cwd;		# To find current directory
use strict;		# Disenable automatic variables

###############################################################################
# Main routine
###############################################################################

{	# Main routine: read the HTML file named by the first command line
	#  argument, collect the SRC path & ALT text of every IMG element and
	#  write them to '<source file>.AltText.txt' as records of the form
	#  "path\nalt text\n\n".
	# Dies if no source file is given or if either file cannot be opened.
	use warnings;		# Lexically scoped, so only affects this block
	use File::Spec;		# To resolve the source path portably

	# Check a source file was named
	unless(@ARGV&&length($ARGV[0]))
	{	die("Usage: $0 <html-file>\n");}

	# Get data from HTML file
	# rel2abs resolves a relative path against the current working directory
	#  but leaves an absolute one alone (blindly prefixing the working
	#  directory would corrupt an absolute path).
	my $From=File::Spec->rel2abs($ARGV[0]);
	# 3 argument open so that odd characters in the file name cannot be
	#  misread as an open mode.
	open(my $HtmlFile,'<',$From)or
			die("Cannot open $From to read: $!");
	my $Html=do{local $/;<$HtmlFile>};	# Undefined $/ reads whole file
	close($HtmlFile);
	$Html='' unless(defined($Html));	# Nothing could be read

	# Find IMG elements
	print "Images found:\n";
	my %AltTexts;
	while($Html=~/(<IMG\b.*?>)/isg)
	{	my $ImgElement=$1;
		# Find SRC attribute. The look-behind stops attributes whose names
		#  merely end in 'SRC' (e.g. DATA-SRC) being mistaken for it.
		my($SrcQuote,$Src)=
				$ImgElement=~/(?<![\w-])SRC\s*=\s*(["'])(.*?)\1/is;
		unless(defined($Src))
		{	# Without a path there is nothing to key the ALT text on
			warn(" Skipping IMG element with no quoted SRC: $ImgElement\n");
			next;}
		print " $Src\n";
		# Find ALT attribute & store text
		my($AltQuote,$Alt)=
				$ImgElement=~/(?<![\w-])ALT\s*=\s*(["'])(.*?)\1/is;
		if(defined($Alt))
		{	# A later ALT text for the same path replaces an earlier one
			$AltTexts{$Src}=$Alt;}
		elsif(!exists($AltTexts{$Src}))
		{	# No ALT found so give it default text if none already found
			$AltTexts{$Src}='NO_ALT_TEXT';}}

	# Write extracted data to a file
	my $To=$From.'.AltText.txt';
	open(my $AltTextFile,'>',$To)or
			die("Cannot open $To to write: $!");
	foreach my $SrcPath (sort keys %AltTexts)
	{	print $AltTextFile "$SrcPath\n$AltTexts{$SrcPath}\n\n";}
	# Buffered write errors only show up at close so check it
	close($AltTextFile)or
			die("Cannot finish writing $To: $!");}

###############################################################################
