#! D:\perl\bin\perl.exe

###############################################################################
###############################################################################
#
# To remove the mass of unwanted formatting & other crap that M$ Word 
#	puts in HTML when converting a Word document to HTML
#
#	by Andrew Hardwick, http://duramecho.com,
#	Released under GNU Public Licence.
#
###############################################################################
###############################################################################
# Version 1, 2001/12/12
# Version 2, 2002/3/22
#  More garbage removal added.
# Version 3, 2002/8/11
#  More garbage removal added.
# Version 4, 2005/3/5
#  Just added this version history section into the comments.
# Version 5, 2008/3/21
#  Made it convert from M$ Windows codepage 1252 character set to UTF-8.
###############################################################################
###############################################################################
# How To Use
# Run from a command line with the source file name as argument.
# Output is to the same directory with file name prepended with 'Stripped'.
# The following things still need manual correction:
#  Remove small caps formatting before converting to HTML as Word converts
#   the characters instead of applying it as formatting.
#  Get rid of the <SUB>...</SUB> subscripting of pictures created from
#	Equation Editor equations.
#  Convert bulleted lists back from the paragraphs with dots that Word
#   saves them as to HTML bulleted <UL><LI>...</LI></UL> lists.
###############################################################################
###############################################################################

# Include libraries
use strict;		# Disenable automatic variables
use Cwd;		# To find current directory
use Encode;		# For changing character encodings
use File::Basename;	# To split a path into directory & file name
use File::Spec;		# To resolve & build paths portably

# Get data from file
#  The source file name is the first command line argument. It may be
#  relative (resolved against the current directory) or absolute.
if(!defined($ARGV[0])||$ARGV[0] eq '')
	{die("Usage: $0 <Word HTML file name>\n");}
my $From=File::Spec->rel2abs($ARGV[0]);
#  Decode from Windows codepage 1252 whilst reading so that $Html holds
#  characters rather than raw bytes.
open(my $FileToRead,'<:encoding(windows-1252)',$From)||
		die("Cannot open $From to read: $!");
#  Read the whole file in one go (undefined $/ means no line splitting)
my $Html=do{local $/;<$FileToRead>};
$Html='' unless defined($Html);
close($FileToRead);

# The substitutions below are applied to the whole document in $Html, in
#  order. Later ones rely on the simplifications made by earlier ones (e.g.
#  all <p ...> tags have become plain <p> before table cells are tidied).

# Remove HTML/XML comments
$Html=~s/<!--.*?-->//gs;
# Remove XML (& anything else starting with '<!', e.g. an existing DOCTYPE)
$Html=~s/<!.*?>//gs;
# Remove Meta elements
$Html=~s/<meta\s.*?>//gsi;
# Remove visual formatting
#  Opening tags are removed with or without attributes so that no unmatched
#  <span> is left behind when every </span> is removed.
$Html=~s/<span(?:\s.*?)?>//gsi;
$Html=~s/<\/span>//gsi;
# Remove the <o:p> tags (content between them is kept)
$Html=~s/<o:p>//gsi;
$Html=~s/<\/o:p>//gsi;
# Remove stylesheet stuff
$Html=~s/<p\s.*?>/<p>/gsi;
$Html=~s/<link\s.*?>//gsi;
$Html=~s/<style\W.*?<\/style>//gsi;
# Remove style attributes (only those containing 'mso-' properties)
$Html=~s/\sstyle\=\'[^']*?mso-.*?\'//gsi;
# Remove some more garbage
$Html=~s/<html\s.*?>/<html>/gsi;
$Html=~s/<body\s.*?>/<body>/gsi;
# Remove visual formatting
#  As with <span>, bare <div> tags are removed too so that opening & closing
#  tags stay balanced.
$Html=~s/<div(?:\s.*?)?>//gsi;
$Html=~s/<\/div>//gsi;
# Remove duplicate spaces
$Html=~s/ +/ /gsi;
$Html=~s/(\&nbsp\;){2,}/\&nbsp\;/gsi;
$Html=~s/ \<p\>/\<p\>/gsi;
$Html=~s/\n{2,}/\n/gsi;
# Remove extra spaces after heading numbers
#  Only <h1>...<h6> are considered. Between the heading tag & the &nbsp;
#  there may be tags (e.g. the <a name="_Toc..."> anchor) & non-space text
#  (the number) but the match never runs past the heading's closing tag, so
#  &nbsp; in ordinary paragraphs & table cells is left alone.
$Html=~s/(<h[1-6](?:\s[^>]*)?>(?:[^\s<]|<(?!\/h[1-6])[^>]*>)*?)\&nbsp\;/$1/gsi;
# Remove shape info from image tags
$Html=~s/\sv\:shapes\=\".*?\"\>/\>/gsi;
# Remove unused link targets (only footnote ones are linked to!)
#  NOTE(review): '<h.*?>' is not limited to heading tags & can extend to a
#  later '>', so _Toc anchors outside headings can be stripped as well. Left
#  as is because the aim is to drop all of these unused targets.
$Html=~s/(<h.*?>)\s*?\<a\s+name\=\"_Toc\d+?\"\>(.*?)\<\/a\>/$1$2/gsi;
$Html=~s/<a\s+name\=\"_Ref\d+?\"\>Figure\s\<\/a\>/Figure /gsi;
$Html=~s/<a\s+name\=\"_Ref\d+?\"\>Table\s\<\/a\>/Table /gsi;
$Html=~s/<a\s+name\=\"_Ref\d+?\"\>\s?\<\/a\>//gsi;
# Remove table cell formatting
$Html=~s/(<td[^>]*?)\swidth\=\d+/$1/gsi;
$Html=~s/(<td[^>]*?)\svalign\=\w*/$1/gsi;
$Html=~s/(<td[^>]*?)\sstyle\=\'.*?\'/$1/gsi;
# Remove unnecessary paragraph marks from table cells
#  Only done when the cell holds exactly one paragraph: the captured content
#  may not contain another <p> or </p>, otherwise a cell with several
#  paragraphs would be left with unbalanced </p><p> in the middle.
$Html=~s/(<td[^>]*>)\s*<p>((?:(?!<\/?p[\s>]).)*?)<\/p>\s*(<\/td>)/$1$2$3/gsi;

# Put an HTML 4.01 Strict DOCTYPE declaration on its own line at the very top
my $DocType=join('',
	'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ',
	'"http://www.w3.org/TR/html4/strict.dtd">');
$Html="$DocType\n$Html";
# Numeric character references for non-breaking spaces become named ones
$Html=~s/\&\#160\;/\&nbsp\;/gsi;
# Declare the character set as UTF-8 on a line just before the first </head>
my $CharsetMeta=
	"<meta http-equiv='content-type' content='text/html; charset=utf-8'>";
$Html=~s/(<\/head>)/$CharsetMeta\n$1/xsi;

# Write data to new file
#  The output goes into the same directory as the source file, with
#  'Stripped' prepended to the file name only (not to the whole argument), so
#  arguments that include a directory part or are absolute work too.
my($SourceName,$SourceDir)=fileparse(File::Spec->rel2abs($ARGV[0]));
my $To=File::Spec->catfile($SourceDir,'Stripped'.$SourceName);
#  Encode the characters in $Html as UTF-8 whilst writing
open(my $FileToWrite,'>:encoding(UTF-8)',$To)||
		die("Cannot open $To to write: $!");
print {$FileToWrite} $Html or
		die("Cannot write to $To: $!");
#  Output is buffered so some write errors (e.g. disk full) only show up here
close($FileToWrite)||
		die("Cannot finish writing $To: $!");

