#! D:\perl\bin\perl.exe
###############################################################################
###############################################################################
#
# To remove the mass of unwanted formatting & other crap that M$ Word
# puts in HTML when converting a Word document to HTML
#
# by Andrew Hardwick, http://duramecho.com,
# Released under GNU Public Licence.
#
###############################################################################
###############################################################################
# Version 1, 2001/12/12
# Version 2, 2002/3/22
#   More garbage removal added.
# Version 3, 2002/8/11
#   More garbage removal added.
# Version 4, 2005/3/5
#   Just added this version history section into the comments.
# Version 5, 2008/3/21
#   Made it convert from M$ Windows codepage 1252 character set to UTF-8.
###############################################################################
###############################################################################
# How To Use
#   Run from a command line with the source file name as argument.
#   Output is to the same directory with file name prepended with 'Stripped'.
# The following things still need manual correction:
#   Remove small caps formatting before converting to HTML, as Word converts
#     the characters instead of applying it as formatting.
#   Get rid of the ... subscripting of pictures created from
#     Equation Editor equations.
#   Convert bulleted lists, which Word saves as paragraphs with dots, back
#     to HTML bulleted lists.
###############################################################################
###############################################################################

# NOTE(review): the copy of this script that was reviewed had been passed
# through an HTML-to-text conversion which deleted most tag-shaped text from
# the regular expressions and string literals (e.g. "s///gs"). Every statement
# tagged "(reconstructed)" below was rebuilt from the comment next to it and
# from the surviving fragments of the pattern. TODO: confirm each one against
# an intact copy of the original script before relying on the output.

# Include libraries
use strict;             # Disable automatic variables
use warnings;           # Report questionable constructs
use Cwd;                # To find current directory
use Encode;             # For changing character encodings
use File::Basename;     # To split a path into directory & file name
use File::Spec;         # To resolve relative paths portably

# Work out the source & destination file names.
# The source may be a bare file name, a relative path or an absolute path.
# The output goes next to the source with 'Stripped' prepended to the name.
(defined $ARGV[0] && length $ARGV[0])
 or die("Usage: $0 <HTML file saved by Word>\n");
my $From=File::Spec->rel2abs($ARGV[0],cwd());
my ($Name,$Dir)=fileparse($From);   # $Dir keeps its trailing separator
my $To=$Dir.'Stripped'.$Name;

# Get data from file (decoding from M$ Windows codepage 1252)
open(my $In,'<:encoding(windows-1252)',$From)
 or die("Cannot open $From to read: $!\n");
my $Html=do { local $/; <$In> };    # Slurp the whole file in one go
$Html='' unless defined $Html;
close($In);

# Remove HTML/XML comments
$Html=~s/<!--.*?-->//gs;                                # (reconstructed)
# Remove XML
$Html=~s/<\?xml[^>]*>//gs;                              # (reconstructed)
# Remove Meta elements
$Html=~s/<meta\b[^>]*>//gsi;                            # (reconstructed)
# Remove visual formatting
$Html=~s/<span\b[^>]*>//gsi;                            # (reconstructed)
$Html=~s/<\/span>//gsi;
# Remove some unknown stuff
$Html=~s/<o:p>//gsi;                                    # (reconstructed)
$Html=~s/<\/o:p>//gsi;
# Remove stylesheet stuff
$Html=~s/<p\sclass\=[^>]*>/<p>/gsi;                     # (reconstructed)
$Html=~s/<link\b[^>]*>//gsi;                            # (reconstructed)
$Html=~s/<style\b[^>]*>.*?<\/style>//gsi;               # (reconstructed)
# Remove style attributes
$Html=~s/\sstyle\=\'[^']*?mso-.*?\'//gsi;
# Remove some more garbage
$Html=~s/<!\[if[^>]*>//gsi;                             # (reconstructed)
$Html=~s/<!\[endif\]>//gsi;                             # (reconstructed)
# Remove visual formatting
$Html=~s/<div\b[^>]*>//gsi;                             # (reconstructed)
$Html=~s/<\/div>//gsi;
# Remove duplicate spaces
$Html=~s/ +/ /gsi;
$Html=~s/(\&nbsp\;){2,}/\&nbsp\;/gsi;                   # (reconstructed)
$Html=~s/ <br>/<br>/gsi;                                # (reconstructed)
$Html=~s/\n{2,}/\n/gsi;
# Remove extra spaces after heading numbers
$Html=~s/(<h\d>\S*?)\&nbsp\;/$1/gsi;                    # (reconstructed)
# Remove shape info from image tags
$Html=~s/\sv\:shapes\=\".*?\"\>/\>/gsi;
# Remove unused link targets (only footnote ones are linked to!)
# Footnote anchors start with an href attribute so are not matched here.
$Html=~s/(<h\d>)\s*?<a\sname\=[^>]*>(.*?)<\/a>/$1$2/gsi;   # (reconstructed)
$Html=~s/<a\sname\=[^>]*>Figure\s<\/a>/Figure /gsi;     # (reconstructed)
$Html=~s/<a\sname\=[^>]*>Table\s<\/a>/Table /gsi;       # (reconstructed)
$Html=~s/<a\sname\=[^>]*>\s?<\/a>//gsi;                 # (reconstructed)
# Remove table cell formatting
$Html=~s/(<td[^>]*?)\swidth\=\d+/$1/gsi;                # (reconstructed)
$Html=~s/(<td[^>]*?)\svalign\=\w*/$1/gsi;               # (reconstructed)
$Html=~s/(<td[^>]*?)\sstyle\=\'.*?\'/$1/gsi;            # (reconstructed)
# Remove unnecessary paragraph marks from table cells
$Html=~s/(<td[^>]*>)\s*<p>(.*?)<\/p>\s*(<\/td>)/$1$2$3/gsi;   # (reconstructed)

# Add DOCTYPE line
my $Temp='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'.
 ' "http://www.w3.org/TR/html4/loose.dtd">';            # (reconstructed)
$Html=$Temp."\n".$Html;
# Convert ASCII value non-breaking spaces to HTML ones
$Html=~s/\&\#160\;/\&nbsp\;/gsi;                        # (reconstructed)
# Specify character set as UTF-8 in header
# (only the first </head> is touched; nothing is added if there is none)
$Temp='<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
                                                        # (reconstructed)
$Html=~s/(<\/head>)/$Temp\n$1/si;

# Write data to new file (encoding as UTF-8)
open(my $Out,'>:encoding(UTF-8)',$To)
 or die("Cannot open $To to write: $!\n");
print {$Out} $Html
 or die("Cannot write to $To: $!\n");
# Buffered write errors only show up when the handle is closed
close($Out)
 or die("Cannot finish writing $To: $!\n");