#!/usr/bin/perl

################################################################################
################################################################################
#
# Convert emails downloaded as web pages from Google Desktop Search back
#  to a mailbox.
#
#	Released under GPL.
#
#		by Andrew Hardwick, http://duramecho.com
#
################################################################################
################################################################################
# Version 1, 2006/9/11
# Version 2, 2006/9/12
#  Added conversion of dates in headers back from GDS to STMP format.
# Version 3, 2006/9/12
#  Added conversion of addresses in headers back from GDS to STMP format.
# Version 4, 2006/9/13
#  Added default values for To & From to stop Mozilla filling in wrongly.
# Version 5, 2009/5/16
#  Corrected dates in version history (had them as 2009 instead of 2006!).
################################################################################
# Instrucitons:
#  Run this with a current directory containing files, each of which is
#   an email web page saved from GDS with filename extension '.html' or
#   '.htm'.
#  It will extract the emails from the
#   web pages and reconstruct an MBOX format mailbox file from them suitable
#   for importing into Mozilla email client. The mailbox will be called
#   'RecoveredEmails.mbox'.
#  Note that GDS is likely to have distorted the emails including coverting to
#   HTML, removing attachments & changing text styling.
#  Note that this source code is very verbose & not optimised for speed of
#   running. That is to make it easier to understand as sample code.
################################################################################
# Known deficiencies
#  It has only been tested with the Mozilla email client version 1.7.12.
#   (Even if it does not work with other email clients directly the
#   Mozilla can be used an intermediary so that is not too bad.)
#  It has only been tested with emails saved from Google Desktop
#   version 20051208-en & it depends highly on the formatting of the pages
#   so it is likely that it will need rewriting to work with other versions.
#  It is rather sloppy about line breaks formats. It outputs "\n" which happens
#   to match the email header format on M$ Windows but not necessarily on
#   other OSes but GDS is currently only available for Windows anyway.
################################################################################
##### This file is formatted for 80 character lines and 4 character tabs. ######
################################################################################

################################################################################
# Load libraries
################################################################################

use strict;			# Ban automatic variables
use File::Find;		# For file finding command
use HTTP::Date;		# For date string formating
	
################################################################################

{	# Find all files (store names indexed by path)
	my @InputFilePaths;
	find(	sub
			{	if(-f&&/\.html?$/)
				{	push(@InputFilePaths,$File::Find::name);}},
			'.');
	# Create mailbox
	open(Mailbox,">RecoveredEmails.mbox");
	# Iterate over email web pages
	foreach my $InputFilePath (@InputFilePaths)
	{	print "Processing: $InputFilePath\n";
		# Slurp from file the email page as retrieved from GDS contents
		my $Page;
		open(InputFile,"<$InputFilePath");
		read(InputFile,$Page,-s(InputFile));
		close InputFile;
		# Get file date
		my $FileDate=(stat($InputFilePath))[9];
		# Extract stylesheet
		$Page=~/(<style>.*?<\/style>)/ms;
		my $Stylesheet=$1;
		# Extract email
		$Page=~/(<blockquote\s.*?<\/blockquote>)/ms;
		my $Email=$1;
		# Extract email headers
		$Email=~/(<table\s.*?<\/table>)/ms;
		my $HeaderBlock=$1;
		# Extract email body
		$Email=~/<\/table>(.*?)<\/blockquote>/ms;
		my $Body=$1;
		# Create default values for vital headers
		my %Headers;
		$Headers{'Subject'}='[Unknown. Not found in GDS cache.]';
		$Headers{'Date'}='[Unknown. Not found in GDS cache.]';
		$Headers{'To'}='"[Unknown. Not found in GDS cache.]" <>';
		$Headers{'From'}='"[Unknown. Not found in GDS cache.]" <>';
		# Extract email subject
		$HeaderBlock=~/<b>(.*?)<\/b>/ms;
		$Headers{'Subject'}=$1;
		# Extract the block containing the other headers
		$HeaderBlock=~/<font size=-1>(.*?)<\/font>/ms;
		my $OtherHeaders=$1;
		# Extract the other attributes
		foreach my $HeaderLine (split '\s*<br>\s*',$OtherHeaders)
		{	$HeaderLine=~/^(.+?)\:\s*(.*)$/;
			$Headers{$1}=$2;}
		# Add custom header to warn that this is a recovered email
		$Headers{'X-Recovered-from-Google-cache'}=time2str($FileDate);
		# Convert headers back to SMTP format
		my %HeadersSmtp=%Headers;
		foreach my $HeaderName (keys %HeadersSmtp)
		{	if($HeaderName=~/^(?:Date|Sent|Received)$/i)
			{	# Date header, reformat from GDS to SMTP order & add weekday
				if($HeadersSmtp{$HeaderName}=~
						/(...)\s+(\d+)\s+(\d+)\s+-\s+(\d+):(\d+)(..)/)
				{	# Date format recognised so covert it
					my($Year,$Month,$Day,$Hour,$Minute)=($3,$1,$2,$4,$5);
					if($6 eq 'pm' && $Hour ne 12)
					{	$Hour=$Hour+12;}
					$HeadersSmtp{$HeaderName}=time2str(str2time(
							"$Day $Month $Year $Hour:$Minute:00 GMT"));}}
			elsif($HeaderName=~/^(?:To|From|Cc|Bcc|Reply-To)$/i)
			{	# Address header
				my @Addresses;
				# Split addresses on commas (names with commas repaired later)
				my @AddressStrings=split(',',$HeadersSmtp{$HeaderName});
				for(my $c=0;$c<@AddressStrings;$c++)
				{	# Try several different GDS output formats
					if($AddressStrings[$c]=~
							/^\s*&lt;<a\s+.*?>(.+?)<\/a>&gt;\s*$/)
					{	push(@Addresses,$1);}
					elsif($AddressStrings[$c]=~
							/^\s*&lt;(.+?)&gt;\s*$/)
					{	push(@Addresses,$1);}
					elsif($AddressStrings[$c]=~
							/^\s*<a\s+.*?>(.+?)<\/a>\s*$/)
					{	push(@Addresses,$1);}
					elsif($AddressStrings[$c]=~
							/^\s*(.+?)\s*&lt;<a\s+.*?>(.+?)<\/a>&gt;\s*$/)
					{	push(@Addresses,"\"$1\" <$2>");}
					elsif($AddressStrings[$c]=~
							/^\s*(.+?)\s*&lt;(.+?)&gt;\s*$/)
					{	push(@Addresses,"\"$1\" <$2>");}
					elsif($c<@AddressStrings-1)
					{	# Unrecognised so assume unwanted split on comma
						$AddressStrings[$c+1]=
								"$AddressStrings[$c],$AddressStrings[$c+1]";}
					else
					{	# Give up & let it through unchanged
						push(@Addresses,$AddressStrings[$c]);}}
				$HeadersSmtp{$HeaderName}=join(',',@Addresses);}}
		# Escape lines beginning with 'From'
		$Stylesheet=~s/^(From)/ $1/igm;
		$Body=~s/^(From)/ $1/igm;
		# Output mail separator
		print Mailbox "From - Mon Jan 1 00:00:00 1965\n";
		# Output SMTP headers
		foreach my $HeaderName (keys %Headers)
		{	print Mailbox "$HeaderName: $HeadersSmtp{$HeaderName}\n";}
		# Create MIME header
		print Mailbox "MIME-Version: 1.0\n";
		print Mailbox "Content-Type: text/html; charset=\"iso-8859-1\"\n";
		print Mailbox "\n";
		# Start HTML output
		print Mailbox "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD ".
				"HTML 4.0 Transitional//EN\">\n";
		print Mailbox "<html><body>\n";
		print Mailbox $Stylesheet;
		# Add in human readable unaltered copies of the headers
		print Mailbox "\n<ul>\n";
		foreach my $HeaderName (keys %Headers)
		{	print Mailbox "<li>$HeaderName: $Headers{$HeaderName}</li>\n";}
		print Mailbox "</ul>\n";
		# Output the recovered email body (at last!)
		print Mailbox $Body;
		# Finish off the email
		print Mailbox "\n</body></html>\n";
		print Mailbox "\n";}
	close Mailbox;}
	
################################################################################