#!/usr/bin/perl
################################################################################
################################################################################
#
# Convert emails downloaded as web pages from Google Desktop Search back
# to a mailbox.
#
# Released under GPL.
#
# by Andrew Hardwick, http://duramecho.com
#
################################################################################
################################################################################
# Version 1, 2006/9/11
# Version 2, 2006/9/12
#   Added conversion of dates in headers back from GDS to SMTP format.
# Version 3, 2006/9/12
#   Added conversion of addresses in headers back from GDS to SMTP format.
# Version 4, 2006/9/13
#   Added default values for To & From to stop Mozilla filling in wrongly.
# Version 5, 2009/5/16
#   Corrected dates in version history (had them as 2009 instead of 2006!).
################################################################################
# Instructions:
#   Run this with a current directory containing files, each of which is
#   an email web page saved from GDS with filename extension '.html' or
#   '.htm'.
#   It will extract the emails from the
#   web pages and reconstruct an MBOX format mailbox file from them suitable
#   for importing into Mozilla email client. The mailbox will be called
#   'RecoveredEmails.mbox'.
#   Note that GDS is likely to have distorted the emails including converting to
#   HTML, removing attachments & changing text styling.
#   Note that this source code is very verbose & not optimised for speed of
#   running. That is to make it easier to understand as sample code.
################################################################################
# Known deficiencies
#   It has only been tested with the Mozilla email client version 1.7.12.
#   (Even if it does not work with other email clients directly the
#   Mozilla can be used as an intermediary so that is not too bad.)
# It has only been tested with emails saved from Google Desktop # version 20051208-en & it depends highly on the formatting of the pages # so it is likely that it will need rewriting to work with other versions. # It is rather sloppy about line breaks formats. It outputs "\n" which happens # to match the email header format on M$ Windows but not necessarily on # other OSes but GDS is currently only available for Windows anyway. ################################################################################ ##### This file is formatted for 80 character lines and 4 character tabs. ###### ################################################################################ ################################################################################ # Load libraries ################################################################################ use strict; # Ban automatic variables use File::Find; # For file finding command use HTTP::Date; # For date string formating ################################################################################ { # Find all files (store names indexed by path) my @InputFilePaths; find( sub { if(-f&&/\.html?$/) { push(@InputFilePaths,$File::Find::name);}}, '.'); # Create mailbox open(Mailbox,">RecoveredEmails.mbox"); # Iterate over email web pages foreach my $InputFilePath (@InputFilePaths) { print "Processing: $InputFilePath\n"; # Slurp from file the email page as retrieved from GDS contents my $Page; open(InputFile,"<$InputFilePath"); read(InputFile,$Page,-s(InputFile)); close InputFile; # Get file date my $FileDate=(stat($InputFilePath))[9]; # Extract stylesheet $Page=~/(