#!/usr/bin/python
"""Rename e-mail files after the SHA-1 hash of their identifying headers.

Every regular file matched by the ``-p`` glob pattern is parsed as an e-mail
and renamed, inside its own directory, to ``<sha1>.eml``. The hash covers the
From, To, Message-ID and Subject headers, so the resulting name is stable
across runs for the same message.

The file is written for Python 2 (``getheader`` and ``flatten`` rely on the
``unicode`` and ``basestring`` builtins).
"""
import argparse
import email
import email.header
import email.utils
import glob
import hashlib
import os


def _build_parser():
    """Return the command-line parser for this script."""
    # NOTE(review): the description mentions an SQLite database and an
    # attachment folder, but this script only renames files -- confirm
    # whether the text was copied from a sibling script.
    parser = argparse.ArgumentParser(description='Parses emails into an SQLite database and an attachment folder.')
    parser.add_argument('-p', dest='pattern', action='store', default='*', help='glob pattern (including path) that has to be matched for a file to be parsed')
    parser.add_argument('-I', '--ignore-invalid', dest='ignore_invalid', action='store_true', help='process invalid e-mail files anyway, for example when missing message-id headers')
    return parser


def getheader(header_text, default="ascii"):
    """Decode an RFC 2047 encoded header into a single unicode string.

    Each decoded section is converted using its declared charset, or
    ``default`` when none is declared. If any section fails to decode, all
    sections are re-decoded as windows-1252. An unknown charset name yields
    an empty string.
    """
    headers = email.header.decode_header(header_text)
    try:
        header_sections = [unicode(text, charset or default) for text, charset in headers]
    except UnicodeDecodeError:
        header_sections = [unicode(text, 'windows-1252') for text, charset in headers]
    except LookupError:
        return u""
    return u"".join(header_sections)


def find_submessages(message):
    """Return the leaf parts of ``message`` as a (possibly nested) list.

    A non-multipart message is returned as-is rather than wrapped in a list;
    use ``flatten`` to turn a nested result into a flat list.
    """
    if message.is_multipart():
        return [find_submessages(part) for part in message.get_payload()]
    else:
        return message


def flatten(x):
    """Flatten arbitrarily nested iterables into one list; strings stay whole."""
    # http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, basestring):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result


def get_charset(part):
    """Return the charset name of a message part, defaulting to ``"ascii"``.

    The Content-Type charset parameter is preferred; otherwise the charset
    attached to the part via ``get_charset()`` is used. A missing charset,
    ``"default"``, and anything starting with ``"us-ascii"`` all map to
    ``"ascii"``.
    """
    charset = part.get_content_charset()
    if not charset:
        attached = part.get_charset()
        # Message.get_charset() returns an email.charset.Charset object,
        # which has no startswith(); convert it to its name first.
        charset = str(attached) if attached else None
    if charset is None or charset == "default" or charset.startswith("us-ascii"):
        return "ascii"
    else:
        return charset


def _hashed_path(email_file, message, message_id):
    """Return the ``<sha1>.eml`` path for ``message`` next to ``email_file``."""
    subject = message['subject']
    if subject is None:
        subject = ""
    identity = "%s/%s/%s/%s" % (message['from'], message['to'], message_id, subject)
    # hashlib needs bytes: Python 2 str already is, Python 3 str is not.
    if not isinstance(identity, bytes):
        identity = identity.encode('utf-8')
    sha1_hash = hashlib.sha1(identity).hexdigest()
    # os.path.join keeps the result relative when the file has no directory
    # component; plain "%s/%s" formatting would produce "/<hash>.eml" there.
    return os.path.join(os.path.dirname(email_file), "%s.eml" % sha1_hash)


def main(argv=None):
    """Rename every matching e-mail file to its header hash.

    Args:
        argv: Command-line arguments without the program name; ``None``
            makes argparse read ``sys.argv``.

    Returns:
        The number of files that were actually renamed.
    """
    options = vars(_build_parser().parse_args(argv))
    renamed = 0

    # Select all files matching the given pattern
    for email_file in glob.glob(options['pattern']):
        # The pattern may also match directories, which cannot be parsed.
        if not os.path.isfile(email_file):
            continue

        with open(email_file, 'r') as handle:
            message = email.message_from_file(handle)

        message_id = message['message-id']
        if message_id is None:
            if not options['ignore_invalid']:
                print("%s is not a valid e-mail file." % email_file)
                continue
            # May itself be None; the hash then simply contains "None".
            message_id = message['date']
            print("WARNING: %s does not contain a valid message-id header. Falling back to date." % email_file)

        # A file that already carries its hash name maps onto itself, so
        # running the script again leaves it untouched.
        new_path = _hashed_path(email_file, message, message_id)
        os.rename(email_file, new_path)
        print("%s -> %s" % (email_file, new_path))
        renamed += 1

    print("Renamed %d files." % renamed)
    return renamed


if __name__ == "__main__":
    main()