From 37c1fc0bdb2cdcf6010f11f66175d63f492f3cc2 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 27 May 2012 16:19:30 +0200 Subject: [PATCH] Small fixes --- parse | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/parse b/parse index 1372162..0512217 100755 --- a/parse +++ b/parse @@ -21,7 +21,13 @@ options = vars(args) def getheader(header_text, default="ascii"): headers = email.header.decode_header(header_text) - header_sections = [unicode(text, charset or default) for text, charset in headers] + try: + header_sections = [unicode(text, charset or default) for text, charset in headers] + except UnicodeDecodeError: + header_sections = [unicode(text, 'windows-1252') for text, charset in headers] + except LookupError: + return u"" + return u"".join(header_sections) def find_submessages(message): @@ -41,12 +47,17 @@ def flatten(x): return result def get_charset(part): + charset = None + if part.get_content_charset(): - return part.get_content_charset() + charset = part.get_content_charset() elif part.get_charset(): - return part.get_charset() - else: + charset = part.get_charset() + + if charset is None or charset == "default" or charset.startswith("us-ascii"): return "ascii" + else: + return charset # Connect to database database = sqlite3.connect(options['database']) @@ -74,7 +85,7 @@ except sqlite3.OperationalError: file_list = glob.glob(options['pattern']) for email_file in file_list: - message = email.message_from_file(open(email_file)) + message = email.message_from_file(open(email_file, 'r')) if message['message-id'] is None: print "%s is not a valid e-mail file." % email_file @@ -98,10 +109,20 @@ for email_file in file_list: # Part of the message if part.get_content_type() == "text/plain": if textbody == "": - textbody = part.get_payload(decode=True).decode(get_charset(part)) + try: + textbody = part.get_payload(decode=True).decode(get_charset(part)) + except UnicodeDecodeError: + textbody = part.get_payload(decode=True).decode('windows-1252') + except LookupError: + pass elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml": if htmlbody == "": - htmlbody = part.get_payload(decode=True).decode(get_charset(part)) + try: + htmlbody = part.get_payload(decode=True).decode(get_charset(part)) + except UnicodeDecodeError: + htmlbody = part.get_payload(decode=True).decode('windows-1252') + except LookupError: + pass else: # Technically this is supposed to be part of the message body, but we have no idea what format it is in... print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash