emailparser/parse

193 lines
6.9 KiB
Plaintext
Raw Normal View History

2012-05-27 05:09:27 +02:00
#!/usr/bin/python
2012-05-27 10:58:48 +02:00
import os, argparse, hashlib, email, email.header, email.utils, glob, sqlite3, time
2012-05-27 12:15:04 +02:00
# Command line interface. Every option carries a default, so the script can
# be started without any arguments.
parser = argparse.ArgumentParser(
    description='Parses emails into an SQLite database and an attachment folder.',
)
parser.add_argument(
    '-p',
    dest='pattern',
    action='store',
    default='*',
    help='glob pattern (including path) that has to be matched for a file to be parsed',
)
parser.add_argument(
    '-d',
    dest='database',
    action='store',
    default='emails.db',
    help='path of the database that should be used to store the emails (will be created if it does not exist yet)',
)
parser.add_argument(
    '-a',
    dest='attachment_dir',
    action='store',
    default='attachments',
    help='path where attachments should be stored (will be created if it does not exist yet)',
)
parser.add_argument(
    '-f', '--forced',
    dest='forced',
    action='store_true',
    help='force insertion into database, even if entries already exist',
)

args = parser.parse_args()
# Dictionary view of the parsed arguments, keyed by each option's `dest`.
options = vars(args)
def getheader(header_text, default="ascii"):
    """Decode an RFC 2047 encoded header value into a unicode string.

    header_text -- raw header value as read from the message; None (header
                   absent) yields an empty string.
    default     -- charset assumed for sections that do not declare one.

    Returns the concatenation of all decoded sections. If any section names
    a charset Python does not know, an empty string is returned.
    """
    if header_text is None:
        # A missing header arrives as None; return an empty string rather
        # than handing None to decode_header().
        return u""

    header_sections = []
    for text, charset in email.header.decode_header(header_text):
        if not isinstance(text, bytes):
            # Section is already text, nothing left to decode.
            header_sections.append(text)
            continue
        try:
            header_sections.append(text.decode(charset or default))
        except UnicodeDecodeError:
            # Mislabelled sections are retried as windows-1252. That codec
            # has undefined bytes too, so substitute them instead of letting
            # a second UnicodeDecodeError escape from this handler. Only the
            # failing section is retried; correctly labelled ones keep their
            # declared charset.
            header_sections.append(text.decode('windows-1252', 'replace'))
        except LookupError:
            # Unknown charset name: give up on the whole header.
            return u""
    return u"".join(header_sections)
2012-05-27 09:43:08 +02:00
def find_submessages(message):
    """Collect the non-multipart parts of `message`.

    A message that is not multipart is returned unchanged. A multipart
    message yields a list with one entry per direct sub-part, each entry
    being the result of this function for that sub-part (so the list may be
    nested).
    """
    if not message.is_multipart():
        return message

    children = []
    for child in message.get_payload():
        children.append(find_submessages(child))
    return children
def flatten(x):
    """Return a flat list with the leaf elements of the nested iterable `x`.

    An element is descended into when it has an `__iter__` attribute and is
    not a string; everything else is kept as a leaf, in original order.
    Iterative variant of the recursive recipe from
    http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
    """
    flat = []
    # Stack of iterators; the innermost container being walked is on top.
    pending = [iter(x)]
    while pending:
        for item in pending[-1]:
            if hasattr(item, "__iter__") and not isinstance(item, basestring):
                # Descend; the outer iterator resumes where it stopped once
                # this one is exhausted.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat
def get_charset(part):
    """Return the codec name to use for decoding the payload of `part`.

    The charset parameter of the Content-Type header is preferred; the
    charset attached to the message object is used as a fallback. A missing
    charset, "default", and anything starting with "us-ascii" all map to
    "ascii".
    """
    charset = part.get_content_charset() or part.get_charset()
    if not charset:
        return "ascii"

    # get_charset() returns an email.charset.Charset object rather than a
    # string; str() yields its charset name, so the string checks below work
    # for both sources.
    charset = str(charset)
    if charset == "default" or charset.startswith("us-ascii"):
        return "ascii"
    return charset
2012-05-27 10:26:03 +02:00
# Connect to database
database = sqlite3.connect(options['database'])
cursor = database.cursor()

# Create attachment directory first
try:
    os.makedirs(options['attachment_dir'])
except OSError:
    # An already existing directory is fine. Any other failure (permissions,
    # a regular file in the way, ...) is re-raised here instead of being
    # silently ignored.
    if not os.path.isdir(options['attachment_dir']):
        raise

# IF NOT EXISTS leaves existing tables untouched without having to swallow
# sqlite3.OperationalError, which would also hide unrelated database errors.
cursor.execute("CREATE TABLE IF NOT EXISTS emails (`message_id`, `from`, `to`, `subject`, `date`, `body`, `html`, `hash`)")
cursor.execute("CREATE TABLE IF NOT EXISTS attachments (`message_hash`, `filename`, `type`, `hash`, `size`)")

# Select all files matching the given pattern
file_list = glob.glob(options['pattern'])
# Content types that are stored as the HTML body of a message.
HTML_CONTENT_TYPES = ("text/html", "text/xhtml+xml", "application/xhtml+xml")


def _decode_body(part):
    """Decode the payload of a text part to unicode, or return None.

    The charset reported by get_charset() is tried first; a payload that
    does not decode with it is retried as windows-1252. None is returned
    when there is no payload, the charset name is unknown to Python, or
    both decoding attempts fail.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        return None
    try:
        return payload.decode(get_charset(part))
    except UnicodeDecodeError:
        # This part is probably in windows-1252 encoding
        try:
            return payload.decode('windows-1252')
        except UnicodeDecodeError:
            # Ok, we really have no clue how to decode this, we'll just skip it...
            return None
    except LookupError:
        return None


def _unix_timestamp(date_header):
    """Convert a Date header value to a unix timestamp, or return None.

    parsedate_tz()/mktime_tz() are used so that the UTC offset given in the
    header is honoured; time.mktime() would interpret the date as local
    time of the machine running this script.
    """
    if date_header is None:
        return None
    date_tuple = email.utils.parsedate_tz(date_header)
    if date_tuple is None:
        return None
    try:
        return int(email.utils.mktime_tz(date_tuple))
    except (ValueError, OverflowError):
        # Syntactically valid date that is out of range for the platform.
        return None


finished = 0

for email_file in file_list:
    if os.path.isdir(email_file):
        # The glob pattern can match directories (the default '*' matches
        # the attachment folder itself); those cannot be opened as e-mails.
        continue

    # Close the input file as soon as it has been parsed.
    with open(email_file, 'r') as email_handle:
        message = email.message_from_file(email_handle)

    if message['message-id'] is None:
        print("%s is not a valid e-mail file." % email_file)
        continue

    subject = message['subject']
    if subject is None:
        subject = ""

    textbody = ""
    htmlbody = ""
    attachment_list = []

    # Identifies the message in both tables and is used for duplicate detection.
    sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest()

    for part in flatten([find_submessages(message)]):
        attachment_filename = part.get_filename()

        if attachment_filename is None:
            # Part of the message
            content_type = part.get_content_type()
            if content_type == "text/plain":
                # Only the first non-empty plain text part is kept.
                if textbody == "":
                    decoded = _decode_body(part)
                    if decoded is not None:
                        textbody = decoded
            elif content_type in HTML_CONTENT_TYPES:
                # Only the first non-empty HTML part is kept.
                if htmlbody == "":
                    decoded = _decode_body(part)
                    if decoded is not None:
                        htmlbody = decoded
            else:
                # Technically this is supposed to be part of the message body, but we have no idea what format it is in...
                print("WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash)
            continue

        # Attachment: stored on disk under its content hash, keeping the
        # original file extension.
        attachment_data = part.get_payload(decode=True)
        attachment_sha1 = hashlib.sha1(attachment_data).hexdigest()
        attachment_type = part.get_content_type()
        attachment_extension = os.path.splitext(attachment_filename)[1]
        attachment_size = len(attachment_data)
        attachment_destination = os.path.join(options['attachment_dir'], attachment_sha1 + attachment_extension)
        attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size))

        # Binary mode: attachment payloads are raw bytes and must not go
        # through newline translation.
        with open(attachment_destination, "wb") as attachment_file:
            attachment_file.write(attachment_data)

    timestamp = _unix_timestamp(message['date'])
    if timestamp is None:
        timestamp = 0
        print("WARNING: Failed to determine unix timestamp for %s." % sha1_hash)

    cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
    if cursor.fetchone() is None or options['forced']:
        new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
        cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
        print("Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash)
    else:
        print("Skipping %s, already exists in the database." % sha1_hash)

    if attachment_list:
        inserted = 0

        for attachment in attachment_list:
            # attachment is (filename, type, sha1, size)
            cursor.execute("SELECT * FROM attachments WHERE `Hash` = ?", (attachment[2],))
            if cursor.fetchone() is None or options['forced']:
                new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3])
                cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row)
                inserted += 1
            else:
                print("Skipping attachment %s, already exists in the database." % attachment[2])

        if inserted > 0:
            print("Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash))

    # NOTE(review): the counter is advanced only for files that were parsed
    # as e-mails - confirm this matches the intended progress reporting.
    finished += 1

    # Commit in batches so an interrupted run keeps most of its work.
    if finished % 100 == 0:
        database.commit()
        print("%d e-mails done, commited changes to database." % finished)

database.commit()
print("Changes successfully committed to database, all done.")