Speed up the parsing loop by skipping the database check for existing rows when -f (forced) is specified.

master
Sven Slootweg 12 years ago
parent 7bae114ee9
commit f690610b9a

140
parse

@@ -90,10 +90,12 @@ for email_file in file_list:
# To save time when updating a database, let's first check whether the filename is already present in the database as a hash. # To save time when updating a database, let's first check whether the filename is already present in the database as a hash.
# There is no need to check for the format here, since if the filename is not a valid hash, it simply won't match anything. # There is no need to check for the format here, since if the filename is not a valid hash, it simply won't match anything.
sha1_hash = os.path.splitext(os.path.split(email_file)[1])[0] sha1_hash = os.path.splitext(os.path.split(email_file)[1])[0]
cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
if len(cursor.fetchall()) > 0 and options['forced'] == False: if options['forced'] == False:
print "Skipping %s, already exists in the database according to filename." % sha1_hash cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
continue if len(cursor.fetchall()) > 0:
print "Skipping %s, already exists in the database according to filename." % sha1_hash
continue
message = email.message_from_file(open(email_file, 'r')) message = email.message_from_file(open(email_file, 'r'))
@@ -111,81 +113,85 @@ for email_file in file_list:
sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest() sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest()
cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,)) if options['forced'] == False:
if len(cursor.fetchall()) == 0 or options['forced'] == True: cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
message_parts = [find_submessages(message)] if len(cursor.fetchall()) > 0:
message_parts = flatten(message_parts) print "Skipping %s, already exists in the database according to message hash." % sha1_hash
continue
for part in message_parts: message_parts = [find_submessages(message)]
if part.get_filename() is None: message_parts = flatten(message_parts)
# Part of the message
if part.get_content_type() == "text/plain": for part in message_parts:
if textbody == "": if part.get_filename() is None:
# Part of the message
if part.get_content_type() == "text/plain":
if textbody == "":
try:
textbody = part.get_payload(decode=True).decode(get_charset(part))
except UnicodeDecodeError:
# This part is probably in windows-1252 encoding
try: try:
textbody = part.get_payload(decode=True).decode(get_charset(part)) textbody = part.get_payload(decode=True).decode('windows-1252')
except UnicodeDecodeError: except UnicodeDecodeError:
# This part is probably in windows-1252 encoding # Ok, we really have no clue how to decode this, we'll just skip it...
try: continue
textbody = part.get_payload(decode=True).decode('windows-1252') except LookupError:
except UnicodeDecodeError: pass
# Ok, we really have no clue how to decode this, we'll just skip it... elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml":
continue if htmlbody == "":
except LookupError: try:
pass htmlbody = part.get_payload(decode=True).decode(get_charset(part))
elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml": except UnicodeDecodeError:
if htmlbody == "": # This part is probably in windows-1252 encoding
try: try:
htmlbody = part.get_payload(decode=True).decode(get_charset(part)) htmlbody = part.get_payload(decode=True).decode('windows-1252')
except UnicodeDecodeError: except UnicodeDecodeError:
# This part is probably in windows-1252 encoding # Ok, we really have no clue how to decode this, we'll just skip it...
try: continue
htmlbody = part.get_payload(decode=True).decode('windows-1252') except LookupError:
except UnicodeDecodeError: pass
# Ok, we really have no clue how to decode this, we'll just skip it...
continue
except LookupError:
pass
else:
# Technically this is supposed to be part of the message body, but we have no idea what format it is in...
print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash
else: else:
# Attachment # Technically this is supposed to be part of the message body, but we have no idea what format it is in...
attachment_data = part.get_payload(decode=True) print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash
attachment_sha1 = hashlib.sha1(attachment_data).hexdigest() else:
attachment_filename = part.get_filename() # Attachment
attachment_type = part.get_content_type() attachment_data = part.get_payload(decode=True)
attachment_extension = os.path.splitext(attachment_filename)[1] attachment_sha1 = hashlib.sha1(attachment_data).hexdigest()
attachment_size = len(attachment_data) attachment_filename = part.get_filename()
attachment_destination = "%s/%s%s" % (options['attachment_dir'], attachment_sha1, attachment_extension) attachment_type = part.get_content_type()
attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size)) attachment_extension = os.path.splitext(attachment_filename)[1]
attachment_size = len(attachment_data)
attachment_file = open(attachment_destination, "w") attachment_destination = "%s/%s%s" % (options['attachment_dir'], attachment_sha1, attachment_extension)
attachment_file.write(attachment_data) attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size))
attachment_file.close()
attachment_file = open(attachment_destination, "w")
try: attachment_file.write(attachment_data)
timestamp = int(time.mktime(email.utils.parsedate(message['date']))) attachment_file.close()
except TypeError:
timestamp = 0
print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash) try:
cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row) timestamp = int(time.mktime(email.utils.parsedate(message['date'])))
print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash except TypeError:
else: timestamp = 0
print "Skipping %s, already exists in the database." % sha1_hash print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash
if len(attachment_list) > 0: if len(attachment_list) > 0:
inserted = 0 inserted = 0
for attachment in attachment_list: for attachment in attachment_list:
cursor.execute("SELECT * FROM attachments WHERE `Hash` = ?", (attachment[2],)) if options['forced'] == False:
if len(cursor.fetchall()) == 0 or options['forced'] == True: cursor.execute("SELECT * FROM attachments WHERE `Hash` = ?", (attachment[2],))
new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3]) if len(cursor.fetchall()) > 0:
cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row) print "Skipping attachment %s, already exists in the database." % attachment[2]
inserted += 1 continue
else:
print "Skipping attachment %s, already exists in the database." % attachment[2] new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3])
cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row)
inserted += 1
if inserted > 0: if inserted > 0:
print "Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash) print "Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash)

Loading…
Cancel
Save