Check whether the hash taken from the filename is already present in the database before parsing the message

master
Sven Slootweg 13 years ago
parent b41d658f16
commit 7bae114ee9

12
parse

@ -87,6 +87,14 @@ file_list = glob.glob(options['pattern'])
finished = 0 finished = 0
for email_file in file_list: for email_file in file_list:
# To save time when updating a database, let's first check whether the filename is already present in the database as a hash.
# There is no need to check for the format here, since if the filename is not a valid hash, it simply won't match anything.
sha1_hash = os.path.splitext(os.path.split(email_file)[1])[0]
cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
if len(cursor.fetchall()) > 0 and options['forced'] == False:
print "Skipping %s, already exists in the database according to filename." % sha1_hash
continue
message = email.message_from_file(open(email_file, 'r')) message = email.message_from_file(open(email_file, 'r'))
if message['message-id'] is None: if message['message-id'] is None:
@ -103,6 +111,8 @@ for email_file in file_list:
sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest() sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest()
cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
if len(cursor.fetchall()) == 0 or options['forced'] == True:
message_parts = [find_submessages(message)] message_parts = [find_submessages(message)]
message_parts = flatten(message_parts) message_parts = flatten(message_parts)
@ -159,8 +169,6 @@ for email_file in file_list:
timestamp = 0 timestamp = 0
print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
if len(cursor.fetchall()) == 0 or options['forced'] == True:
new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash) new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row) cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash

Loading…
Cancel
Save