Automatically migrated from Gitolite
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

228 lines
8.2 KiB

9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
  1. #!/usr/bin/python
  2. import os, argparse, hashlib, email, email.header, email.utils, glob, sqlite3, time
  3. parser = argparse.ArgumentParser(description='Parses emails into an SQLite database and an attachment folder.')
  4. parser.add_argument('-p', dest='pattern', action='store', default='*',
  5. help='glob pattern (including path) that has to be matched for a file to be parsed')
  6. parser.add_argument('-d', dest='database', action='store', default='emails.db',
  7. help='path of the database that should be used to store the emails (will be created if it does not exist yet)')
  8. parser.add_argument('-a', dest='attachment_dir', action='store', default='attachments',
  9. help='path where attachments should be stored (will be created if it does not exist yet)')
  10. parser.add_argument('-f', '--forced', dest='forced', action='store_true',
  11. help='force insertion into database, even if entries already exist')
  12. parser.add_argument('-I', '--ignore-invalid', dest='ignore_invalid', action='store_true',
  13. help='process invalid e-mail files anyway, for example when missing message-id headers')
  14. args = parser.parse_args()
  15. options = vars(args)
  16. def getheader(header_text, default="ascii"):
  17. headers = email.header.decode_header(header_text)
  18. try:
  19. header_sections = [unicode(text, charset or default) for text, charset in headers]
  20. except UnicodeDecodeError:
  21. header_sections = [unicode(text, 'windows-1252') for text, charset in headers]
  22. except LookupError:
  23. return u""
  24. return u"".join(header_sections)
  25. def find_submessages(message):
  26. if message.is_multipart():
  27. return [find_submessages(part) for part in message.get_payload()]
  28. else:
  29. return message
  30. def flatten(x):
  31. # http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
  32. result = []
  33. for el in x:
  34. if hasattr(el, "__iter__") and not isinstance(el, basestring):
  35. result.extend(flatten(el))
  36. else:
  37. result.append(el)
  38. return result
  39. def get_charset(part):
  40. charset = None
  41. if part.get_content_charset():
  42. charset = part.get_content_charset()
  43. elif part.get_charset():
  44. charset = part.get_charset()
  45. if charset is None or charset == "default" or charset.startswith("us-ascii"):
  46. return "ascii"
  47. else:
  48. return charset
  49. # Connect to database
  50. database = sqlite3.connect(options['database'])
  51. cursor = database.cursor()
  52. try:
  53. # Create attachment directory first
  54. os.makedirs(options['attachment_dir'])
  55. except OSError:
  56. pass
  57. try:
  58. # Try to create emails table
  59. cursor.execute("CREATE TABLE emails (`message_id`, `from`, `to`, `subject`, `date`, `body`, `html`, `hash`)")
  60. except sqlite3.OperationalError:
  61. pass
  62. try:
  63. # Try to create attachments table
  64. cursor.execute("CREATE TABLE attachments (`message_hash`, `filename`, `type`, `hash`, `size`)")
  65. except sqlite3.OperationalError:
  66. pass
  67. # Select all files matching the given pattern
  68. file_list = glob.glob(options['pattern'])
  69. finished = 0
  70. for email_file in file_list:
  71. # To save time when updating a database, let's first check whether the filename is already present in the database as a hash.
  72. # There is no need to check for the format here, since if the filename is not a valid hash, it simply won't match anything.
  73. sha1_hash = os.path.splitext(os.path.split(email_file)[1])[0]
  74. if options['forced'] == False:
  75. cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
  76. if len(cursor.fetchall()) > 0:
  77. print "Skipping %s, already exists in the database according to filename." % sha1_hash
  78. continue
  79. message = email.message_from_file(open(email_file, 'r'))
  80. if message['message-id'] is None:
  81. if options['ignore_invalid'] == True:
  82. message_id = message['date']
  83. print "WARNING: %s does not contain a valid message-id header. Falling back to date." % email_file
  84. else:
  85. print "%s is not a valid e-mail file." % email_file
  86. finished += 1
  87. continue
  88. else:
  89. message_id = message['message-id']
  90. if 'subject' not in message or message['subject'] is None:
  91. subject = ""
  92. else:
  93. subject = message['subject']
  94. textbody = ""
  95. htmlbody = ""
  96. attachment_list = []
  97. sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message_id, subject)).hexdigest()
  98. if options['forced'] == False:
  99. cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
  100. if len(cursor.fetchall()) > 0:
  101. print "Skipping %s, already exists in the database according to message hash." % sha1_hash
  102. continue
  103. message_parts = [find_submessages(message)]
  104. message_parts = flatten(message_parts)
  105. for part in message_parts:
  106. if part.get_filename() is None:
  107. # Part of the message
  108. if part.get_content_type() == "text/plain":
  109. if textbody == "":
  110. try:
  111. textbody = part.get_payload(decode=True).decode(get_charset(part))
  112. except UnicodeDecodeError:
  113. # This part is probably in windows-1252 encoding
  114. try:
  115. textbody = part.get_payload(decode=True).decode('windows-1252')
  116. except UnicodeDecodeError:
  117. # Ok, we really have no clue how to decode this, we'll just skip it...
  118. continue
  119. except LookupError:
  120. pass
  121. elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml":
  122. if htmlbody == "":
  123. try:
  124. htmlbody = part.get_payload(decode=True).decode(get_charset(part))
  125. except UnicodeDecodeError:
  126. # This part is probably in windows-1252 encoding
  127. try:
  128. htmlbody = part.get_payload(decode=True).decode('windows-1252')
  129. except UnicodeDecodeError:
  130. # Ok, we really have no clue how to decode this, we'll just skip it...
  131. continue
  132. except LookupError:
  133. pass
  134. else:
  135. # Technically this is supposed to be part of the message body, but we have no idea what format it is in...
  136. print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash
  137. else:
  138. # Attachment
  139. attachment_data = part.get_payload(decode=True)
  140. attachment_sha1 = hashlib.sha1(attachment_data).hexdigest()
  141. attachment_filename = part.get_filename()
  142. attachment_type = part.get_content_type()
  143. attachment_extension = os.path.splitext(attachment_filename)[1]
  144. attachment_size = len(attachment_data)
  145. attachment_destination = "%s/%s%s" % (options['attachment_dir'], attachment_sha1, attachment_extension)
  146. attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size))
  147. attachment_file = open(attachment_destination, "w")
  148. attachment_file.write(attachment_data)
  149. attachment_file.close()
  150. try:
  151. timestamp = int(time.mktime(email.utils.parsedate(message['date'])))
  152. except TypeError:
  153. timestamp = 0
  154. print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
  155. try:
  156. new_row = (getheader(message_id), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
  157. except UnicodeDecodeError:
  158. print "ERROR: Failed parsing %s, headers could not be decoded." % sha1_hash
  159. continue
  160. except email.errors.HeaderParseError:
  161. print "ERROR: Failed parsing %s, headers could not be parsed." % sha1_hash
  162. continue
  163. cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
  164. print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash
  165. if len(attachment_list) > 0:
  166. inserted = 0
  167. for attachment in attachment_list:
  168. if options['forced'] == False:
  169. cursor.execute("SELECT * FROM attachments WHERE `hash` = ? AND `message_hash` = ?", (attachment[2], sha1_hash))
  170. if len(cursor.fetchall()) > 0:
  171. print "Skipping attachment %s, already exists in the database." % attachment[2]
  172. continue
  173. try:
  174. new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3])
  175. cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row)
  176. inserted += 1
  177. except sqlite3.ProgrammingError:
  178. print "Inserting of attachment %s failed." % attachment[2]
  179. if inserted > 0:
  180. print "Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash)
  181. finished += 1
  182. if finished % 100 == 0:
  183. database.commit()
  184. print "%d e-mails done, commited changes to database." % finished
  185. database.commit()
  186. print "Changes successfully committed to database, all done."