Automatically migrated from Gitolite
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

182 lines
6.5 KiB

9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
  1. #!/usr/bin/python
  2. import os, argparse, hashlib, email, email.header, email.utils, glob, sqlite3, time
  3. parser = argparse.ArgumentParser(description='Parses emails into an SQLite database and an attachment folder.')
  4. parser.add_argument('-p', dest='pattern', action='store', default='*',
  5. help='glob pattern (including path) that has to be matched for a file to be parsed')
  6. parser.add_argument('-d', dest='database', action='store', default='emails.db',
  7. help='path of the database that should be used to store the emails (will be created if it does not exist yet)')
  8. parser.add_argument('-a', dest='attachment_dir', action='store', default='attachments',
  9. help='path where attachments should be stored (will be created if it does not exist yet)')
  10. parser.add_argument('-f', '--forced', dest='forced', action='store_true',
  11. help='force insertion into database, even if entries already exist')
  12. args = parser.parse_args()
  13. options = vars(args)
  14. def getheader(header_text, default="ascii"):
  15. headers = email.header.decode_header(header_text)
  16. try:
  17. header_sections = [unicode(text, charset or default) for text, charset in headers]
  18. except UnicodeDecodeError:
  19. header_sections = [unicode(text, 'windows-1252') for text, charset in headers]
  20. except LookupError:
  21. return u""
  22. return u"".join(header_sections)
  23. def find_submessages(message):
  24. if message.is_multipart():
  25. return [find_submessages(part) for part in message.get_payload()]
  26. else:
  27. return message
  28. def flatten(x):
  29. # http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
  30. result = []
  31. for el in x:
  32. if hasattr(el, "__iter__") and not isinstance(el, basestring):
  33. result.extend(flatten(el))
  34. else:
  35. result.append(el)
  36. return result
  37. def get_charset(part):
  38. charset = None
  39. if part.get_content_charset():
  40. charset = part.get_content_charset()
  41. elif part.get_charset():
  42. charset = part.get_charset()
  43. if charset is None or charset == "default" or charset.startswith("us-ascii"):
  44. return "ascii"
  45. else:
  46. return charset
  47. # Connect to database
  48. database = sqlite3.connect(options['database'])
  49. cursor = database.cursor()
  50. try:
  51. # Create attachment directory first
  52. os.makedirs(options['attachment_dir'])
  53. except OSError:
  54. pass
  55. try:
  56. # Try to create emails table
  57. cursor.execute("CREATE TABLE emails (`message_id`, `from`, `to`, `subject`, `date`, `body`, `html`, `hash`)")
  58. except sqlite3.OperationalError:
  59. pass
  60. try:
  61. # Try to create attachments table
  62. cursor.execute("CREATE TABLE attachments (`message_hash`, `filename`, `type`, `hash`, `size`)")
  63. except sqlite3.OperationalError:
  64. pass
  65. # Select all files matching the given pattern
  66. file_list = glob.glob(options['pattern'])
  67. finished = 0
  68. for email_file in file_list:
  69. message = email.message_from_file(open(email_file, 'r'))
  70. if message['message-id'] is None:
  71. print "%s is not a valid e-mail file." % email_file
  72. else:
  73. if 'subject' not in message or message['subject'] is None:
  74. subject = ""
  75. else:
  76. subject = message['subject']
  77. textbody = ""
  78. htmlbody = ""
  79. attachment_list = []
  80. sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest()
  81. message_parts = [find_submessages(message)]
  82. message_parts = flatten(message_parts)
  83. for part in message_parts:
  84. if part.get_filename() is None:
  85. # Part of the message
  86. if part.get_content_type() == "text/plain":
  87. if textbody == "":
  88. try:
  89. textbody = part.get_payload(decode=True).decode(get_charset(part))
  90. except UnicodeDecodeError:
  91. textbody = part.get_payload(decode=True).decode('windows-1252')
  92. except LookupError:
  93. pass
  94. elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml":
  95. if htmlbody == "":
  96. try:
  97. htmlbody = part.get_payload(decode=True).decode(get_charset(part))
  98. except UnicodeDecodeError:
  99. htmlbody = part.get_payload(decode=True).decode('windows-1252')
  100. except LookupError:
  101. pass
  102. else:
  103. # Technically this is supposed to be part of the message body, but we have no idea what format it is in...
  104. print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash
  105. else:
  106. # Attachment
  107. attachment_data = part.get_payload(decode=True)
  108. attachment_sha1 = hashlib.sha1(attachment_data).hexdigest()
  109. attachment_filename = part.get_filename()
  110. attachment_type = part.get_content_type()
  111. attachment_extension = os.path.splitext(attachment_filename)[1][1:]
  112. attachment_size = len(attachment_data)
  113. attachment_destination = "%s/%s.%s" % (options['attachment_dir'], attachment_sha1, attachment_extension)
  114. attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size))
  115. attachment_file = open(attachment_destination, "w")
  116. attachment_file.write(attachment_data)
  117. attachment_file.close()
  118. try:
  119. timestamp = int(time.mktime(email.utils.parsedate(message['date'])))
  120. except TypeError:
  121. timestamp = 0
  122. print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
  123. cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
  124. if len(cursor.fetchall()) == 0 or options['forced'] == True:
  125. new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
  126. cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
  127. print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash
  128. else:
  129. print "Skipping %s, already exists in the database." % sha1_hash
  130. if len(attachment_list) > 0:
  131. inserted = 0
  132. for attachment in attachment_list:
  133. cursor.execute("SELECT * FROM attachments WHERE `Hash` = ?", (attachment[2],))
  134. if len(cursor.fetchall()) == 0 or options['forced'] == True:
  135. new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3])
  136. cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row)
  137. inserted += 1
  138. else:
  139. print "Skipping attachment %s, already exists in the database." % attachment[2]
  140. if inserted > 0:
  141. print "Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash)
  142. finished += 1
  143. if finished % 100 == 0:
  144. database.commit()
  145. print "%d e-mails done, commited changes to database." % finished
  146. database.commit()
  147. print "Changes successfully committed to database, all done."