Automatically migrated from Gitolite
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

192 lines
6.9 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. #!/usr/bin/python
  2. import os, argparse, hashlib, email, email.header, email.utils, glob, sqlite3, time
  3. parser = argparse.ArgumentParser(description='Parses emails into an SQLite database and an attachment folder.')
  4. parser.add_argument('-p', dest='pattern', action='store', default='*',
  5. help='glob pattern (including path) that has to be matched for a file to be parsed')
  6. parser.add_argument('-d', dest='database', action='store', default='emails.db',
  7. help='path of the database that should be used to store the emails (will be created if it does not exist yet)')
  8. parser.add_argument('-a', dest='attachment_dir', action='store', default='attachments',
  9. help='path where attachments should be stored (will be created if it does not exist yet)')
  10. parser.add_argument('-f', '--forced', dest='forced', action='store_true',
  11. help='force insertion into database, even if entries already exist')
  12. args = parser.parse_args()
  13. options = vars(args)
  14. def getheader(header_text, default="ascii"):
  15. headers = email.header.decode_header(header_text)
  16. try:
  17. header_sections = [unicode(text, charset or default) for text, charset in headers]
  18. except UnicodeDecodeError:
  19. header_sections = [unicode(text, 'windows-1252') for text, charset in headers]
  20. except LookupError:
  21. return u""
  22. return u"".join(header_sections)
  23. def find_submessages(message):
  24. if message.is_multipart():
  25. return [find_submessages(part) for part in message.get_payload()]
  26. else:
  27. return message
  28. def flatten(x):
  29. # http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
  30. result = []
  31. for el in x:
  32. if hasattr(el, "__iter__") and not isinstance(el, basestring):
  33. result.extend(flatten(el))
  34. else:
  35. result.append(el)
  36. return result
  37. def get_charset(part):
  38. charset = None
  39. if part.get_content_charset():
  40. charset = part.get_content_charset()
  41. elif part.get_charset():
  42. charset = part.get_charset()
  43. if charset is None or charset == "default" or charset.startswith("us-ascii"):
  44. return "ascii"
  45. else:
  46. return charset
  47. # Connect to database
  48. database = sqlite3.connect(options['database'])
  49. cursor = database.cursor()
  50. try:
  51. # Create attachment directory first
  52. os.makedirs(options['attachment_dir'])
  53. except OSError:
  54. pass
  55. try:
  56. # Try to create emails table
  57. cursor.execute("CREATE TABLE emails (`message_id`, `from`, `to`, `subject`, `date`, `body`, `html`, `hash`)")
  58. except sqlite3.OperationalError:
  59. pass
  60. try:
  61. # Try to create attachments table
  62. cursor.execute("CREATE TABLE attachments (`message_hash`, `filename`, `type`, `hash`, `size`)")
  63. except sqlite3.OperationalError:
  64. pass
  65. # Select all files matching the given pattern
  66. file_list = glob.glob(options['pattern'])
  67. finished = 0
  68. for email_file in file_list:
  69. message = email.message_from_file(open(email_file, 'r'))
  70. if message['message-id'] is None:
  71. print "%s is not a valid e-mail file." % email_file
  72. else:
  73. if 'subject' not in message or message['subject'] is None:
  74. subject = ""
  75. else:
  76. subject = message['subject']
  77. textbody = ""
  78. htmlbody = ""
  79. attachment_list = []
  80. sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message['message-id'], subject)).hexdigest()
  81. message_parts = [find_submessages(message)]
  82. message_parts = flatten(message_parts)
  83. for part in message_parts:
  84. if part.get_filename() is None:
  85. # Part of the message
  86. if part.get_content_type() == "text/plain":
  87. if textbody == "":
  88. try:
  89. textbody = part.get_payload(decode=True).decode(get_charset(part))
  90. except UnicodeDecodeError:
  91. # This part is probably in windows-1252 encoding
  92. try:
  93. textbody = part.get_payload(decode=True).decode('windows-1252')
  94. except UnicodeDecodeError:
  95. # Ok, we really have no clue how to decode this, we'll just skip it...
  96. continue
  97. except LookupError:
  98. pass
  99. elif part.get_content_type() == "text/html" or part.get_content_type == "text/xhtml+xml" or part.get_content_type == "application/xhtml+xml":
  100. if htmlbody == "":
  101. try:
  102. htmlbody = part.get_payload(decode=True).decode(get_charset(part))
  103. except UnicodeDecodeError:
  104. # This part is probably in windows-1252 encoding
  105. try:
  106. htmlbody = part.get_payload(decode=True).decode('windows-1252')
  107. except UnicodeDecodeError:
  108. # Ok, we really have no clue how to decode this, we'll just skip it...
  109. continue
  110. except LookupError:
  111. pass
  112. else:
  113. # Technically this is supposed to be part of the message body, but we have no idea what format it is in...
  114. print "WARNING: Unknown message format encountered in %s, message may be incomplete." % sha1_hash
  115. else:
  116. # Attachment
  117. attachment_data = part.get_payload(decode=True)
  118. attachment_sha1 = hashlib.sha1(attachment_data).hexdigest()
  119. attachment_filename = part.get_filename()
  120. attachment_type = part.get_content_type()
  121. attachment_extension = os.path.splitext(attachment_filename)[1][1:]
  122. attachment_size = len(attachment_data)
  123. attachment_destination = "%s/%s.%s" % (options['attachment_dir'], attachment_sha1, attachment_extension)
  124. attachment_list.append((attachment_filename, attachment_type, attachment_sha1, attachment_size))
  125. attachment_file = open(attachment_destination, "w")
  126. attachment_file.write(attachment_data)
  127. attachment_file.close()
  128. try:
  129. timestamp = int(time.mktime(email.utils.parsedate(message['date'])))
  130. except TypeError:
  131. timestamp = 0
  132. print "WARNING: Failed to determine unix timestamp for %s." % sha1_hash
  133. cursor.execute("SELECT * FROM emails WHERE `Hash` = ?", (sha1_hash,))
  134. if len(cursor.fetchall()) == 0 or options['forced'] == True:
  135. new_row = (getheader(message['message-id']), getheader(message['from']), getheader(message['to']), getheader(subject), timestamp, textbody, htmlbody, sha1_hash)
  136. cursor.execute("INSERT INTO emails VALUES (?, ?, ?, ?, ?, ?, ?, ?)", new_row)
  137. print "Successfully parsed and inserted e-mail with SHA1 hash %s." % sha1_hash
  138. else:
  139. print "Skipping %s, already exists in the database." % sha1_hash
  140. if len(attachment_list) > 0:
  141. inserted = 0
  142. for attachment in attachment_list:
  143. cursor.execute("SELECT * FROM attachments WHERE `Hash` = ?", (attachment[2],))
  144. if len(cursor.fetchall()) == 0 or options['forced'] == True:
  145. new_row = (sha1_hash, attachment[0], attachment[1], attachment[2], attachment[3])
  146. cursor.execute("INSERT INTO attachments VALUES (?, ?, ?, ?, ?)", new_row)
  147. inserted += 1
  148. else:
  149. print "Skipping attachment %s, already exists in the database." % attachment[2]
  150. if inserted > 0:
  151. print "Successfully inserted %d attachment(s) for %s." % (inserted, sha1_hash)
  152. finished += 1
  153. if finished % 100 == 0:
  154. database.commit()
  155. print "%d e-mails done, commited changes to database." % finished
  156. database.commit()
  157. print "Changes successfully committed to database, all done."