Automatically migrated from Gitolite
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

92 lines
2.8 KiB

  1. #!/usr/bin/python
  2. import os, argparse, hashlib, email, email.header, email.utils, glob
  3. parser = argparse.ArgumentParser(description='Parses emails into an SQLite database and an attachment folder.')
  4. parser.add_argument('-p', dest='pattern', action='store', default='*',
  5. help='glob pattern (including path) that has to be matched for a file to be parsed')
  6. parser.add_argument('-I', '--ignore-invalid', dest='ignore_invalid', action='store_true',
  7. help='process invalid e-mail files anyway, for example when missing message-id headers')
  8. args = parser.parse_args()
  9. options = vars(args)
  10. def getheader(header_text, default="ascii"):
  11. headers = email.header.decode_header(header_text)
  12. try:
  13. header_sections = [unicode(text, charset or default) for text, charset in headers]
  14. except UnicodeDecodeError:
  15. header_sections = [unicode(text, 'windows-1252') for text, charset in headers]
  16. except LookupError:
  17. return u""
  18. return u"".join(header_sections)
  19. def find_submessages(message):
  20. if message.is_multipart():
  21. return [find_submessages(part) for part in message.get_payload()]
  22. else:
  23. return message
  24. def flatten(x):
  25. # http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks
  26. result = []
  27. for el in x:
  28. if hasattr(el, "__iter__") and not isinstance(el, basestring):
  29. result.extend(flatten(el))
  30. else:
  31. result.append(el)
  32. return result
  33. def get_charset(part):
  34. charset = None
  35. if part.get_content_charset():
  36. charset = part.get_content_charset()
  37. elif part.get_charset():
  38. charset = part.get_charset()
  39. if charset is None or charset == "default" or charset.startswith("us-ascii"):
  40. return "ascii"
  41. else:
  42. return charset
  43. # Select all files matching the given pattern
  44. file_list = glob.glob(options['pattern'])
  45. finished = 0
  46. for email_file in file_list:
  47. # To save time when updating a database, let's first check whether the filename is already present in the database as a hash.
  48. # There is no need to check for the format here, since if the filename is not a valid hash, it simply won't match anything.
  49. message = email.message_from_file(open(email_file, 'r'))
  50. if message['message-id'] is None:
  51. if options['ignore_invalid'] == True:
  52. message_id = message['date']
  53. print "WARNING: %s does not contain a valid message-id header. Falling back to date." % email_file
  54. else:
  55. print "%s is not a valid e-mail file." % email_file
  56. finished += 1
  57. continue
  58. else:
  59. message_id = message['message-id']
  60. if 'subject' not in message or message['subject'] is None:
  61. subject = ""
  62. else:
  63. subject = message['subject']
  64. sha1_hash = hashlib.sha1("%s/%s/%s/%s" % (message['from'], message['to'], message_id, subject)).hexdigest()
  65. new_path = "%s/%s.eml" % (os.path.dirname(email_file), sha1_hash)
  66. os.rename(email_file, new_path)
  67. print "%s -> %s" % (email_file, new_path)
  68. finished += 1
  69. print "Renamed %d files." % finished