diff options
Diffstat (limited to 'src/leap/bitmask/mail/walk.py')
-rw-r--r-- | src/leap/bitmask/mail/walk.py | 107 |
1 files changed, 107 insertions, 0 deletions
# Source: diff --git a/src/leap/bitmask/mail/walk.py (new file, index d143d61)
# -*- coding: utf-8 -*-
# walk.py
# Copyright (C) 2013-2015 LEAP
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Walk a message tree and generate documents that can be inserted in the backend
store.
"""
from email.parser import Parser

from cryptography.hazmat.backends.multibackend import MultiBackend
from cryptography.hazmat.backends.openssl.backend import (
    Backend as OpenSSLBackend)
from cryptography.hazmat.primitives import hashes

from leap.mail.utils import first

crypto_backend = MultiBackend([OpenSSLBackend()])

_parser = Parser()


def get_tree(msg):
    """
    Build a nested dict describing the structure of a message.

    The returned dict always carries these keys:

    - ``ctype``: the content type of the message.
    - ``headers``: the header items of the message, as given by
      ``msg.items()``.
    - ``multi``: whether the message is multipart.
    - ``parts``: number of direct sub-parts (0 for a non-multipart message).
    - ``part_map``: dict mapping the 1-based index of each direct sub-part
      to its own tree (empty for a non-multipart message).
    - ``phash``: hash of the payload, or None for a multipart message.

    A non-multipart message additionally carries ``size``, the length of
    its payload.

    :param msg: the message (or message part) to describe.
    :return: the tree describing the message.
    :rtype: dict
    """
    payload = msg.get_payload()
    is_multi = msg.is_multipart()

    tree = {
        'ctype': msg.get_content_type(),
        'headers': msg.items(),
        'multi': is_multi,
    }
    if is_multi:
        # Sub-parts are numbered starting at 1.
        tree['part_map'] = {
            idx: get_tree(part) for idx, part in enumerate(payload, 1)}
        tree['parts'] = len(payload)
        tree['phash'] = None
    else:
        tree['part_map'] = {}
        tree['parts'] = 0
        tree['size'] = len(payload)
        tree['phash'] = get_hash(payload)
    return tree


def get_tree_from_string(messagestr):
    """
    Parse a raw message string and return its tree.

    :param messagestr: the full text of the message.
    :return: the tree for the parsed message, as built by ``get_tree``.
    :rtype: dict
    """
    return get_tree(_parser.parsestr(messagestr))


def get_body_phash(msg):
    """
    Find the body payload-hash for this message.

    The body is taken to be the first part found while walking the message
    whose content type is ``text/plain`` or ``text/html``.

    :param msg: the message to inspect.
    :return: the hash of the body payload, or None if no part qualifies.
    """
    for part in msg.walk():
        # XXX what other ctypes should be considered body?
        if part.get_content_type() in ("text/plain", "text/html"):
            # XXX avoid hashing again
            return get_hash(part.get_payload())
    return None


def _get_raw_doc(part):
    """
    Build the content document for a single non-multipart part.

    The payload is fetched only once and reused for both the raw content
    and its hash.
    """
    payload = part.get_payload()
    return {
        'type': 'cnt',
        'raw': payload,
        'phash': get_hash(payload),
        'content-type': part.get_content_type(),
        # Only the first ';'-separated token of the disposition is kept,
        # which leaves out mutable parameters such as the filename.
        'content-disposition': first(part.get(
            'content-disposition', '').split(';')),
        'content-transfer-encoding': part.get(
            'content-transfer-encoding', ''),
    }


def get_raw_docs(msg):
    """
    Generate one content document for every non-multipart part of a message.

    We also get some of the headers to be able to index the content. Here
    we remove any mutable part, such as the filename in the content
    disposition.

    :param msg: the message to walk.
    :return: a generator of dicts, one per part whose payload is not a list.
    """
    return (
        _get_raw_doc(part) for part in msg.walk()
        if not isinstance(part.get_payload(), list))


def get_hash(s):
    """
    Return the uppercase hexadecimal SHA-256 digest of a string.

    :param s: the data to hash.
    :return: the digest, hex-encoded and uppercased.
    """
    digest = hashes.Hash(hashes.SHA256(), crypto_backend)
    digest.update(s)
    # NOTE: the "hex" codec on byte strings is only available in Python 2.
    return digest.finalize().encode("hex").upper()


"""
Groucho Marx: Now pay particular attention to this first clause, because it's
              most important. There's the party of the first part shall be
              known in this contract as the party of the first part. How do you
              like that, that's pretty neat eh?

Chico Marx: No, that's no good.
Groucho Marx: What's the matter with it?

Chico Marx: I don't know, let's hear it again.
Groucho Marx: So the party of the first part shall be known in this contract as
              the party of the first part.
"""