diff options
Diffstat (limited to 'src/leap/mail/walk.py')
-rw-r--r-- | src/leap/mail/walk.py | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py new file mode 100644 index 0000000..f747377 --- /dev/null +++ b/src/leap/mail/walk.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# walk.py +# Copyright (C) 2013 LEAP +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +Utilities for walking along a message tree. +""" +import os + +from pycryptopp.hash import sha256 + +from leap.mail.utils import first + +DEBUG = os.environ.get("BITMASK_MAIL_DEBUG") + +if DEBUG: + get_hash = lambda s: sha256.SHA256(s).hexdigest()[:10] +else: + get_hash = lambda s: sha256.SHA256(s).hexdigest() + + +""" +Get interesting message parts +""" +get_parts = lambda msg: [ + {'multi': part.is_multipart(), + 'ctype': part.get_content_type(), + 'size': len(part.as_string()), + 'parts': len(part.get_payload()) + if isinstance(part.get_payload(), list) + else 1, + 'headers': part.items(), + 'phash': get_hash(part.get_payload()) + if not part.is_multipart() else None} + for part in msg.walk()] + +""" +Utility lambda functions for getting the parts vector and the +payloads from the original message. +""" + +get_parts_vector = lambda parts: (x.get('parts', 1) for x in parts) +get_payloads = lambda msg: ((x.get_payload(), + dict(((str.lower(k), v) for k, v in (x.items())))) + for x in msg.walk()) + +get_body_phash_simple = lambda payloads: first( + [get_hash(payload) for payload, headers in payloads + if payloads]) + +get_body_phash_multi = lambda payloads: (first( + [get_hash(payload) for payload, headers in payloads + if payloads + and "text/plain" in headers.get('content-type', '')]) + or get_body_phash_simple(payloads)) + +""" +On getting the raw docs, we get also some of the headers to be able to +index the content. Here we remove any mutable part, as the the filename +in the content disposition. +""" + +get_raw_docs = lambda msg, parts: ( + {"type": "cnt", # type content they'll be + "raw": payload if not DEBUG else payload[:100], + "phash": get_hash(payload), + "content-disposition": first(headers.get( + 'content-disposition', '').split(';')), + "content-type": headers.get( + 'content-type', ''), + "content-transfer-encoding": headers.get( + 'content-transfer-type', '')} + for payload, headers in get_payloads(msg) + if not isinstance(payload, list)) + + +def walk_msg_tree(parts, body_phash=None): + """ + Take a list of interesting items of a message subparts structure, + and return a dict of dicts almost ready to be written to the content + documents that will be stored in Soledad. + + It walks down the subparts in the parsed message tree, and collapses + the leaf docuents into a wrapper document until no multipart submessages + are left. To achieve this, it iteratively calculates a wrapper vector of + all documents in the sequence that have more than one part and have unitary + documents to their right. To collapse a multipart, take as many + unitary documents as parts the submessage contains, and replace the object + in the sequence with the new wrapper document. + + :param parts: A list of dicts containing the interesting properties for + the message structure. Normally this has been generated by + doing a message walk. + :type parts: list of dicts. + :param body_phash: the payload hash of the body part, to be included + in the outer content doc for convenience. + :type body_phash: basestring or None + """ + PART_MAP = "part_map" + MULTI = "multi" + HEADERS = "headers" + PHASH = "phash" + BODY = "body" + + # parts vector + pv = list(get_parts_vector(parts)) + + inner_headers = parts[1].get(HEADERS, None) if ( + len(parts) == 2) else None + + if DEBUG: + print "parts vector: ", pv + print + + # wrappers vector + getwv = lambda pv: [True if pv[i] != 1 and pv[i + 1] == 1 else False + for i in range(len(pv) - 1)] + wv = getwv(pv) + + # do until no wrapper document is left + while any(wv): + wind = wv.index(True) # wrapper index + nsub = pv[wind] # number of subparts to pick + slic = parts[wind + 1:wind + 1 + nsub] # slice with subparts + + cwra = { + MULTI: True, + PART_MAP: dict((index + 1, part) # content wrapper + for index, part in enumerate(slic)), + HEADERS: dict(parts[wind][HEADERS]) + } + + # remove subparts and substitue wrapper + map(lambda i: parts.remove(i), slic) + parts[wind] = cwra + + # refresh vectors for this iteration + pv = list(get_parts_vector(parts)) + wv = getwv(pv) + + if all(x == 1 for x in pv): + # special case in the rightmost element + main_pmap = parts[0].get(PART_MAP, None) + if main_pmap is not None: + last_part = max(main_pmap.keys()) + main_pmap[last_part][PART_MAP] = {} + for partind in range(len(pv) - 1): + print partind+1, len(parts) + main_pmap[last_part][PART_MAP][partind] = parts[partind + 1] + + outer = parts[0] + outer.pop(HEADERS) + if not PART_MAP in outer: + # we have a multipart with 1 part only, so kind of fix it + # although it would be prettier if I take this special case at + # the beginning of the walk. + pdoc = {MULTI: True, + PART_MAP: {1: outer}} + pdoc[PART_MAP][1][MULTI] = False + if not pdoc[PART_MAP][1].get(PHASH, None): + pdoc[PART_MAP][1][PHASH] = body_phash + if inner_headers: + pdoc[PART_MAP][1][HEADERS] = inner_headers + else: + pdoc = outer + pdoc[BODY] = body_phash + return pdoc + +""" +Groucho Marx: Now pay particular attention to this first clause, because it's + most important. There's the party of the first part shall be + known in this contract as the party of the first part. How do you + like that, that's pretty neat eh? + +Chico Marx: No, that's no good. +Groucho Marx: What's the matter with it? + +Chico Marx: I don't know, let's hear it again. +Groucho Marx: So the party of the first part shall be known in this contract as + the party of the first part. + +Chico Marx: Well it sounds a little better this time. +Groucho Marx: Well, it grows on you. Would you like to hear it once more? + +Chico Marx: Just the first part. +Groucho Marx: All right. It says the first part of the party of the first part + shall be known in this contract as the first part of the party of + the first part, shall be known in this contract - look, why + should we quarrel about a thing like this, we'll take it right + out, eh? + +Chico Marx: Yes, it's too long anyhow. Now what have we got left? +Groucho Marx: Well I've got about a foot and a half. Now what's the matter? + +Chico Marx: I don't like the second party either. +""" + +""" +I feel you deserved it after reading the above and try to debug your problem ;) +""" |