diff options
author | Kali Kaneko <kali@leap.se> | 2014-01-07 11:34:08 -0400 |
---|---|---|
committer | Kali Kaneko <kali@leap.se> | 2014-01-08 20:53:47 -0400 |
commit | 4ba5d5b405e3c6a6bc997df2073ffc8ea3fa75a9 (patch) | |
tree | 7519ccd4dec15240cc8a89ff34fdc61ee7236141 /src/leap/mail/walk.py | |
parent | a203337d155a6e7186980ef175642adc91d472fe (diff) |
Second stage of the new year's storage rewrite.
* documents of only three types:
* flags
* headers
* content
* add algorithm for walking the parsed message tree.
* treat special cases like a multipart with a single part.
* modify add_msg to use the walk routine
* modify twisted interfaces to use the new storage schema.
* tests for different multipart cases
* fix multipart detection typo in the fetch
This is a merge proposal for the 0.5.0-rc3.
known bugs
----------
Some things are still know not to work well at this point
(some cases of multipart messages do not display the bodies).
IMAP server also is left in a bad internal state after a logout/login.
Diffstat (limited to 'src/leap/mail/walk.py')
-rw-r--r-- | src/leap/mail/walk.py | 160 |
1 files changed, 160 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py new file mode 100644 index 0000000..820b8c7 --- /dev/null +++ b/src/leap/mail/walk.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +# walk.py +# Copyright (C) 2013 LEAP +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +Utilities for walking along a message tree. +""" +import hashlib +import os + +from leap.mail.utils import first + +DEBUG = os.environ.get("BITMASK_MAIL_DEBUG") + +if DEBUG: + get_hash = lambda s: hashlib.sha256(s).hexdigest()[:10] +else: + get_hash = lambda s: hashlib.sha256(s).hexdigest() + + +""" +Get interesting message parts +""" +get_parts = lambda msg: [ + {'multi': part.is_multipart(), + 'ctype': part.get_content_type(), + 'size': len(part.as_string()), + 'parts': len(part.get_payload()) + if isinstance(part.get_payload(), list) + else 1, + 'headers': part.items(), + 'phash': get_hash(part.get_payload()) + if not part.is_multipart() else None} + for part in msg.walk()] + +""" +Utility lambda functions for getting the parts vector and the +payloads from the original message. +""" + +get_parts_vector = lambda parts: (x.get('parts', 1) for x in parts) +get_payloads = lambda msg: ((x.get_payload(), + dict(((str.lower(k), v) for k, v in (x.items())))) + for x in msg.walk()) + +get_body_phash_simple = lambda payloads: first( + [get_hash(payload) for payload, headers in payloads + if "text/plain" in headers.get('content-type')]) + +get_body_phash_multi = lambda payloads: (first( + [get_hash(payload) for payload, headers in payloads + if "text/plain" in headers.get('content-type')]) + or get_body_phash_simple(payloads)) + +""" +On getting the raw docs, we get also some of the headers to be able to +index the content. Here we remove any mutable part, as the the filename +in the content disposition. +""" + +get_raw_docs = lambda msg, parts: ( + {"type": "cnt", # type content they'll be + "raw": payload if not DEBUG else payload[:100], + "phash": get_hash(payload), + "content-disposition": first(headers.get( + 'content-disposition', '').split(';')), + "content-type": headers.get( + 'content-type', ''), + "content-transfer-encoding": headers.get( + 'content-transfer-type', '')} + for payload, headers in get_payloads(msg) + if not isinstance(payload, list)) + + +def walk_msg_tree(parts, body_phash=None): + """ + Take a list of interesting items of a message subparts structure, + and return a dict of dicts almost ready to be written to the content + documents that will be stored in Soledad. + + It walks down the subparts in the parsed message tree, and collapses + the leaf docuents into a wrapper document until no multipart submessages + are left. To achieve this, it iteratively calculates a wrapper vector of + all documents in the sequence that have more than one part and have unitary + documents to their right. To collapse a multipart, take as many + unitary documents as parts the submessage contains, and replace the object + in the sequence with the new wrapper document. + + :param parts: A list of dicts containing the interesting properties for + the message structure. Normally this has been generated by + doing a message walk. + :type parts: list of dicts. + :param body_phash: the payload hash of the body part, to be included + in the outer content doc for convenience. + :type body_phash: basestring or None + """ + # parts vector + pv = list(get_parts_vector(parts)) + + if len(parts) == 2: + inner_headers = parts[1].get("headers", None) + + if DEBUG: + print "parts vector: ", pv + print + + # wrappers vector + getwv = lambda pv: [True if pv[i] != 1 and pv[i + 1] == 1 else False + for i in range(len(pv) - 1)] + wv = getwv(pv) + + # do until no wrapper document is left + while any(wv): + wind = wv.index(True) # wrapper index + nsub = pv[wind] # number of subparts to pick + slic = parts[wind + 1:wind + 1 + nsub] # slice with subparts + + cwra = { + "multi": True, + "part_map": dict((index + 1, part) # content wrapper + for index, part in enumerate(slic)), + "headers": dict(parts[wind]['headers']) + } + + # remove subparts and substitue wrapper + map(lambda i: parts.remove(i), slic) + parts[wind] = cwra + + # refresh vectors for this iteration + pv = list(get_parts_vector(parts)) + wv = getwv(pv) + + outer = parts[0] + outer.pop('headers') + if not "part_map" in outer: + # we have a multipart with 1 part only, so kind of fix it + # although it would be prettier if I take this special case at + # the beginning of the walk. + pdoc = {"multi": True, + "part_map": {1: outer}} + pdoc["part_map"][1]["multi"] = False + if not pdoc["part_map"][1].get("phash", None): + pdoc["part_map"][1]["phash"] = body_phash + pdoc["part_map"][1]["headers"] = inner_headers + else: + pdoc = outer + pdoc["body"] = body_phash + return pdoc |