1 files changed, 212 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py
new file mode 100644
index 0000000..f747377
--- /dev/null
+++ b/src/leap/mail/walk.py
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+# walk.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+Utilities for walking along a message tree.
+"""
+import os
+
+from pycryptopp.hash import sha256
+
+from leap.mail.utils import first
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+# get_hash returns the SHA256 hexdigest of its argument, cut down to its
+# first ten characters when BITMASK_MAIL_DEBUG holds a non-empty value.
+_HASH_LEN = 10 if DEBUG else None
+get_hash = lambda s: sha256.SHA256(s).hexdigest()[:_HASH_LEN]
+
+
+"""
+Get interesting message parts
+"""
+# One dict per walked part: multipart flag, content type, serialized size,
+# number of subparts (1 for leaves), header items and, for leaves, the phash.
+_count_subparts = lambda pl: len(pl) if isinstance(pl, list) else 1
+get_parts = lambda msg: [
+    dict(multi=part.is_multipart(),
+         ctype=part.get_content_type(),
+         size=len(part.as_string()),
+         parts=_count_subparts(part.get_payload()),
+         headers=part.items(),
+         phash=None if part.is_multipart() else get_hash(part.get_payload()))
+    for part in msg.walk()]
+
+"""
+Utility lambda functions for getting the parts vector and the
+payloads from the original message.
+"""
+
+# Lazy: subpart count per part dict; (payload, lowercased headers) per part
+get_parts_vector = lambda parts: (d.get('parts', 1) for d in parts)
+get_payloads = lambda msg: ((sub.get_payload(), dict(
+    (str.lower(k), v) for k, v in sub.items())) for sub in msg.walk())
+
+# Both take the (payload, headers) pairs from get_payloads(), skipping list
+# payloads (multipart containers). The multi variant copies the pairs, since
+# a generator would be exhausted by the time the fallback scan runs.
+get_body_phash_simple = lambda payloads: first(
+    [get_hash(p) for p, _ in payloads if not isinstance(p, list)])
+get_body_phash_multi = lambda payloads: (lambda pairs: first(
+    [get_hash(p) for p, hdrs in pairs if not isinstance(p, list)
+     and "text/plain" in hdrs.get('content-type', '')])
+    or get_body_phash_simple(pairs))(list(payloads))
+
+"""
+On getting the raw docs, we also get some of the headers to be able to
+index the content. Here we remove any mutable part, such as the filename
+in the content disposition.
+"""
+
+get_raw_docs = lambda msg, parts: (  # NOTE: ``parts`` is not used
+    {"type": "cnt",  # type content they'll be
+     "raw": payload if not DEBUG else payload[:100],
+     "phash": get_hash(payload),
+     "content-disposition": first(headers.get(
+         'content-disposition', '').split(';')),
+     "content-type": headers.get('content-type', ''),
+     # the MIME header is Content-Transfer-Encoding (there is no -Type one)
+     "content-transfer-encoding": headers.get(
+         'content-transfer-encoding', '')}
+    for payload, headers in get_payloads(msg)  # lowercased header names
+    if not isinstance(payload, list))  # multipart containers are skipped
+
+
+def walk_msg_tree(parts, body_phash=None):
+    """
+    Take a list of interesting items of a message subparts structure,
+    and return a dict of dicts almost ready to be written to the content
+    documents that will be stored in Soledad.
+
+    It walks down the subparts in the parsed message tree, and collapses
+    the leaf documents into a wrapper document until no multipart submessages
+    are left. To achieve this, it iteratively calculates a wrapper vector of
+    all documents in the sequence that have more than one part and have unitary
+    documents to their right. To collapse a multipart, take as many
+    unitary documents as parts the submessage contains, and replace the object
+    in the sequence with the new wrapper document.
+
+    Note that ``parts`` is modified in place, and that it must not be empty.
+
+    :param parts: A list of dicts containing the interesting properties for
+                  the message structure. Normally this has been generated by
+                  doing a message walk.
+    :type parts: list of dicts.
+    :param body_phash: the payload hash of the body part, to be included
+                       in the outer content doc for convenience.
+    :type body_phash: basestring or None
+    :returns: the outer document, holding the part map and the body phash.
+    :rtype: dict
+    """
+    PART_MAP = "part_map"
+    MULTI = "multi"
+    HEADERS = "headers"
+    PHASH = "phash"
+    BODY = "body"
+
+    # parts vector
+    pv = list(get_parts_vector(parts))
+
+    inner_headers = parts[1].get(HEADERS) if len(parts) == 2 else None
+
+    if DEBUG:
+        print "parts vector: ", pv
+        print
+
+    # wrappers vector: True where a non-unitary entry precedes a unitary one
+    getwv = lambda pv: [cur != 1 and nxt == 1 for cur, nxt in zip(pv, pv[1:])]
+    wv = getwv(pv)
+
+    # do until no wrapper document is left
+    while any(wv):
+        wind = wv.index(True)  # wrapper index
+        nsub = pv[wind]  # number of subparts to pick
+        slic = parts[wind + 1:wind + 1 + nsub]  # slice with subparts
+
+        cwra = {
+            MULTI: True,
+            PART_MAP: dict((index + 1, part)  # content wrapper
+                           for index, part in enumerate(slic)),
+            HEADERS: dict(parts[wind][HEADERS])
+        }
+
+        # remove subparts and substitute wrapper; deleting by position, as
+        # list.remove() could drop an equal part found earlier in the list
+        del parts[wind + 1:wind + 1 + nsub]
+        parts[wind] = cwra
+
+        # refresh vectors for this iteration
+        pv = list(get_parts_vector(parts))
+        wv = getwv(pv)
+
+    if all(x == 1 for x in pv):
+        # special case in the rightmost element
+        main_pmap = parts[0].get(PART_MAP, None)
+        if main_pmap is not None:
+            last_part = max(main_pmap.keys())
+            # replaces any part map the last part had; keys start at 0
+            main_pmap[last_part][PART_MAP] = dict(enumerate(parts[1:]))
+
+    outer = parts[0]
+    outer.pop(HEADERS)
+    if PART_MAP not in outer:
+        # we have a multipart with 1 part only, so kind of fix it
+        # although it would be prettier if I take this special case at
+        # the beginning of the walk.
+        pdoc = {MULTI: True, PART_MAP: {1: outer}}
+        pdoc[PART_MAP][1][MULTI] = False
+        if not pdoc[PART_MAP][1].get(PHASH, None):
+            pdoc[PART_MAP][1][PHASH] = body_phash
+        if inner_headers:
+            pdoc[PART_MAP][1][HEADERS] = inner_headers
+    else:
+        pdoc = outer
+    pdoc[BODY] = body_phash
+    return pdoc
+
+"""
+Groucho Marx: Now pay particular attention to this first clause, because it's
+              most important. There's the party of the first part shall be
+              known in this contract as the party of the first part. How do you
+              like that, that's pretty neat eh?
+
+Chico Marx: No, that's no good.
+Groucho Marx: What's the matter with it?
+
+Chico Marx: I don't know, let's hear it again.
+Groucho Marx: So the party of the first part shall be known in this contract as
+              the party of the first part.
+
+Chico Marx: Well it sounds a little better this time.
+Groucho Marx: Well, it grows on you. Would you like to hear it once more?
+
+Chico Marx: Just the first part.
+Groucho Marx: All right. It says the first part of the party of the first part
+              shall be known in this contract as the first part of the party of
+              the first part, shall be known in this contract - look, why
+              should we quarrel about a thing like this, we'll take it right
+              out, eh?
+
+Chico Marx: Yes, it's too long anyhow. Now what have we got left?
+Groucho Marx: Well I've got about a foot and a half. Now what's the matter?
+
+Chico Marx: I don't like the second party either.
+"""
+
+"""
+I feel you deserved it after reading the above and debugging your problem ;)
+"""