1 files changed, 46 insertions, 165 deletions
diff --git a/mail/src/leap/mail/walk.py b/mail/src/leap/mail/walk.py
index c116601..d143d61 100644
--- a/mail/src/leap/mail/walk.py
+++ b/mail/src/leap/mail/walk.py
@@ -15,8 +15,11 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
-Utilities for walking along a message tree.
+Walk a message tree and generate documents that can be inserted in the backend
+store.
 """
+from email.parser import Parser
+
 from cryptography.hazmat.backends.multibackend import MultiBackend
 from cryptography.hazmat.backends.openssl.backend import (
     Backend as OpenSSLBackend)
@@ -26,49 +29,32 @@ from leap.mail.utils import first
 
 crypto_backend = MultiBackend([OpenSSLBackend()])
 
+_parser = Parser()
 
-def get_hash(s):
-    digest = hashes.Hash(hashes.SHA256(), crypto_backend)
-    digest.update(s)
-    return digest.finalize().encode("hex").upper()
-
-
-"""
-Get interesting message parts
-"""
-
-
-def get_parts(msg):
-    return [
-        {
-            'multi': part.is_multipart(),
-            'ctype': part.get_content_type(),
-            'size': len(part.as_string()),
-            'parts':
-                len(part.get_payload())
-                if isinstance(part.get_payload(), list)
-                else 1,
-            'headers': part.items(),
-            'phash':
-                get_hash(part.get_payload())
-                if not part.is_multipart()
-                else None
-        } for part in msg.walk()]
-
-"""
-Utility lambda functions for getting the parts vector and the
-payloads from the original message.
-"""
 
+def get_tree(msg):
+    p = {}
+    p['ctype'] = msg.get_content_type()
+    p['headers'] = msg.items()
 
-def get_parts_vector(parts):
-    return (x.get('parts', 1) for x in parts)
+    payload = msg.get_payload()
+    is_multi = msg.is_multipart()
+    if is_multi:
+        p['part_map'] = dict(
+            [(idx, get_tree(part)) for idx, part in enumerate(payload, 1)])
+        p['parts'] = len(payload)
+        p['phash'] = None
+    else:
+        p['parts'] = 0
+        p['size'] = len(payload)
+        p['phash'] = get_hash(payload)
+        p['part_map'] = {}
+    p['multi'] = is_multi
+    return p
 
 
-def get_payloads(msg):
-    return ((x.get_payload(),
-            dict(((str.lower(k), v) for k, v in (x.items()))))
-            for x in msg.walk())
+def get_tree_from_string(messagestr):
+    return get_tree(_parser.parsestr(messagestr))
 
 
 def get_body_phash(msg):
@@ -81,27 +67,29 @@ def get_body_phash(msg):
             # XXX avoid hashing again
             return get_hash(part.get_payload())
 
-"""
-On getting the raw docs, we get also some of the headers to be able to
-index the content. Here we remove any mutable part, as the the filename
-in the content disposition.
-"""
-
 
-def get_raw_docs(msg, parts):
+def get_raw_docs(msg):
+    """
+    We also get some of the headers to be able to
+    index the content. Here we remove any mutable part, such as the filename
+    in the content disposition.
+    """
     return (
-        {
-            "type": "cnt",  # type content they'll be
-            "raw": payload,
-            "phash": get_hash(payload),
-            "content-disposition": first(headers.get(
-                'content-disposition', '').split(';')),
-            "content-type": headers.get(
-                'content-type', ''),
-            "content-transfer-encoding": headers.get(
-                'content-transfer-encoding', '')
-        } for payload, headers in get_payloads(msg)
-        if not isinstance(payload, list))
+        {'type': 'cnt',
+         'raw': part.get_payload(),
+         'phash': get_hash(part.get_payload()),
+         'content-type': part.get_content_type(),
+         'content-disposition': first(part.get(
+             'content-disposition', '').split(';')),
+         'content-transfer-encoding': part.get(
+             'content-transfer-encoding', '')
+         } for part in msg.walk() if not isinstance(part.get_payload(), list))
+
+
+def get_hash(s):
+    digest = hashes.Hash(hashes.SHA256(), crypto_backend)
+    digest.update(s)
+    return digest.finalize().encode("hex").upper()
 
 
 """
@@ -116,111 +104,4 @@ Groucho Marx: What's the matter with it?
 Chico Marx: I don't know, let's hear it again.
 Groucho Marx: So the party of the first part shall be known in this contract as
               the party of the first part.
-
-Chico Marx: Well it sounds a little better this time.
-Groucho Marx: Well, it grows on you. Would you like to hear it once more?
-
-Chico Marx: Just the first part.
-Groucho Marx: All right. It says the first part of the party of the first part
-              shall be known in this contract as the first part of the party of
-              the first part, shall be known in this contract - look, why
-              should we quarrel about a thing like this, we'll take it right
-              out, eh?
-
-Chico Marx: Yes, it's too long anyhow. Now what have we got left?
-Groucho Marx: Well I've got about a foot and a half. Now what's the matter?
-
-Chico Marx: I don't like the second party either.
 """
-
-
-def walk_msg_tree(parts, body_phash=None):
-    """
-    Take a list of interesting items of a message subparts structure,
-    and return a dict of dicts almost ready to be written to the content
-    documents that will be stored in Soledad.
-
-    It walks down the subparts in the parsed message tree, and collapses
-    the leaf documents into a wrapper document until no multipart submessages
-    are left. To achieve this, it iteratively calculates a wrapper vector of
-    all documents in the sequence that have more than one part and have unitary
-    documents to their right. To collapse a multipart, take as many
-    unitary documents as parts the submessage contains, and replace the object
-    in the sequence with the new wrapper document.
-
-    :param parts: A list of dicts containing the interesting properties for
-                  the message structure. Normally this has been generated by
-                  doing a message walk.
-    :type parts: list of dicts.
-    :param body_phash: the payload hash of the body part, to be included
-                       in the outer content doc for convenience.
-    :type body_phash: basestring or None
-    """
-    PART_MAP = "part_map"
-    MULTI = "multi"
-    HEADERS = "headers"
-    PHASH = "phash"
-    BODY = "body"
-
-    # parts vector
-    pv = list(get_parts_vector(parts))
-
-    inner_headers = parts[1].get(HEADERS, None) if (
-        len(parts) == 2) else None
-
-    # wrappers vector
-    def getwv(pv):
-        return [
-            True if pv[i] != 1 and pv[i + 1] == 1
-            else False
-            for i in range(len(pv) - 1)
-        ]
-    wv = getwv(pv)
-
-    # do until no wrapper document is left
-    while any(wv):
-        wind = wv.index(True)  # wrapper index
-        nsub = pv[wind]  # number of subparts to pick
-        slic = parts[wind + 1:wind + 1 + nsub]  # slice with subparts
-
-        cwra = {
-            MULTI: True,
-            PART_MAP: dict((index + 1, part)  # content wrapper
-                           for index, part in enumerate(slic)),
-            HEADERS: dict(parts[wind][HEADERS])
-        }
-
-        # remove subparts and substitute wrapper
-        map(lambda i: parts.remove(i), slic)
-        parts[wind] = cwra
-
-        # refresh vectors for this iteration
-        pv = list(get_parts_vector(parts))
-        wv = getwv(pv)
-
-    if all(x == 1 for x in pv):
-        # special case in the rightmost element
-        main_pmap = parts[0].get(PART_MAP, None)
-        if main_pmap is not None:
-            last_part = max(main_pmap.keys())
-            main_pmap[last_part][PART_MAP] = {}
-            for partind in range(len(pv) - 1):
-                main_pmap[last_part][PART_MAP][partind] = parts[partind + 1]
-
-    outer = parts[0]
-    outer.pop(HEADERS)
-    if PART_MAP not in outer:
-        # we have a multipart with 1 part only, so kind of fix it
-        # although it would be prettier if I take this special case at
-        # the beginning of the walk.
-        pdoc = {MULTI: True,
-                PART_MAP: {1: outer}}
-        pdoc[PART_MAP][1][MULTI] = False
-        if not pdoc[PART_MAP][1].get(PHASH, None):
-            pdoc[PART_MAP][1][PHASH] = body_phash
-        if inner_headers:
-            pdoc[PART_MAP][1][HEADERS] = inner_headers
-    else:
-        pdoc = outer
-    pdoc[BODY] = body_phash
-    return pdoc