Second stage of the new year's storage rewrite.

* documents of only three types: * flags * headers * content * add algorithm for walking the parsed message tree. * treat special cases like a multipart with a single part. * modify add_msg to use the walk routine * modify twisted interfaces to use the new storage schema. * tests for different multipart cases * fix multipart detection typo in the fetch This is a merge proposal for the 0.5.0-rc3. known bugs ---------- Some things are still known not to work well at this point (some cases of multipart messages do not display the bodies). The IMAP server is also left in a bad internal state after a logout/login.
author: Kali Kaneko <kali@leap.se> 2014-01-07 11:34:08 -0400
committer: Kali Kaneko <kali@leap.se> 2014-01-08 20:53:47 -0400
commit: 4ba5d5b405e3c6a6bc997df2073ffc8ea3fa75a9 (patch)
tree: 7519ccd4dec15240cc8a89ff34fdc61ee7236141 /src/leap/mail/walk.py
parent: a203337d155a6e7186980ef175642adc91d472fe (diff)
1 files changed, 160 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py
new file mode 100644
index 0000000..820b8c7
--- /dev/null
+++ b/src/leap/mail/walk.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+# walk.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+Utilities for walking along a message tree.
+"""
+import hashlib
+import os
+
+from leap.mail.utils import first
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+# Payload hash: sha256 hex digest, cut to its first 10 chars in debug mode.
+_full_digest = lambda s: hashlib.sha256(s).hexdigest()
+_short_digest = lambda s: _full_digest(s)[:10]
+get_hash = _short_digest if DEBUG else _full_digest
+
+
+# Collect the interesting properties of every part found by msg.walk():
+# multipart flag, content type, serialized size, number of direct subparts
+# (1 for leaves), header items, and the payload hash (None for multiparts).
+get_parts = lambda msg: [
+    dict(multi=part.is_multipart(),
+         ctype=part.get_content_type(),
+         size=len(part.as_string()),
+         parts=(len(part.get_payload())
+                if isinstance(part.get_payload(), list)
+                else 1),
+         headers=part.items(),
+         phash=(None if part.is_multipart()
+                else get_hash(part.get_payload())))
+    for part in msg.walk()]
+
+# Helpers yielding, lazily, the number of subparts of each walked part, and
+# the (payload, headers) pair of each part of the original message, with
+# header names lowercased (a repeated header keeps its last value).
+
+get_parts_vector = lambda parts: (part.get('parts', 1) for part in parts)
+get_payloads = lambda msg: (
+    (part.get_payload(), dict((str.lower(name), value)
+                              for name, value in part.items()))
+    for part in msg.walk())
+
+# Hash of the first text/plain payload in an iterable of (payload, headers)
+# pairs. A part lacking Content-Type is treated as text/plain (the RFC 2045
+# default) rather than failing on the membership test against None.
+get_body_phash_simple = lambda payloads: first(
+    [get_hash(payload) for payload, headers in payloads
+     if "text/plain" in headers.get('content-type', 'text/plain')])
+
+get_body_phash_multi = lambda payloads: get_body_phash_simple(payloads)
+
+# Lazily build one "cnt" (content) document per non-list payload of msg.
+# Besides the payload and its hash we also keep some of the headers, to be
+# able to index the content; mutable bits such as the filename are dropped
+# by keeping only the content disposition text before the first ';'. In
+# debug mode only 100 payload chars are stored. `parts` is not used.
+
+get_raw_docs = lambda msg, parts: (
+    {"type": "cnt",  # type content they'll be
+     "raw": payload if not DEBUG else payload[:100],
+     "phash": get_hash(payload),
+     "content-disposition": first(headers.get(
+         'content-disposition', '').split(';')),
+     "content-type": headers.get(
+         'content-type', ''),
+     "content-transfer-encoding": headers.get(
+         'content-transfer-encoding', '')}  # MIME header is -Encoding
+    for payload, headers in get_payloads(msg)
+    if not isinstance(payload, list))
+
+
+def walk_msg_tree(parts, body_phash=None):
+    """
+    Take a list of interesting items of a message subparts structure,
+    and return a dict of dicts almost ready to be written to the content
+    documents that will be stored in Soledad.
+
+    It walks down the subparts in the parsed message tree, and collapses
+    the leaf documents into a wrapper document until no multipart submessages
+    are left: every multipart entry directly followed by a unitary entry
+    takes as many of the following entries as parts it declares, and is
+    replaced in the sequence by the new wrapper document. Note that the
+    ``parts`` list and the dicts it holds are modified in place.
+
+    :param parts: A list of dicts containing the interesting properties for
+                  the message structure. Normally this has been generated by
+                  doing a message walk.
+    :type parts: list of dicts.
+    :param body_phash: the payload hash of the body part, to be included
+                       in the outer content doc for convenience.
+    :type body_phash: basestring or None
+    :returns: dict, the outer wrapper document with a "body" key added.
+    """
+    # parts vector
+    pv = list(get_parts_vector(parts))
+
+    # headers of the lone subpart; None for any other number of parts
+    inner_headers = parts[1].get("headers", None) if len(parts) == 2 else None
+
+    if DEBUG:
+        print "parts vector: ", pv
+        print
+
+    # wrappers vector
+    getwv = lambda pv: [True if pv[i] != 1 and pv[i + 1] == 1 else False
+                        for i in range(len(pv) - 1)]
+    wv = getwv(pv)
+
+    # do until no wrapper document is left
+    while any(wv):
+        wind = wv.index(True)  # wrapper index
+        nsub = pv[wind]  # number of subparts to pick
+        slic = parts[wind + 1:wind + 1 + nsub]  # slice with subparts
+
+        cwra = {
+            "multi": True,
+            "part_map": dict((index + 1, part)  # content wrapper
+                             for index, part in enumerate(slic)),
+            "headers": dict(parts[wind]['headers'])
+        }
+
+        # drop the subparts by position and substitute the wrapper
+        del parts[wind + 1:wind + 1 + nsub]
+        parts[wind] = cwra
+
+        # refresh vectors for this iteration
+        pv = list(get_parts_vector(parts))
+        wv = getwv(pv)
+
+    outer = parts[0]
+    outer.pop('headers')
+    if "part_map" not in outer:
+        # no wrapper was built: we have a multipart with 1 part only (or a
+        # plain single-part message), so wrap it here, although it would be
+        # prettier to take this special case at the beginning of the walk.
+        pdoc = {"multi": True,
+                "part_map": {1: outer}}
+        pdoc["part_map"][1]["multi"] = False
+        if not pdoc["part_map"][1].get("phash", None):
+            pdoc["part_map"][1]["phash"] = body_phash
+        pdoc["part_map"][1]["headers"] = inner_headers
+    else:
+        pdoc = outer
+    pdoc["body"] = body_phash
+    return pdoc
author	Kali Kaneko <kali@leap.se>	2014-01-07 11:34:08 -0400
committer	Kali Kaneko <kali@leap.se>	2014-01-08 20:53:47 -0400
commit	4ba5d5b405e3c6a6bc997df2073ffc8ea3fa75a9 (patch)
tree	7519ccd4dec15240cc8a89ff34fdc61ee7236141 /src/leap/mail/walk.py
parent	a203337d155a6e7186980ef175642adc91d472fe (diff)