summaryrefslogtreecommitdiff
path: root/src/leap/mail/walk.py
diff options
context:
space:
mode:
authorKali Kaneko <kali@leap.se>2014-01-07 11:34:08 -0400
committerKali Kaneko <kali@leap.se>2014-01-08 20:53:47 -0400
commit4ba5d5b405e3c6a6bc997df2073ffc8ea3fa75a9 (patch)
tree7519ccd4dec15240cc8a89ff34fdc61ee7236141 /src/leap/mail/walk.py
parenta203337d155a6e7186980ef175642adc91d472fe (diff)
Second stage of the new year's storage rewrite.
* documents of only three types: * flags * headers * content * add algorithm for walking the parsed message tree. * treat special cases like a multipart with a single part. * modify add_msg to use the walk routine * modify twisted interfaces to use the new storage schema. * tests for different multipart cases * fix multipart detection typo in the fetch This is a merge proposal for the 0.5.0-rc3. known bugs ---------- Some things are still known not to work well at this point (some cases of multipart messages do not display the bodies). The IMAP server is also left in a bad internal state after a logout/login.
Diffstat (limited to 'src/leap/mail/walk.py')
-rw-r--r--src/leap/mail/walk.py160
1 files changed, 160 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py
new file mode 100644
index 0000000..820b8c7
--- /dev/null
+++ b/src/leap/mail/walk.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+# walk.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Utilities for walking along a message tree.
+"""
+import hashlib
+import os
+
+from leap.mail.utils import first
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+
+def _full_hash(s):
+    """
+    Return the complete hexadecimal SHA-256 digest of the given data.
+
+    :param s: the data to be hashed.
+    :returns: the hex digest.
+    :rtype: str
+    """
+    return hashlib.sha256(s).hexdigest()
+
+
+def _short_hash(s):
+    """
+    Return only the first 10 characters of the hexadecimal SHA-256 digest.
+
+    :param s: the data to be hashed.
+    :returns: the truncated hex digest.
+    :rtype: str
+    """
+    return hashlib.sha256(s).hexdigest()[:10]
+
+
+# The hashing flavour is picked once, at import time: truncated digests
+# when BITMASK_MAIL_DEBUG is set in the environment, full digests otherwise.
+get_hash = _short_hash if DEBUG else _full_hash
+
+
+def get_parts(msg):
+    """
+    Get interesting message parts.
+
+    Walks the message and collects, for every part visited, a dict with:
+
+    * ``multi``: whether the part is multipart.
+    * ``ctype``: the content type of the part.
+    * ``size``: the length of the part serialized as a string.
+    * ``parts``: the number of items in the payload if it is a list,
+      1 otherwise.
+    * ``headers``: the list of (name, value) header pairs.
+    * ``phash``: the hash of the payload, or None for multipart parts.
+
+    :param msg: the parsed message; it has to provide ``walk()``.
+    :returns: one dict per part, in walk order.
+    :rtype: list of dicts
+    """
+    collected = []
+    for part in msg.walk():
+        payload = part.get_payload()
+        is_multi = part.is_multipart()
+
+        if isinstance(payload, list):
+            nparts = len(payload)
+        else:
+            nparts = 1
+
+        # containers have no content of their own to be hashed
+        phash = None if is_multi else get_hash(payload)
+
+        collected.append({
+            'multi': is_multi,
+            'ctype': part.get_content_type(),
+            'size': len(part.as_string()),
+            'parts': nparts,
+            'headers': part.items(),
+            'phash': phash})
+    return collected
+
+# Utility functions for getting the parts vector and the payloads from
+# the original message.
+
+
+def get_parts_vector(parts):
+    """
+    Yield the number of subparts declared by each entry in ``parts``.
+
+    Entries without a ``parts`` key count as 1.
+
+    :param parts: an iterable of dicts describing message parts.
+    :returns: a generator with one number per entry.
+    """
+    return (entry.get('parts', 1) for entry in parts)
+
+
+def get_payloads(msg):
+    """
+    Yield a (payload, headers) tuple for every part found walking ``msg``.
+
+    The headers are returned as a dict whose keys are the lowercased
+    header names.
+
+    :param msg: the parsed message; it has to provide ``walk()``.
+    :returns: a generator of (payload, headers dict) tuples.
+    """
+    return (
+        (part.get_payload(),
+         dict((str.lower(name), value) for name, value in part.items()))
+        for part in msg.walk())
+
+def get_body_phash_simple(payloads):
+    """
+    Get the hash of the first payload declared as text/plain.
+
+    The hashes of all the matching payloads are collected and handed to
+    ``first`` (imported from leap.mail.utils), whose result is returned.
+
+    :param payloads: an iterable of (payload, headers) tuples, where
+                     headers is a dict keyed by lowercased header name.
+    :returns: whatever ``first`` returns for the list of matching hashes.
+    """
+    # A part may carry no content-type header at all: default to an empty
+    # string so that the membership test does not blow up on None. Such
+    # parts are simply not considered as body candidates.
+    return first(
+        [get_hash(payload) for payload, headers in payloads
+         if "text/plain" in headers.get('content-type', '')])
+
+
+def get_body_phash_multi(payloads):
+    """
+    Get the hash of the body for a multipart message.
+
+    The selection criterion is currently the same as in
+    ``get_body_phash_simple``: the first text/plain payload found. The
+    former fallback repeated that very same search, so it could never
+    produce a different result and this just delegates instead.
+
+    :param payloads: an iterable of (payload, headers) tuples, where
+                     headers is a dict keyed by lowercased header name.
+    :returns: whatever ``get_body_phash_simple`` returns.
+    """
+    return get_body_phash_simple(payloads)
+
+def get_raw_docs(msg, parts):
+    """
+    Get the raw content documents for every non-multipart part of ``msg``.
+
+    On getting the raw docs, we get also some of the headers to be able to
+    index the content. Here we remove any mutable part, as the filename
+    in the content disposition.
+
+    :param msg: the parsed message; it has to provide ``walk()``.
+    :param parts: not used here, kept for interface compatibility.
+    :returns: a generator of dicts, one for each payload that is not a list.
+    """
+    return (
+        {"type": "cnt",  # type content they'll be
+         # in debug mode only the beginning of the payload is kept
+         "raw": payload if not DEBUG else payload[:100],
+         "phash": get_hash(payload),
+         # keep only what comes before the first ';', dropping
+         # parameters such as the filename
+         "content-disposition": first(headers.get(
+             'content-disposition', '').split(';')),
+         "content-type": headers.get(
+             'content-type', ''),
+         # the header is named Content-Transfer-Encoding; looking up
+         # 'content-transfer-type' always yielded the empty default
+         "content-transfer-encoding": headers.get(
+             'content-transfer-encoding', '')}
+        for payload, headers in get_payloads(msg)
+        if not isinstance(payload, list))
+
+
+def walk_msg_tree(parts, body_phash=None):
+    """
+    Take a list of interesting items of a message subparts structure,
+    and return a dict of dicts almost ready to be written to the content
+    documents that will be stored in Soledad.
+
+    It walks down the subparts in the parsed message tree, and collapses
+    the leaf documents into a wrapper document until no multipart
+    submessages are left. To achieve this, it iteratively calculates a
+    wrapper vector of all documents in the sequence that have more than one
+    part and have unitary documents to their right. To collapse a multipart,
+    take as many unitary documents as parts the submessage contains, and
+    replace the object in the sequence with the new wrapper document.
+
+    Wrappers are collapsed from right to left, so that the subparts of a
+    multipart have already been collapsed themselves by the time they are
+    picked by their container.
+
+    Note that ``parts`` (and the dicts it contains) is modified in place.
+
+    :param parts: A list of dicts containing the interesting properties for
+                  the message structure. Normally this has been generated by
+                  doing a message walk.
+    :type parts: list of dicts.
+    :param body_phash: the payload hash of the body part, to be included
+                       in the outer content doc for convenience.
+    :type body_phash: basestring or None
+    :returns: the outer content document, with the subparts nested under
+              the ``part_map`` key.
+    :rtype: dict
+    """
+    # parts vector
+    pv = list(get_parts_vector(parts))
+
+    # Headers of the only subpart, used below for the special case of a
+    # multipart with a single part. It has to be always bound: the special
+    # case is also reached by messages that do not have exactly two parts.
+    inner_headers = None
+    if len(parts) == 2:
+        inner_headers = parts[1].get("headers", None)
+
+    if DEBUG:
+        # single-argument call form: same output with the print statement
+        # and with the print function.
+        print("parts vector:  %s\n" % (pv,))
+
+    # wrappers vector
+    def getwv(pv):
+        return [current != 1 and following == 1
+                for current, following in zip(pv, pv[1:])]
+
+    wv = getwv(pv)
+
+    # do until no wrapper document is left
+    while any(wv):
+        # wrapper index: the rightmost one. Everything to its right is
+        # unitary, so the documents that follow it are exactly its own
+        # subparts. Picking the leftmost one could swallow a nested
+        # multipart that has not been collapsed yet.
+        wind = len(wv) - 1 - wv[::-1].index(True)
+        nsub = pv[wind]  # number of subparts to pick
+        start = wind + 1
+        slic = parts[start:start + nsub]  # slice with subparts
+
+        cwra = {
+            "multi": True,
+            "part_map": dict((index + 1, part)  # content wrapper
+                             for index, part in enumerate(slic)),
+            "headers": dict(parts[wind]['headers'])
+        }
+
+        # remove subparts and substitute wrapper. The removal is done by
+        # position: removing by value could drop an equal document placed
+        # earlier in the sequence.
+        del parts[start:start + nsub]
+        parts[wind] = cwra
+
+        # refresh vectors for this iteration
+        pv = list(get_parts_vector(parts))
+        wv = getwv(pv)
+
+    outer = parts[0]
+    outer.pop('headers')
+    if "part_map" not in outer:
+        # we have a multipart with 1 part only, so kind of fix it
+        # although it would be prettier if I take this special case at
+        # the beginning of the walk.
+        pdoc = {"multi": True,
+                "part_map": {1: outer}}
+        outer["multi"] = False
+        if not outer.get("phash", None):
+            outer["phash"] = body_phash
+        outer["headers"] = inner_headers
+    else:
+        pdoc = outer
+    pdoc["body"] = body_phash
+    return pdoc