summaryrefslogtreecommitdiff
path: root/src/leap/mail/walk.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/leap/mail/walk.py')
-rw-r--r--src/leap/mail/walk.py212
1 files changed, 212 insertions, 0 deletions
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py
new file mode 100644
index 0000000..f747377
--- /dev/null
+++ b/src/leap/mail/walk.py
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+# walk.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Utilities for walking along a message tree.
+"""
+import os
+
+from pycryptopp.hash import sha256
+
+from leap.mail.utils import first
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+if DEBUG:
+ get_hash = lambda s: sha256.SHA256(s).hexdigest()[:10]
+else:
+ get_hash = lambda s: sha256.SHA256(s).hexdigest()
+
+
+"""
+Get interesting message parts
+"""
+get_parts = lambda msg: [
+ {'multi': part.is_multipart(),
+ 'ctype': part.get_content_type(),
+ 'size': len(part.as_string()),
+ 'parts': len(part.get_payload())
+ if isinstance(part.get_payload(), list)
+ else 1,
+ 'headers': part.items(),
+ 'phash': get_hash(part.get_payload())
+ if not part.is_multipart() else None}
+ for part in msg.walk()]
+
+"""
+Utility lambda functions for getting the parts vector and the
+payloads from the original message.
+"""
+
+get_parts_vector = lambda parts: (x.get('parts', 1) for x in parts)
+get_payloads = lambda msg: ((x.get_payload(),
+ dict(((str.lower(k), v) for k, v in (x.items()))))
+ for x in msg.walk())
+
+get_body_phash_simple = lambda payloads: first(
+ [get_hash(payload) for payload, headers in payloads
+ if payloads])
+
+get_body_phash_multi = lambda payloads: (first(
+ [get_hash(payload) for payload, headers in payloads
+ if payloads
+ and "text/plain" in headers.get('content-type', '')])
+ or get_body_phash_simple(payloads))
+
+"""
+When getting the raw docs, we also get some of the headers, to be able to
+index the content. Here we remove any mutable part, such as the filename
+in the content disposition.
+"""
+
+get_raw_docs = lambda msg, parts: (
+ {"type": "cnt", # type content they'll be
+ "raw": payload if not DEBUG else payload[:100],
+ "phash": get_hash(payload),
+ "content-disposition": first(headers.get(
+ 'content-disposition', '').split(';')),
+ "content-type": headers.get(
+ 'content-type', ''),
+ "content-transfer-encoding": headers.get(
+ 'content-transfer-type', '')}
+ for payload, headers in get_payloads(msg)
+ if not isinstance(payload, list))
+
+
+def walk_msg_tree(parts, body_phash=None):
+ """
+ Take a list of interesting items of a message subparts structure,
+ and return a dict of dicts almost ready to be written to the content
+ documents that will be stored in Soledad.
+
+ It walks down the subparts in the parsed message tree, and collapses
+ the leaf docuents into a wrapper document until no multipart submessages
+ are left. To achieve this, it iteratively calculates a wrapper vector of
+ all documents in the sequence that have more than one part and have unitary
+ documents to their right. To collapse a multipart, take as many
+ unitary documents as parts the submessage contains, and replace the object
+ in the sequence with the new wrapper document.
+
+ :param parts: A list of dicts containing the interesting properties for
+ the message structure. Normally this has been generated by
+ doing a message walk.
+ :type parts: list of dicts.
+ :param body_phash: the payload hash of the body part, to be included
+ in the outer content doc for convenience.
+ :type body_phash: basestring or None
+ """
+ PART_MAP = "part_map"
+ MULTI = "multi"
+ HEADERS = "headers"
+ PHASH = "phash"
+ BODY = "body"
+
+ # parts vector
+ pv = list(get_parts_vector(parts))
+
+ inner_headers = parts[1].get(HEADERS, None) if (
+ len(parts) == 2) else None
+
+ if DEBUG:
+ print "parts vector: ", pv
+ print
+
+ # wrappers vector
+ getwv = lambda pv: [True if pv[i] != 1 and pv[i + 1] == 1 else False
+ for i in range(len(pv) - 1)]
+ wv = getwv(pv)
+
+ # do until no wrapper document is left
+ while any(wv):
+ wind = wv.index(True) # wrapper index
+ nsub = pv[wind] # number of subparts to pick
+ slic = parts[wind + 1:wind + 1 + nsub] # slice with subparts
+
+ cwra = {
+ MULTI: True,
+ PART_MAP: dict((index + 1, part) # content wrapper
+ for index, part in enumerate(slic)),
+ HEADERS: dict(parts[wind][HEADERS])
+ }
+
+ # remove subparts and substitue wrapper
+ map(lambda i: parts.remove(i), slic)
+ parts[wind] = cwra
+
+ # refresh vectors for this iteration
+ pv = list(get_parts_vector(parts))
+ wv = getwv(pv)
+
+ if all(x == 1 for x in pv):
+ # special case in the rightmost element
+ main_pmap = parts[0].get(PART_MAP, None)
+ if main_pmap is not None:
+ last_part = max(main_pmap.keys())
+ main_pmap[last_part][PART_MAP] = {}
+ for partind in range(len(pv) - 1):
+ print partind+1, len(parts)
+ main_pmap[last_part][PART_MAP][partind] = parts[partind + 1]
+
+ outer = parts[0]
+ outer.pop(HEADERS)
+ if not PART_MAP in outer:
+ # we have a multipart with 1 part only, so kind of fix it
+ # although it would be prettier if I take this special case at
+ # the beginning of the walk.
+ pdoc = {MULTI: True,
+ PART_MAP: {1: outer}}
+ pdoc[PART_MAP][1][MULTI] = False
+ if not pdoc[PART_MAP][1].get(PHASH, None):
+ pdoc[PART_MAP][1][PHASH] = body_phash
+ if inner_headers:
+ pdoc[PART_MAP][1][HEADERS] = inner_headers
+ else:
+ pdoc = outer
+ pdoc[BODY] = body_phash
+ return pdoc
+
+"""
+Groucho Marx: Now pay particular attention to this first clause, because it's
+ most important. There's the party of the first part shall be
+ known in this contract as the party of the first part. How do you
+ like that, that's pretty neat eh?
+
+Chico Marx: No, that's no good.
+Groucho Marx: What's the matter with it?
+
+Chico Marx: I don't know, let's hear it again.
+Groucho Marx: So the party of the first part shall be known in this contract as
+ the party of the first part.
+
+Chico Marx: Well it sounds a little better this time.
+Groucho Marx: Well, it grows on you. Would you like to hear it once more?
+
+Chico Marx: Just the first part.
+Groucho Marx: All right. It says the first part of the party of the first part
+ shall be known in this contract as the first part of the party of
+ the first part, shall be known in this contract - look, why
+ should we quarrel about a thing like this, we'll take it right
+ out, eh?
+
+Chico Marx: Yes, it's too long anyhow. Now what have we got left?
+Groucho Marx: Well I've got about a foot and a half. Now what's the matter?
+
+Chico Marx: I don't like the second party either.
+"""
+
+"""
+I feel you deserved it after reading the above and trying to debug your problem ;)
+"""