Second stage of the new year's storage rewrite.

* documents of only three types: * flags * headers * content * add algorithm for walking the parsed message tree. * treat special cases like a multipart with a single part. * modify add_msg to use the walk routine * modify twisted interfaces to use the new storage schema. * tests for different multipart cases * fix multipart detection typo in the fetch This is a merge proposal for the 0.5.0-rc3. known bugs ---------- Some things are still know not to work well at this point (some cases of multipart messages do not display the bodies). IMAP server also is left in a bad internal state after a logout/login.
author: Kali Kaneko <kali@leap.se> 2014-01-07 11:34:08 -0400
committer: Kali Kaneko <kali@leap.se> 2014-01-08 20:53:47 -0400
commit: da9b210c4bd16d67b4b47b299df7913b2d2f1066 (patch)
tree: 09e289c024573025c79fff515f9e3ae7b56655d7
parent: ac87c723f493737941246947b0833394bb186836 (diff)
9 files changed, 768 insertions, 348 deletions
diff --git a/mail/src/leap/mail/decorators.py b/mail/src/leap/mail/decorators.py
index 024a1399..d5eac97a 100644
--- a/mail/src/leap/mail/decorators.py
+++ b/mail/src/leap/mail/decorators.py
@@ -27,6 +27,11 @@ from twisted.internet.threads import deferToThread
 logger = logging.getLogger(__name__)
 
 
+# TODO
+# Should write a helper to be able to pass a timeout argument.
+# See this answer: http://stackoverflow.com/a/19019648/1157664
+# And the notes by glyph and jpcalderone
+
 def deferred(f):
     """
     Decorator, for deferring methods to Threads.
diff --git a/mail/src/leap/mail/imap/fetch.py b/mail/src/leap/mail/imap/fetch.py
index cb200be9..604a2ea1 100644
--- a/mail/src/leap/mail/imap/fetch.py
+++ b/mail/src/leap/mail/imap/fetch.py
@@ -404,7 +404,7 @@ class LeapIncomingMail(object):
         """
         log.msg('decrypting multipart encrypted msg')
         msg = copy.deepcopy(msg)
-        self._multipart_sanity_check(msg)
+        self._msg_multipart_sanity_check(msg)
 
         # parse message and get encrypted content
         pgpencmsg = msg.get_payload()[1]
diff --git a/mail/src/leap/mail/imap/fields.py b/mail/src/leap/mail/imap/fields.py
index bc536fec..2545adfa 100644
--- a/mail/src/leap/mail/imap/fields.py
+++ b/mail/src/leap/mail/imap/fields.py
@@ -43,17 +43,17 @@ class WithMsgFields(object):
 
     # headers
     HEADERS_KEY = "headers"
-    NUM_PARTS_KEY = "numparts"
-    PARTS_MAP_KEY = "partmap"
     DATE_KEY = "date"
     SUBJECT_KEY = "subject"
-
-    # attachment
-    PART_NUMBER_KEY = "part"
-    RAW_KEY = "raw"
+    # XXX DELETE-ME
+    #NUM_PARTS_KEY = "numparts"  # not needed?!
+    PARTS_MAP_KEY = "part_map"
+    BODY_KEY = "body"  # link to phash of body
 
     # content
-    BODY_KEY = "body"
+    LINKED_FROM_KEY = "lkf"
+    RAW_KEY = "raw"
+    CTYPE_KEY = "ctype"
 
     # Mailbox specific keys
     CLOSED_KEY = "closed"
@@ -65,11 +65,13 @@ class WithMsgFields(object):
     # Document Type, for indexing
     TYPE_KEY = "type"
     TYPE_MBOX_VAL = "mbox"
-    TYPE_MESSAGE_VAL = "msg"
     TYPE_FLAGS_VAL = "flags"
     TYPE_HEADERS_VAL = "head"
-    TYPE_ATTACHMENT_VAL = "attach"
-    # should add also a headers val
+    TYPE_CONTENT_VAL = "cnt"
+
+    # XXX DEPRECATE
+    #TYPE_MESSAGE_VAL = "msg"
+    #TYPE_ATTACHMENT_VAL = "attach"
 
     INBOX_VAL = "inbox"
 
@@ -109,7 +111,6 @@ class WithMsgFields(object):
     MBOX_VAL = TYPE_MBOX_VAL
     CHASH_VAL = CONTENT_HASH_KEY
     PHASH_VAL = PAYLOAD_HASH_KEY
-    PART_VAL = PART_NUMBER_KEY
 
     INDEXES = {
         # generic
@@ -122,8 +123,7 @@ class WithMsgFields(object):
 
         # content, headers doc
         TYPE_C_HASH_IDX: [KTYPE, CHASH_VAL],
-        # attachment docs
-        TYPE_C_HASH_PART_IDX: [KTYPE, CHASH_VAL, PART_VAL],
+
         # attachment payload dedup
         TYPE_P_HASH_IDX: [KTYPE, PHASH_VAL],
 
diff --git a/mail/src/leap/mail/imap/messages.py b/mail/src/leap/mail/imap/messages.py
index bfe913c4..37e43118 100644
--- a/mail/src/leap/mail/imap/messages.py
+++ b/mail/src/leap/mail/imap/messages.py
@@ -33,6 +33,7 @@ from zope.proxy import sameProxiedObjects
 from leap.common.check import leap_assert, leap_assert_type
 from leap.common.decorators import memoized_method
 from leap.common.mail import get_email_charset
+from leap.mail import walk
 from leap.mail.utils import first
 from leap.mail.decorators import deferred
 from leap.mail.imap.index import IndexedDB
@@ -43,65 +44,58 @@ from leap.mail.messageflow import IMessageConsumer, MessageProducer
 logger = logging.getLogger(__name__)
 
 
-class MessageBody(object):
-    """
-    IMessagePart implementor for the main
-    body of a multipart message.
-
-    Excusatio non petita: see the interface documentation.
-    """
+# TODO ------------------------------------------------------------
 
-    implements(imap4.IMessagePart)
-
-    def __init__(self, fdoc, bdoc):
-        self._fdoc = fdoc
-        self._bdoc = bdoc
-
-    def getSize(self):
-        return len(self._bdoc.content[fields.BODY_KEY])
+# [ ] Add linked-from info.
+# [ ] Delete incoming mail only after successful write!
+# [ ] Remove UID from syncable db. Store only those indexes locally.
+# [ ] Send patch to twisted for bug in imap4.py:5717  (content-type can be
+#     none? lower-case?)
 
-    def getBodyFile(self):
-        fd = StringIO.StringIO()
-
-        if self._bdoc:
-            body = self._bdoc.content[fields.BODY_KEY]
-        else:
-            body = ""
-        charset = self._get_charset(body)
-        try:
-            body = body.encode(charset)
-        except (UnicodeEncodeError, UnicodeDecodeError) as e:
-            logger.error("Unicode error {0}".format(e))
-            body = body.encode(charset, 'replace')
-        fd.write(body)
-        fd.seek(0)
-        return fd
-
-    @memoized_method
-    def _get_charset(self, stuff):
-        return get_email_charset(unicode(stuff))
-
-    def getHeaders(self, negate, *names):
-        return {}
+def lowerdict(_dict):
+    """
+    Return a dict with the keys in lowercase.
 
-    def isMultipart(self):
-        return False
+    :param _dict: the dict to convert
+    :rtype: dict
+    """
+    return dict((key.lower(), value)
+                for key, value in _dict.items())
 
-    def getSubPart(self, part):
-        return None
 
+class MessagePart(object):
+    """
+    IMessagePart implementor.
+    It takes a subpart message and is able to find
+    the inner parts.
 
-class MessageAttachment(object):
+    Excusatio non petita: see the interface documentation.
+    """
 
     implements(imap4.IMessagePart)
 
-    def __init__(self, msg):
+    def __init__(self, soledad, part_map):
         """
-        Initializes the messagepart with a Message instance.
-        :param msg: a message instance
-        :type msg: Message
+        Initializes the MessagePart.
+
+        :param part_map: a dictionary containing the parts map for this
+                         message
+        :type part_map: dict
         """
-        self._msg = msg
+        # TODO
+        # It would be good to pass the uid/mailbox also
+        # for references while debugging.
+
+        # We have a problem on bulk moves, and is
+        # that when the fetch on the new mailbox is done
+        # the parts maybe are not complete.
+        # So we should be able to fail with empty
+        # docs until we solve that. The ideal would be
+        # to gather the results of the deferred operations
+        # to signal the operation is complete.
+        #leap_assert(part_map, "part map dict cannot be null")
+        self._soledad = soledad
+        self._pmap = part_map
 
     def getSize(self):
         """
@@ -110,9 +104,12 @@ class MessageAttachment(object):
         :return: size of the message, in octets
         :rtype: int
         """
-        if not self._msg:
+        if not self._pmap:
             return 0
-        return len(self._msg.as_string())
+        size = self._pmap.get('size', None)
+        if not size:
+            logger.error("Message part cannot find size in the partmap")
+        return size
 
     def getBodyFile(self):
         """
@@ -122,24 +119,91 @@ class MessageAttachment(object):
         :rtype: StringIO
         """
         fd = StringIO.StringIO()
-        if self._msg:
-            body = self._msg.get_payload()
+        if self._pmap:
+            multi = self._pmap.get('multi')
+            if not multi:
+                phash = self._pmap.get("phash", None)
+            else:
+                pmap = self._pmap.get('part_map')
+                first_part = pmap.get('1', None)
+                if first_part:
+                    phash = first_part['phash']
+
+            if not phash:
+                logger.warning("Could not find phash for this subpart!")
+                payload = str("")
+            else:
+                payload = self._get_payload_from_document(phash)
+
         else:
-            logger.debug("Empty message!")
-            body = ""
-
-        # XXX should only do the dance if we're sure it's
-        # content/text-plain!!!
-        #charset = self._get_charset(body)
-        #try:
-            #body = body.encode(charset)
-        #except (UnicodeEncodeError, UnicodeDecodeError) as e:
-            #logger.error("Unicode error {0}".format(e))
-            #body = body.encode(charset, 'replace')
-        fd.write(body)
+            logger.warning("Message with no part_map!")
+            payload = str("")
+
+        if payload:
+            #headers = self.getHeaders(True)
+            #headers = lowerdict(headers)
+            #content_type = headers.get('content-type', "")
+            content_type = self._get_ctype_from_document(phash)
+            charset_split = content_type.split('charset=')
+            # XXX fuck all this, use a regex!
+            if len(charset_split) > 1:
+                charset = charset_split[1]
+                if charset:
+                    charset = charset.strip()
+            else:
+                charset = None
+            if not charset:
+                charset = self._get_charset(payload)
+            try:
+                payload = payload.encode(charset)
+            except (UnicodeEncodeError, UnicodeDecodeError) as e:
+                logger.error("Unicode error {0}".format(e))
+                payload = payload.encode(charset, 'replace')
+
+        fd.write(payload)
         fd.seek(0)
         return fd
 
+    # TODO cache the phash retrieval
+    def _get_payload_from_document(self, phash):
+        """
+        Gets the message payload from the content document.
+
+        :param phash: the payload hash to retrieve by.
+        :type phash: basestring
+        """
+        cdocs = self._soledad.get_from_index(
+            fields.TYPE_P_HASH_IDX,
+            fields.TYPE_CONTENT_VAL, str(phash))
+
+        cdoc = first(cdocs)
+        if not cdoc:
+            logger.warning(
+                "Could not find the content doc "
+                "for phash %s" % (phash,))
+        payload = cdoc.content.get(fields.RAW_KEY, "")
+        return payload
+
+    # TODO cache the pahash retrieval
+    def _get_ctype_from_document(self, phash):
+        """
+        Gets the content-type from the content document.
+
+        :param phash: the payload hash to retrieve by.
+        :type phash: basestring
+        """
+        cdocs = self._soledad.get_from_index(
+            fields.TYPE_P_HASH_IDX,
+            fields.TYPE_CONTENT_VAL, str(phash))
+
+        cdoc = first(cdocs)
+        if not cdoc:
+            logger.warning(
+                "Could not find the content doc "
+                "for phash %s" % (phash,))
+        ctype = cdoc.content.get('ctype', "")
+        return ctype
+
     @memoized_method
     def _get_charset(self, stuff):
         # TODO put in a common class with LeapMessage
@@ -150,8 +214,6 @@ class MessageAttachment(object):
         :type stuff: basestring
         :returns: charset
         """
-        # XXX existential doubt 1. wouldn't be smarter to
-        # peek into the mail headers?
         # XXX existential doubt 2. shouldn't we make the scope
         # of the decorator somewhat more persistent?
         # ah! yes! and put memory bounds.
@@ -172,9 +234,17 @@ class MessageAttachment(object):
         :return: A mapping of header field names to header field values
         :rtype: dict
         """
-        if not self._msg:
+        if not self._pmap:
+            logger.warning("No pmap in Subpart!")
             return {}
-        headers = dict(self._msg.items())
+        headers = dict(self._pmap.get("headers", []))
+
+        # twisted imap server expects *some* headers to be lowercase
+        # We could use a CaseInsensitiveDict here...
+        headers = dict(
+            (str(key), str(value)) if key.lower() != "content-type"
+            else (str(key.lower()), str(value))
+            for (key, value) in headers.items())
 
         names = map(lambda s: s.upper(), names)
         if negate:
@@ -187,13 +257,18 @@ class MessageAttachment(object):
             map(str, (key, val)) for
             key, val in headers.items()
             if cond(key)]
-        return dict(filter_by_cond)
+        filtered = dict(filter_by_cond)
+        return filtered
 
     def isMultipart(self):
         """
         Return True if this message is multipart.
         """
-        return self._msg.is_multipart()
+        if not self._pmap:
+            logger.warning("Could not get part map!")
+            return False
+        multi = self._pmap.get("multi", False)
+        return multi
 
     def getSubPart(self, part):
         """
@@ -206,10 +281,30 @@ class MessageAttachment(object):
         :rtype: Any object implementing C{IMessagePart}.
         :return: The specified sub-part.
         """
-        return self._msg.get_payload()
+        if not self.isMultipart():
+            raise TypeError
+        sub_pmap = self._pmap.get("part_map", {})
+        try:
+            part_map = sub_pmap[str(part + 1)]
+        except KeyError:
+            logger.debug("getSubpart for %s: KeyError" % (part,))
+            raise IndexError
+
+        # XXX check for validity
+        return MessagePart(self._soledad, part_map)
 
 
 class LeapMessage(fields, MailParser, MBoxParser):
+    """
+    The main representation of a message.
+
+    It indexes the messages in one mailbox by a combination
+    of uid+mailbox name.
+    """
+
+    # TODO this has to change.
+    # Should index primarily by chash, and keep a local-lonly
+    # UID table.
 
     implements(imap4.IMessage)
 
@@ -268,6 +363,8 @@ class LeapMessage(fields, MailParser, MBoxParser):
         """
         An accessor to the body document.
         """
+        if not self._hdoc:
+            return None
         if not self.__bdoc:
             self.__bdoc = self._get_body_doc()
         return self.__bdoc
@@ -320,6 +417,11 @@ class LeapMessage(fields, MailParser, MBoxParser):
         log.msg('setting flags: %s' % (self._uid))
 
         doc = self._fdoc
+        if not doc:
+            logger.warning(
+                "Could not find FDOC for %s:%s while setting flags!" %
+                (self._mbox, self._uid))
+            return
         doc.content[self.FLAGS_KEY] = flags
         doc.content[self.SEEN_KEY] = self.SEEN_FLAG in flags
         doc.content[self.RECENT_KEY] = self.RECENT_FLAG in flags
@@ -384,16 +486,25 @@ class LeapMessage(fields, MailParser, MBoxParser):
         fd = StringIO.StringIO()
         bdoc = self._bdoc
         if bdoc:
-            body = self._bdoc.content.get(self.BODY_KEY, "")
+            body = str(self._bdoc.content.get(self.RAW_KEY, ""))
         else:
-            body = ""
+            logger.warning("No BDOC found for message.")
+            body = str("")
+
+        # XXX not needed, isn't it? ---- ivan?
+        #if bdoc:
+            #content_type = bdoc.content.get('content-type', "")
+            #charset = content_type.split('charset=')[1]
+            #if charset:
+                #charset = charset.strip()
+            #if not charset:
+                #charset = self._get_charset(body)
+            #try:
+                #body = str(body.encode(charset))
+            #except (UnicodeEncodeError, UnicodeDecodeError) as e:
+                #logger.error("Unicode error {0}".format(e))
+                #body = str(body.encode(charset, 'replace'))
 
-        charset = self._get_charset(body)
-        try:
-            body = body.encode(charset)
-        except (UnicodeEncodeError, UnicodeDecodeError) as e:
-            logger.error("Unicode error {0}".format(e))
-            body = body.encode(charset, 'replace')
         fd.write(body)
         fd.seek(0)
         return fd
@@ -407,8 +518,7 @@ class LeapMessage(fields, MailParser, MBoxParser):
         :type stuff: basestring
         :returns: charset
         """
-        # XXX existential doubt 1. wouldn't be smarter to
-        # peek into the mail headers?
+        # TODO get from subpart headers
         # XXX existential doubt 2. shouldn't we make the scope
         # of the decorator somewhat more persistent?
         # ah! yes! and put memory bounds.
@@ -447,9 +557,11 @@ class LeapMessage(fields, MailParser, MBoxParser):
         :return: A mapping of header field names to header field values
         :rtype: dict
         """
+        # TODO split in smaller methods
         headers = self._get_headers()
         if not headers:
-            return {'content-type': ''}
+            logger.warning("No headers found")
+            return {str('content-type'): str('')}
 
         names = map(lambda s: s.upper(), names)
         if negate:
@@ -457,16 +569,20 @@ class LeapMessage(fields, MailParser, MBoxParser):
         else:
             cond = lambda key: key.upper() in names
 
-        head = copy.deepcopy(dict(headers.items()))
+        if isinstance(headers, list):
+            headers = dict(headers)
 
-            # twisted imap server expects headers to be lowercase
-        head = dict(
-            (str(key), map(str, value)) if key.lower() != "content-type"
-            else (str(key.lower(), map(str, value)))
-            for (key, value) in head.items())
+        # twisted imap server expects *some* headers to be lowercase
+        # XXX refactor together with MessagePart method
+        headers = dict(
+            (str(key), str(value)) if key.lower() != "content-type"
+            else (str(key.lower()), str(value))
+            for (key, value) in headers.items())
 
         # unpack and filter original dict by negate-condition
-        filter_by_cond = [(key, val) for key, val in head.items() if cond(key)]
+        filter_by_cond = [(key, val) for key, val
+                          in headers.items() if cond(key)]
+
         return dict(filter_by_cond)
 
     def _get_headers(self):
@@ -474,7 +590,9 @@ class LeapMessage(fields, MailParser, MBoxParser):
         Return the headers dict for this message.
         """
         if self._hdoc is not None:
-            return self._hdoc.content.get(self.HEADERS_KEY, {})
+            headers = self._hdoc.content.get(self.HEADERS_KEY, {})
+            return headers
+
         else:
             logger.warning(
                 "No HEADERS doc for msg %s:%s" % (
@@ -486,12 +604,13 @@ class LeapMessage(fields, MailParser, MBoxParser):
         Return True if this message is multipart.
         """
         if self._fdoc:
-            return self._fdoc.content.get(self.MULTIPART_KEY, False)
+            is_multipart = self._fdoc.content.get(self.MULTIPART_KEY, False)
+            return is_multipart
         else:
             logger.warning(
                 "No FLAGS doc for msg %s:%s" % (
-                    self.mbox,
-                    self.uid))
+                    self._mbox,
+                    self._uid))
 
     def getSubPart(self, part):
         """
@@ -504,27 +623,33 @@ class LeapMessage(fields, MailParser, MBoxParser):
         :rtype: Any object implementing C{IMessagePart}.
         :return: The specified sub-part.
         """
-        logger.debug("Getting subpart: %s" % part)
         if not self.isMultipart():
             raise TypeError
-
-        if part == 0:
-            # Let's get the first part, which
-            # is really the body.
-            return MessageBody(self._fdoc, self._bdoc)
-
-        attach_doc = self._get_attachment_doc(part)
-        if not attach_doc:
-            # so long and thanks for all the fish
-            logger.debug("...not today")
+        try:
+            pmap_dict = self._get_part_from_parts_map(part + 1)
+        except KeyError:
+            logger.debug("getSubpart for %s: KeyError" % (part,))
             raise IndexError
-        msg_part = self._get_parsed_msg(attach_doc.content[self.RAW_KEY])
-        return MessageAttachment(msg_part)
+        return MessagePart(self._soledad, pmap_dict)
 
     #
     # accessors
     #
 
+    def _get_part_from_parts_map(self, part):
+        """
+        Get a part map from the headers doc
+
+        :raises: KeyError if key does not exist
+        :rtype: dict
+        """
+        if not self._hdoc:
+            logger.warning("Tried to get part but no HDOC found!")
+            return None
+
+        pmap = self._hdoc.content.get(fields.PARTS_MAP_KEY, {})
+        return pmap[str(part)]
+
     def _get_flags_doc(self):
         """
         Return the document that keeps the flags for this
@@ -550,63 +675,16 @@ class LeapMessage(fields, MailParser, MBoxParser):
         Return the document that keeps the body for this
         message.
         """
-        body_docs = self._soledad.get_from_index(
-            fields.TYPE_C_HASH_IDX,
-            fields.TYPE_MESSAGE_VAL, str(self._chash))
-        return first(body_docs)
-
-    def _get_num_parts(self):
-        """
-        Return the number of parts for a multipart message.
-        """
-        if not self.isMultipart():
-            raise TypeError(
-                "Tried to get num parts in a non-multipart message")
-        if not self._hdoc:
-            return None
-        return self._hdoc.content.get(fields.NUM_PARTS_KEY, 2)
-
-    def _get_attachment_doc(self, part):
-        """
-        Return the document that keeps the headers for this
-        message.
-
-        :param part: the part number for the multipart message.
-        :type part: int
-        """
-        if not self._hdoc:
-            return None
-        try:
-            phash = self._hdoc.content[self.PARTS_MAP_KEY][str(part)]
-        except KeyError:
-            # this is the remnant of a debug session until
-            # I found that the index is actually a string...
-            # It should be safe to just raise the KeyError now,
-            # but leaving it here while the blood is fresh...
-            logger.warning("We expected a phash in the "
-                           "index %s, but noone found" % (part, ))
-            logger.debug(self._hdoc.content[self.PARTS_MAP_KEY])
+        body_phash = self._hdoc.content.get(
+            fields.BODY_KEY, None)
+        if not body_phash:
+            logger.warning("No body phash for this document!")
             return None
-        attach_docs = self._soledad.get_from_index(
+        body_docs = self._soledad.get_from_index(
             fields.TYPE_P_HASH_IDX,
-            fields.TYPE_ATTACHMENT_VAL, str(phash))
-
-        # The following is true for the fist owner.
-        # We could use this relationship to flag the "owner"
-        # and orphan when we delete it.
+            fields.TYPE_CONTENT_VAL, str(body_phash))
 
-        #attach_docs = self._soledad.get_from_index(
-            #fields.TYPE_C_HASH_PART_IDX,
-            #fields.TYPE_ATTACHMENT_VAL, str(self._chash), str(part))
-        return first(attach_docs)
-
-    def _get_raw_msg(self):
-        """
-        Return the raw msg.
-        :rtype: basestring
-        """
-        # TODO deprecate this.
-        return self._bdoc.content.get(self.RAW_KEY, '')
+        return first(body_docs)
 
     def __getitem__(self, key):
         """
@@ -658,27 +736,22 @@ class LeapMessage(fields, MailParser, MBoxParser):
         """
         Remove all docs associated with this message.
         """
-        # XXX this would ve more efficient if we can just pass
-        # a sequence of uids.
-
         # XXX For the moment we are only removing the flags and headers
         # docs. The rest we leave there polluting your hard disk,
         # until we think about a good way of deorphaning.
         # Maybe a crawler of unreferenced docs.
 
+        # XXX implement elijah's idea of using a PUT document as a
+        # token to ensure consistency in the removal.
+
         uid = self._uid
-        print "removing...", uid
 
         fd = self._get_flags_doc()
-        hd = self._get_headers_doc()
+        #hd = self._get_headers_doc()
         #bd = self._get_body_doc()
         #docs = [fd, hd, bd]
 
-        docs = [fd, hd]
-
-        #for pn in range(self._get_num_parts()[1:]):
-            #ad = self._get_attachment_doc(pn)
-            #docs.append(ad)
+        docs = [fd]
 
         for d in filter(None, docs):
             try:
@@ -703,8 +776,7 @@ SoledadWriterPayload = namedtuple(
 
 SoledadWriterPayload.CREATE = 1
 SoledadWriterPayload.PUT = 2
-SoledadWriterPayload.BODY_CREATE = 3
-SoledadWriterPayload.ATTACHMENT_CREATE = 4
+SoledadWriterPayload.CONTENT_CREATE = 3
 
 
 class SoledadDocWriter(object):
@@ -723,6 +795,38 @@ class SoledadDocWriter(object):
         """
         self._soledad = soledad
 
+    def _get_call_for_item(self, item):
+        """
+        Return the proper call type for a given item.
+
+        :param item: one of the types defined under the
+                     attributes of SoledadWriterPayload
+        :type item: int
+        """
+        call = None
+        payload = item.payload
+
+        if item.mode == SoledadWriterPayload.CREATE:
+            call = self._soledad.create_doc
+        elif (item.mode == SoledadWriterPayload.CONTENT_CREATE
+              and not self._content_does_exist(payload)):
+                call = self._soledad.create_doc
+        elif item.mode == SoledadWriterPayload.PUT:
+            call = self._soledad.put_doc
+        return call
+
+    def _process(self, queue):
+        """
+        Return the item and the proper call type for the next
+        item in the queue if any.
+
+        :param queue: the queue from where we'll pick item.
+        :type queue: Queue
+        """
+        item = queue.get()
+        call = self._get_call_for_item(item)
+        return item, call
+
     def consume(self, queue):
         """
         Creates a new document in soledad db.
@@ -733,24 +837,10 @@ class SoledadDocWriter(object):
         """
         empty = queue.empty()
         while not empty:
-            item = queue.get()
-            call = None
-            payload = item.payload
-
-            if item.mode == SoledadWriterPayload.CREATE:
-                call = self._soledad.create_doc
-            elif item.mode == SoledadWriterPayload.BODY_CREATE:
-                if not self._body_does_exist(payload):
-                    call = self._soledad.create_doc
-            elif item.mode == SoledadWriterPayload.ATTACHMENT_CREATE:
-                if not self._attachment_does_exist(payload):
-                    call = self._soledad.create_doc
-            elif item.mode == SoledadWriterPayload.PUT:
-                call = self._soledad.put_doc
-
-            # XXX delete?
+            item, call = self._process(queue)
 
             if call:
+                # XXX should handle the delete case
                 # should handle errors
                 try:
                     call(item.payload)
@@ -779,33 +869,10 @@ class SoledadDocWriter(object):
     Stack.
     """
 
-    def _body_does_exist(self, doc):
+    def _content_does_exist(self, doc):
         """
-        Check whether we already have a body payload with this hash in our
-        database.
-
-        :param doc: tentative body document
-        :type doc: dict
-        :returns: True if that happens, False otherwise.
-        """
-        if not doc:
-            return False
-        chash = doc[fields.CONTENT_HASH_KEY]
-        body_docs = self._soledad.get_from_index(
-            fields.TYPE_C_HASH_IDX,
-            fields.TYPE_MESSAGE_VAL, str(chash))
-        if not body_docs:
-            return False
-        if len(body_docs) != 1:
-            logger.warning("Found more than one copy of chash %s!"
-                           % (chash,))
-        logger.debug("Found body doc with that hash! Skipping save!")
-        return True
-
-    def _attachment_does_exist(self, doc):
-        """
-        Check whether we already have an attachment payload with this hash
-        in our database.
+        Check whether we already have a content document for a payload
+        with this hash in our database.
 
         :param doc: tentative body document
         :type doc: dict
@@ -816,7 +883,7 @@ class SoledadDocWriter(object):
         phash = doc[fields.PAYLOAD_HASH_KEY]
         attach_docs = self._soledad.get_from_index(
             fields.TYPE_P_HASH_IDX,
-            fields.TYPE_ATTACHMENT_VAL, str(phash))
+            fields.TYPE_CONTENT_VAL, str(phash))
         if not attach_docs:
             return False
 
@@ -840,15 +907,15 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
     # into a template for the class.
     FLAGS_DOC = "FLAGS"
     HEADERS_DOC = "HEADERS"
-    ATTACHMENT_DOC = "ATTACHMENT"
-    BODY_DOC = "BODY"
+    CONTENT_DOC = "CONTENT"
 
     templates = {
 
         FLAGS_DOC: {
             fields.TYPE_KEY: fields.TYPE_FLAGS_VAL,
-            fields.UID_KEY: 1,
+            fields.UID_KEY: 1,  # XXX moe to a local table
             fields.MBOX_KEY: fields.INBOX_VAL,
+            fields.CONTENT_HASH_KEY: "",
 
             fields.SEEN_KEY: False,
             fields.RECENT_KEY: True,
@@ -862,35 +929,28 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
             fields.TYPE_KEY: fields.TYPE_HEADERS_VAL,
             fields.CONTENT_HASH_KEY: "",
 
+            fields.DATE_KEY: "",
+            fields.SUBJECT_KEY: "",
+
             fields.HEADERS_KEY: {},
-            fields.NUM_PARTS_KEY: 0,
             fields.PARTS_MAP_KEY: {},
-            fields.DATE_KEY: "",
-            fields.SUBJECT_KEY: ""
         },
 
-        ATTACHMENT_DOC: {
-            fields.TYPE_KEY: fields.TYPE_ATTACHMENT_VAL,
-            fields.PART_NUMBER_KEY: 0,
-            fields.CONTENT_HASH_KEY:  "",
+        CONTENT_DOC: {
+            fields.TYPE_KEY: fields.TYPE_CONTENT_VAL,
             fields.PAYLOAD_HASH_KEY: "",
+            fields.LINKED_FROM_KEY: [],
+            fields.CTYPE_KEY: "",  # should index by this too
 
-            fields.RAW_KEY: ""
-        },
-
-        BODY_DOC: {
-            fields.TYPE_KEY: fields.TYPE_MESSAGE_VAL,
-            fields.CONTENT_HASH_KEY: "",
-
-            fields.BODY_KEY: "",
-
-            # this should not be needed,
-            # but let's keep the raw msg for some time
-            # until we are sure we can reconstruct
-            # the original msg from our disection.
+            # should only get inmutable headers parts
+            # (for indexing)
+            fields.HEADERS_KEY: {},
             fields.RAW_KEY: "",
+            fields.PARTS_MAP_KEY: {},
+            fields.HEADERS_KEY: {},
+            fields.MULTIPART_KEY: False,
+        },
 
-        }
     }
 
     def __init__(self, mbox=None, soledad=None):
@@ -938,128 +998,124 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
             raise TypeError("Improper type passed to _get_empty_doc")
         return copy.deepcopy(self.templates[_type])
 
-    @deferred
-    def add_msg(self, raw, subject=None, flags=None, date=None, uid=1):
+    def _do_parse(self, raw):
         """
-        Creates a new message document.
+        Parse raw message and return it along with
+        relevant information about its outer level.
 
         :param raw: the raw message
-        :type raw: str
-
-        :param subject: subject of the message.
-        :type subject: str
-
-        :param flags: flags
-        :type flags: list
-
-        :param date: the received date for the message
-        :type date: str
-
-        :param uid: the message uid for this mailbox
-        :type uid: int
+        :type raw: StringIO or basestring
+        :return: msg, chash, size, multi
+        :rtype: tuple
         """
-        # TODO: split in smaller methods
-        logger.debug('adding message')
-        if flags is None:
-            flags = tuple()
-        leap_assert_type(flags, tuple)
-
-        # docs for flags, headers, and body
-        fd, hd, bd = map(
-            lambda t: self._get_empty_doc(t),
-            (self.FLAGS_DOC, self.HEADERS_DOC, self.BODY_DOC))
-
         msg = self._get_parsed_msg(raw)
-        headers = defaultdict(list)
-        for k, v in msg.items():
-            headers[k].append(v)
-        raw_str = msg.as_string()
         chash = self._get_hash(msg)
+        size = len(msg.as_string())
         multi = msg.is_multipart()
+        return msg, chash, size, multi
 
-        attaches = []
-        inner_parts = []
-
-        if multi:
-            # XXX should walk down recursively
-            # in a better way.  but fixing this quick
-            # to have an rc.
-            # XXX should pick the content-type in txt
-            body = first(msg.get_payload()).get_payload()
-            if isinstance(body, list):
-                # allowing one nesting level for now...
-                body, rest = body[0].get_payload(), body[1:]
-                for p in rest:
-                    inner_parts.append(p)
-        else:
-            body = msg.get_payload()
-        logger.debug("adding msg with uid %s (multipart:%s)" % (
-            uid, multi))
+    def _populate_flags(self, flags, uid, chash, size, multi):
+        """
+        Return a flags doc.
+
+        XXX Missing DOC -----------
+        """
+        fd = self._get_empty_doc(self.FLAGS_DOC)
 
-        # flags doc ---------------------------------------
         fd[self.MBOX_KEY] = self.mbox
         fd[self.UID_KEY] = uid
         fd[self.CONTENT_HASH_KEY] = chash
+        fd[self.SIZE_KEY] = size
         fd[self.MULTIPART_KEY] = multi
-        fd[self.SIZE_KEY] = len(raw_str)
         if flags:
             fd[self.FLAGS_KEY] = map(self._stringify, flags)
             fd[self.SEEN_KEY] = self.SEEN_FLAG in flags
             fd[self.DEL_KEY] = self.DELETED_FLAG in flags
             fd[self.RECENT_KEY] = True  # set always by default
+        return fd
 
-        # headers doc ----------------------------------------
+    def _populate_headr(self, msg, chash, subject, date):
+        """
+        Return a headers doc.
+
+        XXX Missing DOC -----------
+        """
+        headers = defaultdict(list)
+        for k, v in msg.items():
+            headers[k].append(v)
+
+        # "fix" for repeated headers.
+        for k, v in headers.items():
+            newline = "\n%s: " % (k,)
+            headers[k] = newline.join(v)
+
+        hd = self._get_empty_doc(self.HEADERS_DOC)
         hd[self.CONTENT_HASH_KEY] = chash
         hd[self.HEADERS_KEY] = headers
 
-        print "headers"
-        import pprint
-        pprint.pprint(headers)
-
         if not subject and self.SUBJECT_FIELD in headers:
             hd[self.SUBJECT_KEY] = first(headers[self.SUBJECT_FIELD])
         else:
             hd[self.SUBJECT_KEY] = subject
+
         if not date and self.DATE_FIELD in headers:
             hd[self.DATE_KEY] = first(headers[self.DATE_FIELD])
         else:
             hd[self.DATE_KEY] = date
-        if multi:
-            # XXX fix for multipart nested case
-            hd[self.NUM_PARTS_KEY] = len(msg.get_payload())
-
-        # body doc
-        bd[self.CONTENT_HASH_KEY] = chash
-        bd[self.BODY_KEY] = body
-        # XXX in an ideal world, we would not need to save a copy of the
-        # raw message. But we'll keep it until we can be sure that
-        # we can rebuild the original message from the parts.
-        bd[self.RAW_KEY] = raw_str
+        return hd
+
+    @deferred
+    def add_msg(self, raw, subject=None, flags=None, date=None, uid=1):
+        """
+        Creates a new message document.
+
+        :param raw: the raw message
+        :type raw: str
+
+        :param subject: subject of the message.
+        :type subject: str
+
+        :param flags: flags
+        :type flags: list
+
+        :param date: the received date for the message
+        :type date: str
+
+        :param uid: the message uid for this mailbox
+        :type uid: int
+        """
+        # TODO signal that we can delete the original message!-----
+        # when all the processing is done.
+
+        # TODO add the linked-from info !
+
+        logger.debug('adding message')
+        if flags is None:
+            flags = tuple()
+        leap_assert_type(flags, tuple)
+
+        # parse
+        msg, chash, size, multi = self._do_parse(raw)
+
+        fd = self._populate_flags(flags, uid, chash, size, multi)
+        hd = self._populate_headr(msg, chash, subject, date)
+
+        parts = walk.get_parts(msg)
+        body_phash_fun = [walk.get_body_phash_simple,
+                          walk.get_body_phash_multi][int(multi)]
+        body_phash = body_phash_fun(walk.get_payloads(msg))
+        parts_map = walk.walk_msg_tree(parts, body_phash=body_phash)
+
+        # add parts map to header doc
+        # (body, multi, part_map)
+        for key in parts_map:
+            hd[key] = parts_map[key]
+        del parts_map
 
         docs = [fd, hd]
+        cdocs = walk.get_raw_docs(msg, parts)
 
-        # attachment docs
-        if multi:
-            outer_parts = msg.get_payload()
-            parts = outer_parts + inner_parts
-
-            # skip first part, we already got it in body
-            to_attach = ((i, m) for i, m in enumerate(parts) if i > 0)
-            for index, part_msg in to_attach:
-                att_doc = self._get_empty_doc(self.ATTACHMENT_DOC)
-                att_doc[self.PART_NUMBER_KEY] = index
-                att_doc[self.CONTENT_HASH_KEY] = chash
-                phash = self._get_hash(part_msg)
-                att_doc[self.PAYLOAD_HASH_KEY] = phash
-                att_doc[self.RAW_KEY] = part_msg.as_string()
-
-                # keep a pointer to the payload hash in the
-                # headers doc, under the parts_map
-                hd[self.PARTS_MAP_KEY][str(index)] = phash
-                attaches.append(att_doc)
-
-        # Saving ... -------------------------------
-        # ok, there we go...
+        # Saving
         logger.debug('enqueuing message docs for write')
         ptuple = SoledadWriterPayload
 
@@ -1067,14 +1123,12 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
         for doc in docs:
             self.soledad_writer.put(ptuple(
                 mode=ptuple.CREATE, payload=doc))
-        # second, try to create body doc.
-        self.soledad_writer.put(ptuple(
-            mode=ptuple.BODY_CREATE, payload=bd))
+
         # and last, but not least, try to create
-        # attachment docs if not already there.
-        for at in attaches:
+        # content docs if not already there.
+        for cd in cdocs:
             self.soledad_writer.put(ptuple(
-                mode=ptuple.ATTACHMENT_CREATE, payload=at))
+                mode=ptuple.CONTENT_CREATE, payload=cd))
 
     def _remove_cb(self, result):
         return result
diff --git a/mail/src/leap/mail/imap/service/imap.py b/mail/src/leap/mail/imap/service/imap.py
index 26e14c33..234996d1 100644
--- a/mail/src/leap/mail/imap/service/imap.py
+++ b/mail/src/leap/mail/imap/service/imap.py
@@ -87,6 +87,8 @@ class LeapIMAPServer(imap4.IMAP4Server):
         :param line: the line from the server, without the line delimiter.
         :type line: str
         """
+        print "RECV: STATE (%s)" % self.state
+
         if "login" in line.lower():
             # avoid to log the pass, even though we are using a dummy auth
             # by now.
diff --git a/mail/src/leap/mail/imap/tests/rfc822.multi-minimal.message b/mail/src/leap/mail/imap/tests/rfc822.multi-minimal.message
new file mode 100644
index 00000000..582297c6
--- /dev/null
+++ b/mail/src/leap/mail/imap/tests/rfc822.multi-minimal.message
@@ -0,0 +1,16 @@
+Content-Type: multipart/mixed; boundary="===============6203542367371144092=="
+MIME-Version: 1.0
+Subject: [TEST] 010 - Inceptos cum lorem risus congue
+From: testmailbitmaskspam@gmail.com
+To: test_c5@dev.bitmask.net
+
+--===============6203542367371144092==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+Howdy from python!
+The subject: [TEST] 010 - Inceptos cum lorem risus congue
+Current date & time: Wed Jan  8 16:36:21 2014
+Trying to attach: []
+--===============6203542367371144092==--
diff --git a/mail/src/leap/mail/imap/tests/rfc822.plain.message b/mail/src/leap/mail/imap/tests/rfc822.plain.message
new file mode 100644
index 00000000..fc627c3a
--- /dev/null
+++ b/mail/src/leap/mail/imap/tests/rfc822.plain.message
@@ -0,0 +1,66 @@
+From pyar-bounces@python.org.ar Wed Jan  8 14:46:02 2014
+Return-Path: <pyar-bounces@python.org.ar>
+X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on spamd2.riseup.net
+X-Spam-Level: **
+X-Spam-Pyzor: Reported 0 times.
+X-Spam-Status: No, score=2.1 required=8.0 tests=AM_TRUNCATED,CK_419SIZE,
+	CK_NAIVER_NO_DNS,CK_NAIVE_NO_DNS,ENV_FROM_DIFF0,HAS_REPLY_TO,LINK_NR_TOP,
+	NO_REAL_NAME,RDNS_NONE,RISEUP_SPEAR_C shortcircuit=no autolearn=disabled
+	version=3.3.2
+Delivered-To: kali@leap.se
+Received: from mx1.riseup.net (mx1-pn.riseup.net [10.0.1.33])
+	(using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits))
+	(Client CN "*.riseup.net", Issuer "Gandi Standard SSL CA" (not verified))
+	by vireo.riseup.net (Postfix) with ESMTPS id 6C39A8F
+	for <kali@leap.se>; Wed,  8 Jan 2014 18:46:02 +0000 (UTC)
+Received: from pyar.usla.org.ar (unknown [190.228.30.157])
+	by mx1.riseup.net (Postfix) with ESMTP id F244C533F4
+	for <kali@leap.se>; Wed,  8 Jan 2014 10:46:01 -0800 (PST)
+Received: from [127.0.0.1] (localhost [127.0.0.1])
+	by pyar.usla.org.ar (Postfix) with ESMTP id CC51D26A4F
+	for <kali@leap.se>; Wed,  8 Jan 2014 15:46:00 -0300 (ART)
+MIME-Version: 1.0
+Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+From: pyar-request@python.org.ar
+To: kali@leap.se
+Subject: confirm 0e47e4342e4d42508e8c283175b05b3377148ac2
+Reply-To: pyar-request@python.org.ar
+Auto-Submitted: auto-replied
+Message-ID: <mailman.245.1389206759.1579.pyar@python.org.ar>
+Date: Wed, 08 Jan 2014 15:45:59 -0300
+Precedence: bulk
+X-BeenThere: pyar@python.org.ar
+X-Mailman-Version: 2.1.15
+List-Id: Python Argentina <pyar.python.org.ar>
+X-List-Administrivia: yes
+Errors-To: pyar-bounces@python.org.ar
+Sender: "pyar" <pyar-bounces@python.org.ar>
+X-Virus-Scanned: clamav-milter 0.97.8 at mx1
+X-Virus-Status: Clean
+
+Mailing list subscription confirmation notice for mailing list pyar
+
+We have received a request de kaliyuga@riseup.net for subscription of
+your email address, "kaliyuga@riseup.net", to the pyar@python.org.ar
+mailing list.  To confirm that you want to be added to this mailing
+list, simply reply to this message, keeping the Subject: header
+intact.  Or visit this web page:
+
+    http://listas.python.org.ar/confirm/pyar/0e47e4342e4d42508e8c283175b05b=
+3377148ac2
+
+
+Or include the following line -- and only the following line -- in a
+message to pyar-request@python.org.ar:
+
+    confirm 0e47e4342e4d42508e8c283175b05b3377148ac2
+
+Note that simply sending a `reply' to this message should work from
+most mail readers, since that usually leaves the Subject: line in the
+right form (additional "Re:" text in the Subject: is okay).
+
+If you do not wish to be subscribed to this list, please simply
+disregard this message.  If you think you are being maliciously
+subscribed to the list, or have any other questions, send them to
+pyar-owner@python.org.ar.
diff --git a/mail/src/leap/mail/imap/tests/walktree.py b/mail/src/leap/mail/imap/tests/walktree.py
new file mode 100644
index 00000000..1626f657
--- /dev/null
+++ b/mail/src/leap/mail/imap/tests/walktree.py
@@ -0,0 +1,117 @@
+#t -*- coding: utf-8 -*-
+# walktree.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+Tests for the walktree module.
+"""
+import os
+from email import parser
+
+from leap.mail import walk as W
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+p = parser.Parser()
+
+# TODO pass an argument of the type of message
+
+##################################################
+# Input from hell
+
+#msg = p.parse(open('rfc822.multi-signed.message'))
+#msg = p.parse(open('rfc822.plain.message'))
+msg = p.parse(open('rfc822.multi-minimal.message'))
+DO_CHECK = False
+#################################################
+
+parts = W.get_parts(msg)
+
+if DEBUG:
+    def trim(item):
+        item = item[:10]
+    [trim(part["phash"]) for part in parts if part.get('phash', None)]
+
+raw_docs = list(W.get_raw_docs(msg, parts))
+
+body_phash_fun = [W.get_body_phash_simple,
+                  W.get_body_phash_multi][int(msg.is_multipart())]
+body_phash = body_phash_fun(W.get_payloads(msg))
+parts_map = W.walk_msg_tree(parts, body_phash=body_phash)
+
+
+# TODO add missing headers!
+expected = {
+    'body': '1ddfa80485',
+    'multi': True,
+    'part_map': {
+        1: {
+            'headers': {'Content-Disposition': 'inline',
+                        'Content-Type': 'multipart/mixed; '
+                        'boundary="z0eOaCaDLjvTGF2l"'},
+            'multi': True,
+            'part_map': {1: {'ctype': 'text/plain',
+                             'headers': [
+                                 ('Content-Type',
+                                  'text/plain; charset=utf-8'),
+                                 ('Content-Disposition',
+                                  'inline'),
+                                 ('Content-Transfer-Encoding',
+                                  'quoted-printable')],
+                             'multi': False,
+                             'parts': 1,
+                             'phash': '1ddfa80485',
+                             'size': 206},
+                         2: {'ctype': 'text/plain',
+                             'headers': [('Content-Type',
+                                          'text/plain; charset=us-ascii'),
+                                         ('Content-Disposition',
+                                          'attachment; '
+                                          'filename="attach.txt"')],
+                             'multi': False,
+                             'parts': 1,
+                             'phash': '7a94e4d769',
+                             'size': 133},
+                         3: {'ctype': 'application/octet-stream',
+                             'headers': [('Content-Type',
+                                          'application/octet-stream'),
+                                         ('Content-Disposition',
+                                          'attachment; filename="hack.ico"'),
+                                         ('Content-Transfer-Encoding',
+                                          'base64')],
+                             'multi': False,
+                             'parts': 1,
+                             'phash': 'c42cccebbd',
+                             'size': 12736}}},
+        2: {'ctype': 'application/pgp-signature',
+            'headers': [('Content-Type', 'application/pgp-signature')],
+            'multi': False,
+            'parts': 1,
+            'phash': '8f49fbf749',
+            'size': 877}}}
+
+if DEBUG and DO_CHECK:
+    # TODO turn this into a proper unittest
+    assert(parts_map == expected)
+    print "Structure: OK"
+
+
+import pprint
+print
+print "RAW DOCS"
+pprint.pprint(raw_docs)
+print
+print "PARTS MAP"
+pprint.pprint(parts_map)
diff --git a/mail/src/leap/mail/walk.py b/mail/src/leap/mail/walk.py
new file mode 100644
index 00000000..820b8c77
--- /dev/null
+++ b/mail/src/leap/mail/walk.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+# walk.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+Utilities for walking along a message tree.
+"""
+import hashlib
+import os
+
+from leap.mail.utils import first
+
+DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
+
+if DEBUG:
+    get_hash = lambda s: hashlib.sha256(s).hexdigest()[:10]
+else:
+    get_hash = lambda s: hashlib.sha256(s).hexdigest()
+
+
+"""
+Get interesting message parts
+"""
+get_parts = lambda msg: [
+    {'multi': part.is_multipart(),
+     'ctype': part.get_content_type(),
+     'size': len(part.as_string()),
+     'parts': len(part.get_payload())
+        if isinstance(part.get_payload(), list)
+        else 1,
+     'headers': part.items(),
+     'phash': get_hash(part.get_payload())
+        if not part.is_multipart() else None}
+    for part in msg.walk()]
+
+"""
+Utility lambda functions for getting the parts vector and the
+payloads from the original message.
+"""
+
+get_parts_vector = lambda parts: (x.get('parts', 1) for x in parts)
+get_payloads = lambda msg: ((x.get_payload(),
+                             dict(((str.lower(k), v) for k, v in (x.items()))))
+                            for x in msg.walk())
+
+get_body_phash_simple = lambda payloads: first(
+    [get_hash(payload) for payload, headers in payloads
+     if "text/plain" in headers.get('content-type')])
+
+get_body_phash_multi = lambda payloads: (first(
+    [get_hash(payload) for payload, headers in payloads
+     if "text/plain" in headers.get('content-type')])
+    or get_body_phash_simple(payloads))
+
+"""
+On getting the raw docs, we get also some of the headers to be able to
+index the content. Here we remove any mutable part, as the the filename
+in the content disposition.
+"""
+
+get_raw_docs = lambda msg, parts: (
+    {"type": "cnt",  # type content they'll be
+     "raw": payload if not DEBUG else payload[:100],
+     "phash": get_hash(payload),
+     "content-disposition": first(headers.get(
+         'content-disposition', '').split(';')),
+     "content-type": headers.get(
+         'content-type', ''),
+     "content-transfer-encoding": headers.get(
+         'content-transfer-type', '')}
+    for payload, headers in get_payloads(msg)
+    if not isinstance(payload, list))
+
+
+def walk_msg_tree(parts, body_phash=None):
+    """
+    Take a list of interesting items of a message subparts structure,
+    and return a dict of dicts almost ready to be written to the content
+    documents that will be stored in Soledad.
+
+    It walks down the subparts in the parsed message tree, and collapses
+    the leaf docuents into a wrapper document until no multipart submessages
+    are left. To achieve this, it iteratively calculates a wrapper vector of
+    all documents in the sequence that have more than one part and have unitary
+    documents to their right. To collapse a multipart, take as many
+    unitary documents as parts the submessage contains, and replace the object
+    in the sequence with the new wrapper document.
+
+    :param parts: A list of dicts containing the interesting properties for
+                  the message structure. Normally this has been generated by
+                  doing a message walk.
+    :type parts: list of dicts.
+    :param body_phash: the payload hash of the body part, to be included
+                       in the outer content doc for convenience.
+    :type body_phash: basestring or None
+    """
+    # parts vector
+    pv = list(get_parts_vector(parts))
+
+    if len(parts) == 2:
+        inner_headers = parts[1].get("headers", None)
+
+    if DEBUG:
+        print "parts vector: ", pv
+        print
+
+    # wrappers vector
+    getwv = lambda pv: [True if pv[i] != 1 and pv[i + 1] == 1 else False
+                        for i in range(len(pv) - 1)]
+    wv = getwv(pv)
+
+    # do until no wrapper document is left
+    while any(wv):
+        wind = wv.index(True)  # wrapper index
+        nsub = pv[wind]  # number of subparts to pick
+        slic = parts[wind + 1:wind + 1 + nsub]  # slice with subparts
+
+        cwra = {
+            "multi": True,
+            "part_map": dict((index + 1, part)  # content wrapper
+                             for index, part in enumerate(slic)),
+            "headers": dict(parts[wind]['headers'])
+        }
+
+        # remove subparts and substitue wrapper
+        map(lambda i: parts.remove(i), slic)
+        parts[wind] = cwra
+
+        # refresh vectors for this iteration
+        pv = list(get_parts_vector(parts))
+        wv = getwv(pv)
+
+    outer = parts[0]
+    outer.pop('headers')
+    if not "part_map" in outer:
+        # we have a multipart with 1 part only, so kind of fix it
+        # although it would be prettier if I take this special case at
+        # the beginning of the walk.
+        pdoc = {"multi": True,
+                "part_map": {1: outer}}
+        pdoc["part_map"][1]["multi"] = False
+        if not pdoc["part_map"][1].get("phash", None):
+            pdoc["part_map"][1]["phash"] = body_phash
+        pdoc["part_map"][1]["headers"] = inner_headers
+    else:
+        pdoc = outer
+    pdoc["body"] = body_phash
+    return pdoc
author	Kali Kaneko <kali@leap.se>	2014-01-07 11:34:08 -0400
committer	Kali Kaneko <kali@leap.se>	2014-01-08 20:53:47 -0400
commit	da9b210c4bd16d67b4b47b299df7913b2d2f1066 (patch)
tree	09e289c024573025c79fff515f9e3ae7b56655d7
parent	ac87c723f493737941246947b0833394bb186836 (diff)