summary refs log tree commit diff
path: root/mail/src/leap
diff options
context:
space:
mode:
Diffstat (limited to 'mail/src/leap')
-rw-r--r-- mail/src/leap/mail/adaptors/soledad.py 19
-rw-r--r-- mail/src/leap/mail/mail.py 66
-rw-r--r-- mail/src/leap/mail/tests/rfc822.bounce.message 152
-rw-r--r-- mail/src/leap/mail/tests/test_walk.py 81
-rw-r--r-- mail/src/leap/mail/walk.py 211
5 files changed, 298 insertions, 231 deletions
diff --git a/mail/src/leap/mail/adaptors/soledad.py b/mail/src/leap/mail/adaptors/soledad.py
index 298d017..46c5a2c 100644
--- a/mail/src/leap/mail/adaptors/soledad.py
+++ b/mail/src/leap/mail/adaptors/soledad.py
@@ -1185,14 +1185,13 @@ def _split_into_parts(raw):
# TODO populate Default FLAGS/TAGS (unseen?)
# TODO seed properly the content_docs with defaults??
- msg, parts, chash, multi = _parse_msg(raw)
+ msg, chash, multi = _parse_msg(raw)
size = len(msg.as_string())
- body_phash = walk.get_body_phash(msg)
-
- parts_map = walk.walk_msg_tree(parts, body_phash=body_phash)
- cdocs_list = list(walk.get_raw_docs(msg, parts))
+ parts_map = walk.get_tree(msg)
+ cdocs_list = list(walk.get_raw_docs(msg))
cdocs_phashes = [c['phash'] for c in cdocs_list]
+ body_phash = walk.get_body_phash(msg)
mdoc = _build_meta_doc(chash, cdocs_phashes)
fdoc = _build_flags_doc(chash, size, multi)
@@ -1206,10 +1205,9 @@ def _split_into_parts(raw):
def _parse_msg(raw):
msg = message_from_string(raw)
- parts = walk.get_parts(msg)
chash = walk.get_hash(raw)
multi = msg.is_multipart()
- return msg, parts, chash, multi
+ return msg, chash, multi
def _build_meta_doc(chash, cdocs_phashes):
@@ -1220,6 +1218,7 @@ def _build_meta_doc(chash, cdocs_phashes):
_mdoc.fdoc = constants.FDOCID.format(mbox_uuid=INBOX_NAME, chash=chash)
_mdoc.hdoc = constants.HDOCID.format(chash=chash)
_mdoc.cdocs = [constants.CDOCID.format(phash=p) for p in cdocs_phashes]
+
return _mdoc.serialize()
@@ -1259,8 +1258,8 @@ def _build_headers_doc(msg, chash, body_phash, parts_map):
copy_attr(lower_headers, "date", _hdoc)
hdoc = _hdoc.serialize()
- # add parts map to header doc
- # (body, multi, part_map)
+ # add some of the attr from the parts map to header doc
for key in parts_map:
- hdoc[key] = parts_map[key]
+ if key in ('body', 'multi', 'part_map'):
+ hdoc[key] = parts_map[key]
return stringify_parts_map(hdoc)
diff --git a/mail/src/leap/mail/mail.py b/mail/src/leap/mail/mail.py
index d3659de..2fde3a1 100644
--- a/mail/src/leap/mail/mail.py
+++ b/mail/src/leap/mail/mail.py
@@ -36,7 +36,6 @@ from twisted.python import log
from leap.common.check import leap_assert_type
from leap.common.events import emit_async, catalog
-from leap.common.mail import get_email_charset
from leap.mail.adaptors.soledad import SoledadMailAdaptor
from leap.mail.constants import INBOX_NAME
@@ -124,33 +123,6 @@ def _unpack_headers(headers_dict):
return headers_l
-def _get_index_for_cdoc(part_map, cdocs_dict):
- """
- Get, if possible, the index for a given content-document matching the phash
- of the passed part_map.
-
- This is used when we are initializing a MessagePart, because we just pass a
- reference to the parent message cdocs container and we need to iterate
- through the cdocs to figure out which content-doc matches the phash of the
- part we're currently rendering.
-
- It is also used when recursing through a nested multipart message, because
- in the initialization of the child MessagePart we pass a dictionary only
- for the referenced cdoc.
-
- :param part_map: a dict describing the mapping of the parts for the current
- message-part.
- :param cdocs: a dict of content-documents, 0-indexed.
- :rtype: int
- """
- phash = part_map.get('phash', None)
- if phash:
- for i, cdoc_wrapper in cdocs_dict.items():
- if cdoc_wrapper.phash == phash:
- return i
- return None
-
-
class MessagePart(object):
# TODO This class should be better abstracted from the data model.
# TODO support arbitrarily nested multiparts (right now we only support
@@ -159,7 +131,7 @@ class MessagePart(object):
Represents a part of a multipart MIME Message.
"""
- def __init__(self, part_map, cdocs={}, nested=False):
+ def __init__(self, part_map, cdocs=None, nested=False):
"""
:param part_map: a dictionary mapping the subparts for
this MessagePart (1-indexed).
@@ -178,13 +150,12 @@ class MessagePart(object):
:param cdocs: optional, a reference to the top-level dict of wrappers
for content-docs (1-indexed).
"""
+ if cdocs is None:
+ cdocs = {}
self._pmap = part_map
self._cdocs = cdocs
self._nested = nested
- index = _get_index_for_cdoc(part_map, self._cdocs) or 1
- self._index = index
-
def get_size(self):
"""
Size of the body, in octets.
@@ -199,13 +170,10 @@ class MessagePart(object):
def get_body_file(self):
payload = ""
pmap = self._pmap
+
multi = pmap.get('multi')
if not multi:
- payload = self._get_payload(self._index)
- else:
- # XXX uh, multi also... should recurse.
- # This needs to be implemented in a more general and elegant way.
- raise NotImplementedError
+ payload = self._get_payload(pmap.get('phash'))
if payload:
payload = _encode_payload(payload)
@@ -220,33 +188,19 @@ class MessagePart(object):
def get_subpart(self, part):
if not self.is_multipart():
raise TypeError
-
sub_pmap = self._pmap.get("part_map", {})
- # XXX BUG --- workaround. Subparts with more than 1 subparts
- # need to get the requested index for the subpart decremented.
- # Off-by-one error, should investigate which is the real reason and
- # fix it, this is only a quick workaround.
- num_parts = self._pmap.get("parts", 0)
- if num_parts > 1:
- part = part - 1
- # -------------------------------------------------------------
-
try:
part_map = sub_pmap[str(part)]
except KeyError:
log.msg("getSubpart for %s: KeyError" % (part,))
raise IndexError
+ return MessagePart(part_map, cdocs=self._cdocs, nested=True)
- cdoc_index = _get_index_for_cdoc(part_map, self._cdocs)
- cdoc = self._cdocs.get(cdoc_index, {})
-
- return MessagePart(part_map, cdocs={1: cdoc}, nested=True)
-
- def _get_payload(self, index):
- cdoc_wrapper = self._cdocs.get(index, None)
- if cdoc_wrapper:
- return cdoc_wrapper.raw
+ def _get_payload(self, phash):
+ for cdocw in self._cdocs.values():
+ if cdocw.phash == phash:
+ return cdocw.raw
return ""
diff --git a/mail/src/leap/mail/tests/rfc822.bounce.message b/mail/src/leap/mail/tests/rfc822.bounce.message
new file mode 100644
index 0000000..7a51ac0
--- /dev/null
+++ b/mail/src/leap/mail/tests/rfc822.bounce.message
@@ -0,0 +1,152 @@
+Return-Path: <>
+X-Original-To: yoyo@dev.pixelated-project.org
+Delivered-To: a6973ec1af0a6d1e2a1e4db4ff85f6c2@deliver.local
+Received: by dev1.dev.pixelated-project.org (Postfix)
+ id 92CEA83164; Thu, 16 Jun 2016 14:53:34 +0200 (CEST)
+Date: Thu, 16 Jun 2016 14:53:34 +0200 (CEST)
+From: MAILER-DAEMON@dev1.dev.pixelated-project.org (Mail Delivery System)
+Subject: Undelivered Mail Returned to Sender
+To: yoyo@dev.pixelated-project.org
+Auto-Submitted: auto-replied
+MIME-Version: 1.0
+Content-Type: multipart/report; report-type=delivery-status;
+ boundary="8F60183010.1466081614/dev1.dev.pixelated-project.org"
+Message-Id: <20160616125334.92CEA83164@dev1.dev.pixelated-project.org>
+
+This is a MIME-encapsulated message.
+
+--8F60183010.1466081614/dev1.dev.pixelated-project.org
+Content-Description: Notification
+Content-Type: text/plain; charset=us-ascii
+
+This is the mail system at host dev1.dev.pixelated-project.org.
+
+I'm sorry to have to inform you that your message could not
+be delivered to one or more recipients. It's attached below.
+
+For further assistance, please send mail to postmaster.
+
+If you do so, please include this problem report. You can
+delete your own text from the attached returned message.
+
+ The mail system
+
+<nobody@leap.se>: host caribou.leap.se[176.53.69.122] said: 550 5.1.1
+ <nobody@leap.se>: Recipient address rejected: User unknown in virtual alias
+ table (in reply to RCPT TO command)
+
+--8F60183010.1466081614/dev1.dev.pixelated-project.org
+Content-Description: Delivery report
+Content-Type: message/delivery-status
+
+Reporting-MTA: dns; dev1.dev.pixelated-project.org
+X-Postfix-Queue-ID: 8F60183010
+X-Postfix-Sender: rfc822; yoyo@dev.pixelated-project.org
+Arrival-Date: Thu, 16 Jun 2016 14:53:33 +0200 (CEST)
+
+Final-Recipient: rfc822; nobody@leap.se
+Original-Recipient: rfc822;nobody@leap.se
+Action: failed
+Status: 5.1.1
+Remote-MTA: dns; caribou.leap.se
+Diagnostic-Code: smtp; 550 5.1.1 <nobody@leap.se>: Recipient address rejected:
+ User unknown in virtual alias table
+
+--8F60183010.1466081614/dev1.dev.pixelated-project.org
+Content-Description: Undelivered Message
+Content-Type: message/rfc822
+
+Return-Path: <yoyo@dev.pixelated-project.org>
+Received: from leap.mail-0.4.0rc1+111.g736ea86 (localhost [127.0.0.1])
+ (using TLSv1 with cipher ECDHE-RSA-AES128-SHA (128/128 bits))
+ (Client CN "yoyo@dev.pixelated-project.org", Issuer "Pixelated Project Root CA (client certificates only!)" (verified OK))
+ by dev1.dev.pixelated-project.org (Postfix) with ESMTPS id 8F60183010
+ for <nobody@leap.se>; Thu, 16 Jun 2016 14:53:33 +0200 (CEST)
+MIME-Version: 1.0
+Content-Type: multipart/signed; protocol="application/pgp-signature";
+ micalg="pgp-sha512"; boundary="===============7598747164910592838=="
+To: nobody@leap.se
+Subject: vrgg
+From: yoyo@dev.pixelated-project.org
+Date: Thu, 16 Jun 2016 13:53:32 -0000
+Message-Id: <20160616125332.16961.677041909.5@dev1.dev.pixelated-project.org>
+OpenPGP: id=CB546109E857BC34DFF2BCB3288870B39C400C24;
+ url="https://dev.pixelated-project.org/key/yoyo"; preference="signencrypt"
+
+--===============7598747164910592838==
+Content-Type: multipart/mixed; boundary="===============3737055506052708210=="
+MIME-Version: 1.0
+To: nobody@leap.se
+Subject: vrgg
+From: yoyo@dev.pixelated-project.org
+Date: Thu, 16 Jun 2016 13:53:32 -0000
+
+--===============3737055506052708210==
+Content-Type: text/plain; charset="utf-8"
+MIME-Version: 1.0
+Content-Transfer-Encoding: base64
+
+
+--===============3737055506052708210==
+Content-Type: application/pgp-keys
+MIME-Version: 1.0
+content-disposition: attachment; filename="yoyo@dev.pixelated-project.org-email-key.asc"
+Content-Transfer-Encoding: base64
+
+LS0tLS1CRUdJTiBQR1AgUFVCTElDIEtFWSBCTE9DSy0tLS0tCgptUUlOQkZkZ01BZ0JFQURIWWpU
+T20wcTdOT0lYVUpoTmlHVXg2S05OZ1M0Q0I2VlMvbGtab2UvYjZuRjdCSENmCkFnRVkxeFlxMkIv
+MzA3YzBtNTZWMEZvOWt2ZmZCUWhQckU5WG9rckI5blRlN1RsSDZUNTdiV09LSWMyMHhNSy8KSlVU
+djZ3UEpybjdLN0VyNEdxbzdrUmpWcFVBcWlBbGFxMkhVYllGd2NEMnBIb0VENmU2L01CZDBVUTFX
+b2s4QQpPNURDc2ZmeWhBZ0NFU1poK2w2VHlsVEJXYTJDTmJvUTl0SWtPZ0ZWTk9kTW9uWkxoTk1N
+Y0tIeU54dmF5bUdCCjhjQlRISVE2UWhGRThvR2JDRTdvczdZWWhyTmNmcUsyMzJJQllzTHNXN3Vk
+QmdwRTA0YkpwQWlvbW1zTHBCYmwKV0pCSjdqeEhwWmhJR3JGL1ltejNsSXpkbm9Mb3BSSWJyS0pC
+MmxaVDhIUHBlTVVJdVE2eHErd3RhQXFJVzlPTgo5U29uZWYyVU5BL3VseW1LeDRkOFhxbEwxY3hE
+aDFQU1E5YVlPcVg0RDlrMklmOXZmR2hET0xVMzR2Y2VFOC8vCnM1WGdTY2ZFbHg2SWlEVWZHdGx2
+aE5zQUM4TmhhUU1sOHJjUXVoRDA2RFdvSUowMVhkeFJVM2JSVVZkc0I1NWMKcXRWSHJMbVBVb256
+NU13MGFURzlTZzZudUlQcU1QOVNKRlBzbVpzR3ZYVnZWbCtSNzl1SFBlc25yWkoyTjZqOQpNaUth
+S045NFBhL1dJUnRoYWdzVnpHeHNtd2orTVZCRkZKRmh0TUtnNlFzYUsvbzRLNGJFR1ZLdWNXQk1i
+MnNxCldmd0o0SndTcHcrOHgyS3p6aXhWTllTZXhRdm9oMkc3RDRmRXdISDJzazNST3k3dTlldjhs
+bEVqUFFBUkFRQUIKdEQ5NWIzbHZRR1JsZGk1d2FYaGxiR0YwWldRdGNISnZhbVZqZEM1dmNtY2dQ
+SGx2ZVc5QVpHVjJMbkJwZUdWcwpZWFJsWkMxd2NtOXFaV04wTG05eVp6NkpBajRFRXdFQ0FDZ0ZB
+bGRnTUFnQ0d5OEZDUUhnTUZnR0N3a0lCd01DCkJoVUlBZ2tLQ3dRV0FnTUJBaDRCQWhlQUFBb0pF
+Q2lJY0xPY1FBd2s4djBQL2o2MmNyNjRUMlZPMVNKdHp1RlEKWjVpeVJsVFVHSGN2NW5hQjlUSDdI
+VVB3cTVwekZiTkg5SnhNRjVFRWtvZjdvV0hWeldWVTFBM1NDdzVNZ2FFbwppWTk5ZFBGNzdHazJ4
+ZEczNXZlWmIwWkg2WkVLdks1S042VXBucG5IeStxaVZVc1FLcE9DdUZKNkF0UlVEOTRJClJ2YnUv
+S1hsMHdORDlzVXFlYkJZN1BBSlRNY1RjLzVEdWpIT1Erd3VlSkFtaFZZbEozVnpZK1lBS2t5U05B
+QVoKZ3VVenNyUm5xQWU5SmU5TGgrcERpcVpHT2tEK1Z3b2kvRlVPQXJwbWFnNzZONTVjR3hiK2VG
+QUlzRHYrM1NNOQpjUDFyQkFON2lEaGgvdkdJeHgzMFlrYUlpMmpmcXg3VXUydnNwSXh6K0NsWWdi
+dm1wZm1CWmFqVzYzR0FsK3YvCngrby92eFZmVTMraTZ3alFjRS8vRTBTR2pvY3lQdUw0ZTZLNERy
+S3k2SHQycjBQckdHVFZ0dUZPaWU2dnVzbVcKL09sdVB1dGszU3o1S1BmRDFpRXBobmpPQ0pNRkZx
+Z2xRM1pPa3MweG00WGdwWW1ycnpQcXc1WWlzK1NEVjhobwp6anlrSzRWUlcrcC9IcUVzU29GQm5a
+MG5XSmg2Q1pZOExIeVNiMVJwaFlMRFpWd21JRXd1OW12Vm1ISVIyWUZVCllNZEx4UExiOFZNei9t
+QWpMb2Q0OGNSSzdSTzBSZ1RoMTUyK0VieXRGR3k5Y2tiS3VzRmJzVTFCQjN2MFJyUlUKenozTTcx
+T3hjcFhVQ0tpWlI0MEVYZnErSnVtZVFudm1wSWdZdUNaQkh5MzJwQUJuOHNDdUlrMStyQnp4bXdt
+bgp0WGh0K0RvNlExYXYyVjZYR00xV2xoKzEKPU8zaHEKLS0tLS1FTkQgUEdQIFBVQkxJQyBLRVkg
+QkxPQ0stLS0tLQo=
+--===============3737055506052708210==--
+
+--===============7598747164910592838==
+Content-Type: application/pgp-signature; name="signature.asc"
+MIME-Version: 1.0
+Content-Description: OpenPGP Digital Signature
+
+-----BEGIN PGP SIGNATURE-----
+
+iQIcBAABCgAGBQJXYqFNAAoJECiIcLOcQAwkDEIQAL67/XJXDv+lusoy18jr7Ony
+WQEP0pIRLp4GywGpH3dAITFAkuamO4VX3QEdVGjOHNoaT8VkSVWf9mnsYLl+Mh2v
+1OIwMv0u8WyVtrcxyXijIznnJv8X1RgyCzpUJcmOh04VZcDyxKbnFHWSDMfJ4Jtq
+qnXDONcfEeT8pwrGjP5qzTgcF/irG3w5svyQjEtj6kycddYtqUc9Hx3cMaRIzsHg
+kuUzznSzU/6P0Z345q/kXyYvU9rlcsP9vogrsqL2ueLwYSipxUJQUrRWG82FYoCo
+PAKNdGIt0xl2gEW+xWZkJqFarPiUFCx//+bVBelKrqj6rjwbj+E7mHJW318JYVHQ
+en3Smv7pEWlT4hZHXnoe8ng6TAvKzQjf7/bUxq2JpKSycp2hDO3Qz3Tv+kc+jC/r
+5UDWe/flR+syq8lAQTRSn6057g3BgDG2RtAwsjedg1aTFSrljSxbKlK4vsj5Muek
+Olq9+MUdMFSE3Jj/JC2COcS3rlt/Qt+JLDYXKahU3CodaSgF2dobikDe1bW0/QNS
+7O4Ng2PK0pA416RCFRUgPXerUnMGiWAiq7BoRHeym9y7fkHYhIYGpPVKXJ6t67y5
+JjvuzwfwG8SZTp4Wy2pg1Mr6znm6uVBxUDxTHyP3BjciI1zpEigOIg9UwJ9nCDxL
+uUGz4VqipNKbkpRkjLLW
+=3IaF
+-----END PGP SIGNATURE-----
+
+--===============7598747164910592838==--
+
+--8F60183010.1466081614/dev1.dev.pixelated-project.org--
diff --git a/mail/src/leap/mail/tests/test_walk.py b/mail/src/leap/mail/tests/test_walk.py
new file mode 100644
index 0000000..826ec10
--- /dev/null
+++ b/mail/src/leap/mail/tests/test_walk.py
@@ -0,0 +1,81 @@
+"""
+Tests for leap.mail.walk module
+"""
+import os.path
+from email.parser import Parser
+
+from leap.mail import walk
+
+CORPUS = {
+ 'simple': 'rfc822.message',
+ 'multimin': 'rfc822.multi-minimal.message',
+ 'multisigned': 'rfc822.multi-signed.message',
+ 'bounced': 'rfc822.bounce.message',
+}
+
+_here = os.path.dirname(__file__)
+_parser = Parser()
+
+
+# tests
+
+
+def test_simple_mail():
+ msg = _parse('simple')
+ tree = walk.get_tree(msg)
+ assert len(tree['part_map']) == 0
+ assert tree['ctype'] == 'text/plain'
+ assert tree['multi'] is False
+
+
+def test_multipart_minimal():
+ msg = _parse('multimin')
+ tree = walk.get_tree(msg)
+
+ assert tree['multi'] is True
+ assert len(tree['part_map']) == 1
+ first = tree['part_map'][1]
+ assert first['multi'] is False
+ assert first['ctype'] == 'text/plain'
+
+
+def test_multi_signed():
+ msg = _parse('multisigned')
+ tree = walk.get_tree(msg)
+ assert tree['multi'] is True
+ assert len(tree['part_map']) == 2
+
+ _first = tree['part_map'][1]
+ _second = tree['part_map'][2]
+ assert len(_first['part_map']) == 3
+ assert(_second['multi'] is False)
+
+
+def test_bounce_mime():
+ msg = _parse('bounced')
+ tree = walk.get_tree(msg)
+
+ ctypes = [tree['part_map'][index]['ctype']
+ for index in sorted(tree['part_map'].keys())]
+ third = tree['part_map'][3]
+ three_one_ctype = third['part_map'][1]['ctype']
+ assert three_one_ctype == 'multipart/signed'
+
+ assert ctypes == [
+ 'text/plain',
+ 'message/delivery-status',
+ 'message/rfc822']
+
+
+# utils
+
+def _parse(name):
+ _str = _get_string_for_message(name)
+ return _parser.parsestr(_str)
+
+
+def _get_string_for_message(name):
+ filename = os.path.join(_here, CORPUS[name])
+ with open(filename) as f:
+ msgstr = f.read()
+ return msgstr
diff --git a/mail/src/leap/mail/walk.py b/mail/src/leap/mail/walk.py
index c116601..d143d61 100644
--- a/mail/src/leap/mail/walk.py
+++ b/mail/src/leap/mail/walk.py
@@ -15,8 +15,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
-Utilities for walking along a message tree.
+Walk a message tree and generate documents that can be inserted in the backend
+store.
"""
+from email.parser import Parser
+
from cryptography.hazmat.backends.multibackend import MultiBackend
from cryptography.hazmat.backends.openssl.backend import (
Backend as OpenSSLBackend)
@@ -26,49 +29,32 @@ from leap.mail.utils import first
crypto_backend = MultiBackend([OpenSSLBackend()])
+_parser = Parser()
-def get_hash(s):
- digest = hashes.Hash(hashes.SHA256(), crypto_backend)
- digest.update(s)
- return digest.finalize().encode("hex").upper()
-
-
-"""
-Get interesting message parts
-"""
-
-
-def get_parts(msg):
- return [
- {
- 'multi': part.is_multipart(),
- 'ctype': part.get_content_type(),
- 'size': len(part.as_string()),
- 'parts':
- len(part.get_payload())
- if isinstance(part.get_payload(), list)
- else 1,
- 'headers': part.items(),
- 'phash':
- get_hash(part.get_payload())
- if not part.is_multipart()
- else None
- } for part in msg.walk()]
-
-"""
-Utility lambda functions for getting the parts vector and the
-payloads from the original message.
-"""
+def get_tree(msg):
+ p = {}
+ p['ctype'] = msg.get_content_type()
+ p['headers'] = msg.items()
-def get_parts_vector(parts):
- return (x.get('parts', 1) for x in parts)
+ payload = msg.get_payload()
+ is_multi = msg.is_multipart()
+ if is_multi:
+ p['part_map'] = dict(
+ [(idx, get_tree(part)) for idx, part in enumerate(payload, 1)])
+ p['parts'] = len(payload)
+ p['phash'] = None
+ else:
+ p['parts'] = 0
+ p['size'] = len(payload)
+ p['phash'] = get_hash(payload)
+ p['part_map'] = {}
+ p['multi'] = is_multi
+ return p
-def get_payloads(msg):
- return ((x.get_payload(),
- dict(((str.lower(k), v) for k, v in (x.items()))))
- for x in msg.walk())
+def get_tree_from_string(messagestr):
+ return get_tree(_parser.parsestr(messagestr))
def get_body_phash(msg):
@@ -81,27 +67,29 @@ def get_body_phash(msg):
# XXX avoid hashing again
return get_hash(part.get_payload())
-"""
-On getting the raw docs, we get also some of the headers to be able to
-index the content. Here we remove any mutable part, as the the filename
-in the content disposition.
-"""
-
-def get_raw_docs(msg, parts):
+def get_raw_docs(msg):
+ """
+ We get also some of the headers to be able to
+ index the content. Here we remove any mutable part, such as the filename
+ in the content disposition.
+ """
return (
- {
- "type": "cnt", # type content they'll be
- "raw": payload,
- "phash": get_hash(payload),
- "content-disposition": first(headers.get(
- 'content-disposition', '').split(';')),
- "content-type": headers.get(
- 'content-type', ''),
- "content-transfer-encoding": headers.get(
- 'content-transfer-encoding', '')
- } for payload, headers in get_payloads(msg)
- if not isinstance(payload, list))
+ {'type': 'cnt',
+ 'raw': part.get_payload(),
+ 'phash': get_hash(part.get_payload()),
+ 'content-type': part.get_content_type(),
+ 'content-disposition': first(part.get(
+ 'content-disposition', '').split(';')),
+ 'content-transfer-encoding': part.get(
+ 'content-transfer-encoding', '')
+ } for part in msg.walk() if not isinstance(part.get_payload(), list))
+
+
+def get_hash(s):
+ digest = hashes.Hash(hashes.SHA256(), crypto_backend)
+ digest.update(s)
+ return digest.finalize().encode("hex").upper()
"""
@@ -116,111 +104,4 @@ Groucho Marx: What's the matter with it?
Chico Marx: I don't know, let's hear it again.
Groucho Marx: So the party of the first part shall be known in this contract as
the party of the first part.
-
-Chico Marx: Well it sounds a little better this time.
-Groucho Marx: Well, it grows on you. Would you like to hear it once more?
-
-Chico Marx: Just the first part.
-Groucho Marx: All right. It says the first part of the party of the first part
- shall be known in this contract as the first part of the party of
- the first part, shall be known in this contract - look, why
- should we quarrel about a thing like this, we'll take it right
- out, eh?
-
-Chico Marx: Yes, it's too long anyhow. Now what have we got left?
-Groucho Marx: Well I've got about a foot and a half. Now what's the matter?
-
-Chico Marx: I don't like the second party either.
"""
-
-
-def walk_msg_tree(parts, body_phash=None):
- """
- Take a list of interesting items of a message subparts structure,
- and return a dict of dicts almost ready to be written to the content
- documents that will be stored in Soledad.
-
- It walks down the subparts in the parsed message tree, and collapses
- the leaf documents into a wrapper document until no multipart submessages
- are left. To achieve this, it iteratively calculates a wrapper vector of
- all documents in the sequence that have more than one part and have unitary
- documents to their right. To collapse a multipart, take as many
- unitary documents as parts the submessage contains, and replace the object
- in the sequence with the new wrapper document.
-
- :param parts: A list of dicts containing the interesting properties for
- the message structure. Normally this has been generated by
- doing a message walk.
- :type parts: list of dicts.
- :param body_phash: the payload hash of the body part, to be included
- in the outer content doc for convenience.
- :type body_phash: basestring or None
- """
- PART_MAP = "part_map"
- MULTI = "multi"
- HEADERS = "headers"
- PHASH = "phash"
- BODY = "body"
-
- # parts vector
- pv = list(get_parts_vector(parts))
-
- inner_headers = parts[1].get(HEADERS, None) if (
- len(parts) == 2) else None
-
- # wrappers vector
- def getwv(pv):
- return [
- True if pv[i] != 1 and pv[i + 1] == 1
- else False
- for i in range(len(pv) - 1)
- ]
- wv = getwv(pv)
-
- # do until no wrapper document is left
- while any(wv):
- wind = wv.index(True) # wrapper index
- nsub = pv[wind] # number of subparts to pick
- slic = parts[wind + 1:wind + 1 + nsub] # slice with subparts
-
- cwra = {
- MULTI: True,
- PART_MAP: dict((index + 1, part) # content wrapper
- for index, part in enumerate(slic)),
- HEADERS: dict(parts[wind][HEADERS])
- }
-
- # remove subparts and substitute wrapper
- map(lambda i: parts.remove(i), slic)
- parts[wind] = cwra
-
- # refresh vectors for this iteration
- pv = list(get_parts_vector(parts))
- wv = getwv(pv)
-
- if all(x == 1 for x in pv):
- # special case in the rightmost element
- main_pmap = parts[0].get(PART_MAP, None)
- if main_pmap is not None:
- last_part = max(main_pmap.keys())
- main_pmap[last_part][PART_MAP] = {}
- for partind in range(len(pv) - 1):
- main_pmap[last_part][PART_MAP][partind] = parts[partind + 1]
-
- outer = parts[0]
- outer.pop(HEADERS)
- if PART_MAP not in outer:
- # we have a multipart with 1 part only, so kind of fix it
- # although it would be prettier if I take this special case at
- # the beginning of the walk.
- pdoc = {MULTI: True,
- PART_MAP: {1: outer}}
- pdoc[PART_MAP][1][MULTI] = False
- if not pdoc[PART_MAP][1].get(PHASH, None):
- pdoc[PART_MAP][1][PHASH] = body_phash
- if inner_headers:
- pdoc[PART_MAP][1][HEADERS] = inner_headers
- else:
- pdoc = outer
- pdoc[BODY] = body_phash
- return pdoc