diff options
Diffstat (limited to 'mail/src/leap/mail/walk.py')
-rw-r--r-- | mail/src/leap/mail/walk.py | 211 |
1 files changed, 46 insertions, 165 deletions
diff --git a/mail/src/leap/mail/walk.py b/mail/src/leap/mail/walk.py index c116601..d143d61 100644 --- a/mail/src/leap/mail/walk.py +++ b/mail/src/leap/mail/walk.py @@ -15,8 +15,11 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. """ -Utilities for walking along a message tree. +Walk a message tree and generate documents that can be inserted in the backend +store. """ +from email.parser import Parser + from cryptography.hazmat.backends.multibackend import MultiBackend from cryptography.hazmat.backends.openssl.backend import ( Backend as OpenSSLBackend) @@ -26,49 +29,32 @@ from leap.mail.utils import first crypto_backend = MultiBackend([OpenSSLBackend()]) +_parser = Parser() -def get_hash(s): - digest = hashes.Hash(hashes.SHA256(), crypto_backend) - digest.update(s) - return digest.finalize().encode("hex").upper() - - -""" -Get interesting message parts -""" - - -def get_parts(msg): - return [ - { - 'multi': part.is_multipart(), - 'ctype': part.get_content_type(), - 'size': len(part.as_string()), - 'parts': - len(part.get_payload()) - if isinstance(part.get_payload(), list) - else 1, - 'headers': part.items(), - 'phash': - get_hash(part.get_payload()) - if not part.is_multipart() - else None - } for part in msg.walk()] - -""" -Utility lambda functions for getting the parts vector and the -payloads from the original message. -""" +def get_tree(msg): + p = {} + p['ctype'] = msg.get_content_type() + p['headers'] = msg.items() -def get_parts_vector(parts): - return (x.get('parts', 1) for x in parts) + payload = msg.get_payload() + is_multi = msg.is_multipart() + if is_multi: + p['part_map'] = dict( + [(idx, get_tree(part)) for idx, part in enumerate(payload, 1)]) + p['parts'] = len(payload) + p['phash'] = None + else: + p['parts'] = 0 + p['size'] = len(payload) + p['phash'] = get_hash(payload) + p['part_map'] = {} + p['multi'] = is_multi + return p -def get_payloads(msg): - return ((x.get_payload(), - dict(((str.lower(k), v) for k, v in (x.items())))) - for x in msg.walk()) +def get_tree_from_string(messagestr): + return get_tree(_parser.parsestr(messagestr)) def get_body_phash(msg): @@ -81,27 +67,29 @@ def get_body_phash(msg): # XXX avoid hashing again return get_hash(part.get_payload()) -""" -On getting the raw docs, we get also some of the headers to be able to -index the content. Here we remove any mutable part, as the the filename -in the content disposition. -""" - -def get_raw_docs(msg, parts): +def get_raw_docs(msg): + """ + We get also some of the headers to be able to + index the content. Here we remove any mutable part, as the the filename + in the content disposition. + """ return ( - { - "type": "cnt", # type content they'll be - "raw": payload, - "phash": get_hash(payload), - "content-disposition": first(headers.get( - 'content-disposition', '').split(';')), - "content-type": headers.get( - 'content-type', ''), - "content-transfer-encoding": headers.get( - 'content-transfer-encoding', '') - } for payload, headers in get_payloads(msg) - if not isinstance(payload, list)) + {'type': 'cnt', + 'raw': part.get_payload(), + 'phash': get_hash(part.get_payload()), + 'content-type': part.get_content_type(), + 'content-disposition': first(part.get( + 'content-disposition', '').split(';')), + 'content-transfer-encoding': part.get( + 'content-transfer-encoding', '') + } for part in msg.walk() if not isinstance(part.get_payload(), list)) + + +def get_hash(s): + digest = hashes.Hash(hashes.SHA256(), crypto_backend) + digest.update(s) + return digest.finalize().encode("hex").upper() """ @@ -116,111 +104,4 @@ Groucho Marx: What's the matter with it? Chico Marx: I don't know, let's hear it again. Groucho Marx: So the party of the first part shall be known in this contract as the party of the first part. - -Chico Marx: Well it sounds a little better this time. -Groucho Marx: Well, it grows on you. Would you like to hear it once more? - -Chico Marx: Just the first part. -Groucho Marx: All right. It says the first part of the party of the first part - shall be known in this contract as the first part of the party of - the first part, shall be known in this contract - look, why - should we quarrel about a thing like this, we'll take it right - out, eh? - -Chico Marx: Yes, it's too long anyhow. Now what have we got left? -Groucho Marx: Well I've got about a foot and a half. Now what's the matter? - -Chico Marx: I don't like the second party either. """ - - -def walk_msg_tree(parts, body_phash=None): - """ - Take a list of interesting items of a message subparts structure, - and return a dict of dicts almost ready to be written to the content - documents that will be stored in Soledad. - - It walks down the subparts in the parsed message tree, and collapses - the leaf documents into a wrapper document until no multipart submessages - are left. To achieve this, it iteratively calculates a wrapper vector of - all documents in the sequence that have more than one part and have unitary - documents to their right. To collapse a multipart, take as many - unitary documents as parts the submessage contains, and replace the object - in the sequence with the new wrapper document. - - :param parts: A list of dicts containing the interesting properties for - the message structure. Normally this has been generated by - doing a message walk. - :type parts: list of dicts. - :param body_phash: the payload hash of the body part, to be included - in the outer content doc for convenience. - :type body_phash: basestring or None - """ - PART_MAP = "part_map" - MULTI = "multi" - HEADERS = "headers" - PHASH = "phash" - BODY = "body" - - # parts vector - pv = list(get_parts_vector(parts)) - - inner_headers = parts[1].get(HEADERS, None) if ( - len(parts) == 2) else None - - # wrappers vector - def getwv(pv): - return [ - True if pv[i] != 1 and pv[i + 1] == 1 - else False - for i in range(len(pv) - 1) - ] - wv = getwv(pv) - - # do until no wrapper document is left - while any(wv): - wind = wv.index(True) # wrapper index - nsub = pv[wind] # number of subparts to pick - slic = parts[wind + 1:wind + 1 + nsub] # slice with subparts - - cwra = { - MULTI: True, - PART_MAP: dict((index + 1, part) # content wrapper - for index, part in enumerate(slic)), - HEADERS: dict(parts[wind][HEADERS]) - } - - # remove subparts and substitute wrapper - map(lambda i: parts.remove(i), slic) - parts[wind] = cwra - - # refresh vectors for this iteration - pv = list(get_parts_vector(parts)) - wv = getwv(pv) - - if all(x == 1 for x in pv): - # special case in the rightmost element - main_pmap = parts[0].get(PART_MAP, None) - if main_pmap is not None: - last_part = max(main_pmap.keys()) - main_pmap[last_part][PART_MAP] = {} - for partind in range(len(pv) - 1): - main_pmap[last_part][PART_MAP][partind] = parts[partind + 1] - - outer = parts[0] - outer.pop(HEADERS) - if PART_MAP not in outer: - # we have a multipart with 1 part only, so kind of fix it - # although it would be prettier if I take this special case at - # the beginning of the walk. - pdoc = {MULTI: True, - PART_MAP: {1: outer}} - pdoc[PART_MAP][1][MULTI] = False - if not pdoc[PART_MAP][1].get(PHASH, None): - pdoc[PART_MAP][1][PHASH] = body_phash - if inner_headers: - pdoc[PART_MAP][1][HEADERS] = inner_headers - else: - pdoc = outer - pdoc[BODY] = body_phash - return pdoc |