src/leap/bitmask/mail/walk.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

# -*- coding: utf-8 -*-
# walk.py
# Copyright (C) 2013-2015 LEAP
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
Walk a message tree and generate documents that can be inserted in the backend
store.
"""
from email.parser import Parser

from cryptography.hazmat.backends.multibackend import MultiBackend
from cryptography.hazmat.backends.openssl.backend import (
    Backend as OpenSSLBackend)
from cryptography.hazmat.primitives import hashes

from leap.bitmask.mail.utils import first

# Backend object handed to ``hashes.Hash`` in get_hash. It is built once, at
# import time, wrapping a single OpenSSL backend instance.
crypto_backend = MultiBackend([OpenSSLBackend()])

# Module-level email parser, reused by get_tree_from_string for every call.
_parser = Parser()


def get_tree(msg):
    """
    Recursively describe a message (or message part) as a nested dict.

    Every node carries the keys ``ctype``, ``headers``, ``multi``, ``parts``,
    ``phash`` and ``part_map``. Leaf (non-multipart) nodes additionally carry
    ``size``, the length of their payload.

    :param msg: the message or sub-part to describe.
    :return: a dict describing this node and, through ``part_map``, all of
             its children keyed by their 1-based position.
    """
    tree = {'ctype': msg.get_content_type(), 'headers': msg.items()}

    body = msg.get_payload()
    multipart = msg.is_multipart()
    tree['multi'] = multipart

    if not multipart:
        # Leaf node: no children, the payload itself gets measured and hashed.
        tree.update({
            'parts': 0,
            'size': len(body),
            'phash': get_hash(body),
            'part_map': {},
        })
        return tree

    # Container node: the payload is the list of sub-parts. Children are
    # numbered starting at 1, and the container itself has no payload hash.
    tree['part_map'] = {
        position: get_tree(child)
        for position, child in enumerate(body, 1)}
    tree['parts'] = len(body)
    tree['phash'] = None
    return tree


def get_tree_from_string(messagestr):
    """
    Parse a raw message string and build its tree description.

    :param messagestr: the full message, as a string, to be parsed with the
                       module-level parser.
    :return: the dict produced by get_tree for the parsed message.
    """
    parsed = _parser.parsestr(messagestr)
    return get_tree(parsed)


def get_body_phash(msg):
    """
    Return the payload hash of the first part that counts as the body.

    Parts are visited in ``msg.walk()`` order; the first one whose content
    type is ``text/plain`` or ``text/html`` is hashed.

    :param msg: the message to inspect.
    :return: the hash of the body payload, or None if no part qualifies.
    """
    # XXX what other ctypes should be considered body?
    body_ctypes = ("text/plain", "text/html")

    for candidate in msg.walk():
        if candidate.get_content_type() not in body_ctypes:
            continue
        # XXX avoid hashing again
        return get_hash(candidate.get_payload())
    return None


def get_raw_docs(msg):
    """
    Lazily produce one content document per non-container part of a message.

    Besides the raw payload and its hash, a few headers are copied into each
    document so that the content can be indexed. The content disposition is
    reduced to what ``first`` returns for its ';'-separated pieces
    (presumably the leading token), which leaves out mutable parameters such
    as the filename.

    :param msg: the message whose parts will be walked.
    :return: a generator of dicts, one for each part whose payload is not a
             list of sub-parts.
    """
    def _as_content_doc(leaf):
        # Build the 'cnt' document for a single leaf part.
        body = leaf.get_payload()
        return {
            'type': 'cnt',
            'raw': body,
            'phash': get_hash(body),
            'content-type': leaf.get_content_type(),
            'charset': leaf.get_content_charset(),
            'content-disposition': first(
                leaf.get('content-disposition', '').split(';')),
            'content-transfer-encoding': leaf.get(
                'content-transfer-encoding', ''),
        }

    # Containers (whose payload is a list of parts) are skipped; only leaves
    # yield a document.
    return (
        _as_content_doc(leaf)
        for leaf in msg.walk()
        if not isinstance(leaf.get_payload(), list))


def get_hash(s):
    """
    Compute the SHA-256 digest of a string.

    :param s: the string to be hashed.
    :return: the digest, hex-encoded and upper-cased.
    """
    hasher = hashes.Hash(hashes.SHA256(), crypto_backend)
    hasher.update(s)
    raw_digest = hasher.finalize()
    hex_digest = raw_digest.encode("hex")
    return hex_digest.upper()


"""
Groucho Marx: Now pay particular attention to this first clause, because it's
              most important. There's the party of the first part shall be
              known in this contract as the party of the first part. How do you
              like that, that's pretty neat eh?

Chico Marx: No, that's no good.
Groucho Marx: What's the matter with it?

Chico Marx: I don't know, let's hear it again.
Groucho Marx: So the party of the first part shall be known in this contract as
              the party of the first part.
"""