Diffstat (limited to 'src/leap/mail/utils.py')
-rw-r--r--  src/leap/mail/utils.py  327
1 file changed, 327 insertions, 0 deletions
diff --git a/src/leap/mail/utils.py b/src/leap/mail/utils.py
new file mode 100644
index 0000000..fed24b3
--- /dev/null
+++ b/src/leap/mail/utils.py
@@ -0,0 +1,327 @@
+# -*- coding: utf-8 -*-
+# utils.py
+# Copyright (C) 2013 LEAP
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Mail utilities.
+"""
+import json
+import re
+import traceback
+import Queue
+
+from leap.soledad.common.document import SoledadDocument
+
+
+CHARSET_PATTERN = r"""charset=([\w-]+)"""
+CHARSET_RE = re.compile(CHARSET_PATTERN, re.IGNORECASE)
+
+
+def first(things):
+ """
+ Return the head of a collection.
+ """
+ try:
+ return things[0]
+ except (IndexError, TypeError):
+ return None
+
+
+def empty(thing):
+ """
+ Return True if a thing is None or its length is zero.
+ """
+ if thing is None:
+ return True
+ if isinstance(thing, SoledadDocument):
+ thing = thing.content
+ try:
+ return len(thing) == 0
+ except (ReferenceError, TypeError):
+ return True
+
+
+def maybe_call(thing):
+ """
+ Return the same thing, or the result of its invocation if it is a
+ callable.
+ """
+ return thing() if callable(thing) else thing
+
+
+def find_charset(thing, default=None):
+ """
+    Look for a charset specification in the object `thing`.
+    The lookup is done on the object's `repr`.
+
+    :param thing: the object to look into.
+    :type thing: object
+    :param default: the default charset to return if no charset is found.
+    :type default: str
+
+    :return: the charset or `default`
+ :rtype: str or None
+ """
+ charset = first(CHARSET_RE.findall(repr(thing)))
+ if charset is None:
+ charset = default
+ return charset
+
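+# Illustrative sketch (added for clarity, not part of the original commit):
+# the charset is looked up in the object's repr, so a plain string that
+# contains a Content-Type header works too.
+#
+#   >>> find_charset("Content-Type: text/plain; charset=UTF-8")
+#   'UTF-8'
+#   >>> find_charset("no charset in here", default="iso-8859-1")
+#   'iso-8859-1'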
+
+def lowerdict(_dict):
+ """
+ Return a dict with the keys in lowercase.
+
+ :param _dict: the dict to convert
+ :rtype: dict
+ """
+ # TODO should properly implement a CaseInsensitive dict.
+ # Look into requests code.
+ return dict((key.lower(), value)
+ for key, value in _dict.items())
+
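+# Illustrative sketch (added for clarity, not part of the original commit):
+#
+#   >>> lowerdict({"Content-Type": "text/plain"})
+#   {'content-type': 'text/plain'}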
+
+PART_MAP = "part_map"
+PHASH = "phash"
+
+
+def _str_dict(d, k):
+ """
+    Convert the dictionary key to a string if it is an integer.
+
+ :param d: the dict
+ :type d: dict
+ :param k: the key
+ :type k: object
+ """
+ if isinstance(k, int):
+ val = d[k]
+ d[str(k)] = val
+ del(d[k])
+
+
+def stringify_parts_map(d):
+ """
+    Modify a dictionary in place so that every nested dict under a
+    "part_map" key uses strings as keys.
+
+    :param d: the dictionary to modify
+    :type d: dict
+    :rtype: dict
+ """
+ for k in d:
+ if k == PART_MAP:
+ pmap = d[k]
+ for kk in pmap.keys():
+ _str_dict(d[k], kk)
+ for kk in pmap.keys():
+ stringify_parts_map(d[k][str(kk)])
+ return d
+
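+# Illustrative sketch (added for clarity, not part of the original commit):
+# integer keys under a "part_map" key become strings, recursively.
+#
+#   >>> stringify_parts_map({"part_map": {1: {"phash": "deadbeef"}}})
+#   {'part_map': {'1': {'phash': 'deadbeef'}}}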
+
+def phash_iter(d):
+ """
+    A recursive generator that extracts all the payload-hashes
+    from an arbitrarily nested parts-map dictionary.
+
+    :param d: the dictionary to walk
+    :type d: dict
+    :return: a generator that yields every phash found
+    :rtype: generator
+ """
+ if PHASH in d:
+ yield d[PHASH]
+ if PART_MAP in d:
+ for key in d[PART_MAP]:
+ for phash in phash_iter(d[PART_MAP][key]):
+ yield phash
+
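+# Illustrative sketch (added for clarity, not part of the original commit):
+# the generator yields the top-level phash first, then the nested ones.
+#
+#   >>> list(phash_iter({"phash": "aa", "part_map": {"1": {"phash": "bb"}}}))
+#   ['aa', 'bb']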
+
+def accumulator(fun, lim):
+ """
+    A simple accumulator that uses a closure and a mutable
+    object to collect items.
+    When the count of items reaches `lim`, the collection is
+    flushed after mapping the function `fun` over its items.
+
+    The returned accumulator can also be flushed at any moment
+    by passing `flush=True` as a second parameter.
+
+    :param fun: the function to call on each collected item
+        when the collection size reaches `lim`
+    :type fun: callable
+    :param lim: the number of collected items that triggers a flush
+    :type lim: int
+    :rtype: function
+
+ >>> from pprint import pprint
+ >>> acc = accumulator(pprint, 2)
+    >>> acc(1)
+    >>> acc(2)
+    1
+    2
+    >>> acc(3)
+    >>> acc(4)
+    3
+    4
+    >>> acc = accumulator(pprint, 5)
+    >>> acc(1)
+    >>> acc(2)
+    >>> acc(3)
+    >>> acc(None, flush=True)
+    1
+    2
+    3
+ """
+ KEY = "items"
+ _o = {KEY: []}
+
+ def _accumulator(item, flush=False):
+ collection = _o[KEY]
+ collection.append(item)
+ if len(collection) >= lim or flush:
+ map(fun, filter(None, collection))
+ _o[KEY] = []
+
+ return _accumulator
+
+
+def accumulator_queue(fun, lim):
+ """
+ A version of the accumulator that uses a queue.
+
+    When the count of queued items reaches `lim`, the
+    queue is flushed after invoking the function `fun`
+    over each of its items.
+
+    The returned accumulator can also be flushed at any moment
+    by passing `flush=True` as a second parameter.
+
+    :param fun: the function to call on each queued item
+        when the queue size reaches `lim`
+    :type fun: callable
+    :param lim: the number of queued items that triggers a flush
+    :type lim: int
+ :rtype: function
+ """
+ _q = Queue.Queue()
+
+ def _accumulator(item, flush=False):
+ _q.put(item)
+ if _q.qsize() >= lim or flush:
+ collection = [_q.get() for i in range(_q.qsize())]
+ map(fun, filter(None, collection))
+
+ return _accumulator
+
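+# Minimal usage sketch (illustrative, not part of the original commit;
+# the `notify` callable below is hypothetical):
+#
+#   batched_notify = accumulator_queue(notify, 10)
+#   batched_notify("event")            # queued until 10 items pile up
+#   batched_notify(None, flush=True)   # or force the flush right away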
+
+#
+# String manipulation
+#
+
+class CustomJsonScanner(object):
+ """
+    A context manager that monkey patches the default json string
+    parsing behavior.
+    Emails can mix more than one encoding in a single raw string, and the
+    json module does not work directly with such `str` objects
+    (it only deals with `unicode`).
+ """
+
+ def _parse_string_str(self, s, idx, *args, **kwargs):
+ """
+        Parse the string `s` starting at index `idx` and return a `str`
+        object. It works the same as the regular JSON string parsing,
+        except that it does not try to decode utf8.
+        We need this because raw mail strings may contain bytes in
+        multiple encodings.
+
+ :param s: the string we want to parse
+ :type s: str
+ :param idx: the starting point for parsing
+ :type idx: int
+
+ :returns: the parsed string and the index where the
+ string ends.
+ :rtype: tuple (str, int)
+ """
+ # NOTE: we just want to use this monkey patched version if we are
+ # calling the loads from our custom method. Otherwise, we use the
+ # json's default parser.
+ monkey_patched = False
+ for i in traceback.extract_stack():
+ # look for json_loads method in the call stack
+ if i[2] == json_loads.__name__:
+ monkey_patched = True
+ break
+
+ if not monkey_patched:
+ return self._orig_scanstring(s, idx, *args, **kwargs)
+
+ # TODO profile to see if a compiled regex can get us some
+ # benefit here.
+ found = False
+ end = s.find("\"", idx)
+ while not found:
+ try:
+ if s[end-1] != "\\":
+ found = True
+ else:
+ end = s.find("\"", end+1)
+ except Exception:
+ found = True
+ return s[idx:end].decode("string-escape"), end+1
+
+ def __enter__(self):
+ """
+ Replace the json methods with the needed ones.
+ Also make a backup to restore them later.
+ """
+ # backup original values
+ self._orig_make_scanner = json.scanner.make_scanner
+ self._orig_scanstring = json.decoder.scanstring
+
+ # We need the make_scanner function to be the python one so we can
+ # monkey_patch the json string parsing
+ json.scanner.make_scanner = json.scanner.py_make_scanner
+
+        # And now we monkey patch the scanstring method
+ json.decoder.scanstring = self._parse_string_str
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ """
+        Restore the backed up methods.
+ """
+ # restore original values
+ json.scanner.make_scanner = self._orig_make_scanner
+ json.decoder.scanstring = self._orig_scanstring
+
+
+def json_loads(data):
+ """
+    Work like json.loads, but support multiple encodings in the same
+    string and accept a `str` parameter that will not be converted to
+    unicode.
+
+    :param data: the string to load the objects from
+    :type data: str
+
+    :returns: the python object resulting from parsing `data`; this behaves
+        like json.loads, except that strings are always returned as
+        `str` instead of `unicode`.
+ """
+ obj = None
+ with CustomJsonScanner():
+ # We need to use the cls parameter in order to trigger the code
+ # that will let us control the string parsing method.
+ obj = json.loads(data, cls=json.JSONDecoder)
+
+ return obj
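+
+
+# Illustrative sketch (added for clarity, not part of the original commit):
+# json_loads keeps raw, possibly non-utf8 bytes as `str` instead of trying
+# to decode them to `unicode` as the stock json.loads would.
+#
+#   >>> json_loads('{"body": "caf\xe9"}')
+#   {'body': 'caf\xe9'}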