From 40c5c6d7828362a826f022203cd6d50a5bfe6f0b Mon Sep 17 00:00:00 2001 From: Ivan Alejandro Date: Fri, 17 Jan 2014 14:59:09 -0300 Subject: Add custom json.loads method. This allows us to support the use of an `str` parameter that won't be converted to unicode. So in the case of a string containing bytes with different encodings this won't break. --- mail/src/leap/mail/utils.py | 101 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) (limited to 'mail') diff --git a/mail/src/leap/mail/utils.py b/mail/src/leap/mail/utils.py index 2480efc..93388d3 100644 --- a/mail/src/leap/mail/utils.py +++ b/mail/src/leap/mail/utils.py @@ -15,8 +15,10 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . """ -Small utilities. +Mail utilities. """ +import json +import traceback def first(things): @@ -27,3 +29,100 @@ def first(things): return things[0] except (IndexError, TypeError): return None + + +class CustomJsonScanner(object): + """ + This class is a context manager definition used to monkey patch the default + json string parsing behavior. + The emails can have more than one encoding, so the `str` objects have more + than one encoding and json does not support direct work with `str` + (only `unicode`). + """ + + def _parse_string_str(self, s, idx, *args, **kwargs): + """ + Parses the string "s" starting at the point idx and returns an `str` + object. Which basically means it works exactly the same as the regular + JSON string parsing, except that it doesn't try to decode utf8. + We need this because mail raw strings might have bytes in multiple + encodings. + + :param s: the string we want to parse + :type s: str + :param idx: the starting point for parsing + :type idx: int + + :returns: the parsed string and the index where the + string ends. + :rtype: tuple (str, int) + """ + # NOTE: we just want to use this monkey patched version if we are + # calling the loads from our custom method. Otherwise, we use the + # json's default parser. + monkey_patched = False + for i in traceback.extract_stack(): + # look for json_loads method in the call stack + if i[2] == json_loads.__name__: + monkey_patched = True + break + + if not monkey_patched: + return self._orig_scanstring(s, idx, *args, **kwargs) + + found = False + end = s.find("\"", idx) + while not found: + try: + if s[end-1] != "\\": + found = True + else: + end = s.find("\"", end+1) + except Exception: + found = True + return s[idx:end].decode("string-escape"), end+1 + + def __enter__(self): + """ + Replace the json methods with the needed ones. + Also make a backup to restore them later. + """ + # backup original values + self._orig_make_scanner = json.scanner.make_scanner + self._orig_scanstring = json.decoder.scanstring + + # We need the make_scanner function to be the python one so we can + # monkey_patch the json string parsing + json.scanner.make_scanner = json.scanner.py_make_scanner + + # And now we monkey patch the money method + json.decoder.scanstring = self._parse_string_str + + def __exit__(self, exc_type, exc_value, traceback): + """ + Restores the backuped methods. + """ + # restore original values + json.scanner.make_scanner = self._orig_make_scanner + json.decoder.scanstring = self._orig_scanstring + + +def json_loads(data): + """ + It works as json.loads but supporting multiple encodings in the same + string and accepting an `str` parameter that won't be converted to unicode. + + :param data: the string to load the objects from + :type data: str + + :returns: the corresponding python object result of parsing 'data', this + behaves similarly as json.loads, with the exception of that + returns always `str` instead of `unicode`. + """ + obj = None + with CustomJsonScanner(): + # We need to use the cls parameter in order to trigger the code + # that will let us control the string parsing method. + obj = json.loads(data, cls=json.JSONDecoder) + + return obj -- cgit v1.2.3