From 7b558eb23208d6de0b115fa453334421cc941e44 Mon Sep 17 00:00:00 2001 From: Ivan Alejandro Date: Fri, 17 Jan 2014 14:59:09 -0300 Subject: Add custom json.loads method. This allows us to support the use of an `str` parameter that won't be converted to unicode. So in the case of a string containing bytes with different encodings this won't break. --- src/leap/mail/utils.py | 101 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/src/leap/mail/utils.py b/src/leap/mail/utils.py index 2480efc..93388d3 100644 --- a/src/leap/mail/utils.py +++ b/src/leap/mail/utils.py @@ -15,8 +15,10 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . """ -Small utilities. +Mail utilities. """ +import json +import traceback def first(things): @@ -27,3 +29,100 @@ def first(things): return things[0] except (IndexError, TypeError): return None + + +class CustomJsonScanner(object): + """ + This class is a context manager definition used to monkey patch the default + json string parsing behavior. + The emails can have more than one encoding, so the `str` objects have more + than one encoding and json does not support direct work with `str` + (only `unicode`). + """ + + def _parse_string_str(self, s, idx, *args, **kwargs): + """ + Parses the string "s" starting at the point idx and returns an `str` + object. Which basically means it works exactly the same as the regular + JSON string parsing, except that it doesn't try to decode utf8. + We need this because mail raw strings might have bytes in multiple + encodings. + + :param s: the string we want to parse + :type s: str + :param idx: the starting point for parsing + :type idx: int + + :returns: the parsed string and the index where the + string ends. + :rtype: tuple (str, int) + """ + # NOTE: we just want to use this monkey patched version if we are + # calling the loads from our custom method. Otherwise, we use the + # json's default parser. + monkey_patched = False + for i in traceback.extract_stack(): + # look for json_loads method in the call stack + if i[2] == json_loads.__name__: + monkey_patched = True + break + + if not monkey_patched: + return self._orig_scanstring(s, idx, *args, **kwargs) + + found = False + end = s.find("\"", idx) + while not found: + try: + if s[end-1] != "\\": + found = True + else: + end = s.find("\"", end+1) + except Exception: + found = True + return s[idx:end].decode("string-escape"), end+1 + + def __enter__(self): + """ + Replace the json methods with the needed ones. + Also make a backup to restore them later. + """ + # backup original values + self._orig_make_scanner = json.scanner.make_scanner + self._orig_scanstring = json.decoder.scanstring + + # We need the make_scanner function to be the python one so we can + # monkey_patch the json string parsing + json.scanner.make_scanner = json.scanner.py_make_scanner + + # And now we monkey patch the money method + json.decoder.scanstring = self._parse_string_str + + def __exit__(self, exc_type, exc_value, traceback): + """ + Restores the backuped methods. + """ + # restore original values + json.scanner.make_scanner = self._orig_make_scanner + json.decoder.scanstring = self._orig_scanstring + + +def json_loads(data): + """ + It works as json.loads but supporting multiple encodings in the same + string and accepting an `str` parameter that won't be converted to unicode. + + :param data: the string to load the objects from + :type data: str + + :returns: the corresponding python object result of parsing 'data', this + behaves similarly as json.loads, with the exception of that + returns always `str` instead of `unicode`. + """ + obj = None + with CustomJsonScanner(): + # We need to use the cls parameter in order to trigger the code + # that will let us control the string parsing method. + obj = json.loads(data, cls=json.JSONDecoder) + + return obj -- cgit v1.2.3 From 28694a321a81f4cbe5f4873cdc55e6d3f471dd48 Mon Sep 17 00:00:00 2001 From: Ivan Alejandro Date: Fri, 17 Jan 2014 15:07:37 -0300 Subject: Fix encodings usage, use custom json.loads method. Also remove some unused imports. --- src/leap/mail/imap/fetch.py | 19 +++++-------------- src/leap/mail/imap/messages.py | 4 ++-- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/leap/mail/imap/fetch.py b/src/leap/mail/imap/fetch.py index 604a2ea..817ad6a 100644 --- a/src/leap/mail/imap/fetch.py +++ b/src/leap/mail/imap/fetch.py @@ -18,9 +18,7 @@ Incoming mail fetcher. """ import copy -import json import logging -#import ssl import threading import time import sys @@ -34,7 +32,6 @@ from StringIO import StringIO from twisted.python import log from twisted.internet import defer from twisted.internet.task import LoopingCall -#from twisted.internet.threads import deferToThread from zope.proxy import sameProxiedObjects from leap.common import events as leap_events @@ -49,6 +46,7 @@ from leap.common.mail import get_email_charset from leap.keymanager import errors as keymanager_errors from leap.keymanager.openpgp import OpenPGPKey from leap.mail.decorators import deferred +from leap.mail.utils import json_loads from leap.soledad.client import Soledad from leap.soledad.common.crypto import ENC_SCHEME_KEY, ENC_JSON_KEY @@ -321,7 +319,8 @@ class LeapIncomingMail(object): """ log.msg('processing decrypted doc') doc, data = msgtuple - msg = json.loads(data) + msg = json_loads(data) + if not isinstance(msg, dict): defer.returnValue(False) if not msg.get(self.INCOMING_KEY, False): @@ -338,16 +337,15 @@ class LeapIncomingMail(object): Tries to decrypt a gpg message if data looks like one. :param data: the text to be decrypted. - :type data: unicode + :type data: str :return: data, possibly descrypted. :rtype: str """ + leap_assert_type(data, str) log.msg('maybe decrypting doc') - leap_assert_type(data, unicode) # parse the original message encoding = get_email_charset(data) - data = data.encode(encoding) msg = self._parser.parsestr(data) # try to obtain sender public key @@ -420,13 +418,6 @@ class LeapIncomingMail(object): # Bailing out! return (msg, False) - # decrypted successully, now fix encoding and parse - try: - decrdata = decrdata.encode(encoding) - except (UnicodeEncodeError, UnicodeDecodeError) as e: - logger.error("Unicode error {0}".format(e)) - decrdata = decrdata.encode(encoding, 'replace') - decrmsg = self._parser.parsestr(decrdata) # remove original message's multipart/encrypted content-type del(msg['content-type']) diff --git a/src/leap/mail/imap/messages.py b/src/leap/mail/imap/messages.py index 22de356..28bd272 100644 --- a/src/leap/mail/imap/messages.py +++ b/src/leap/mail/imap/messages.py @@ -494,8 +494,8 @@ class LeapMessage(fields, MailParser, MBoxParser): if not charset: charset = self._get_charset(body) try: - body = body.decode(charset).encode(charset) - except (UnicodeEncodeError, UnicodeDecodeError) as e: + body = body.encode(charset) + except UnicodeError as e: logger.error("Unicode error {0}".format(e)) body = body.encode(charset, 'replace') -- cgit v1.2.3 From 98fa323ef8220a6ca330972e45ee56e811c03f69 Mon Sep 17 00:00:00 2001 From: Ivan Alejandro Date: Fri, 17 Jan 2014 15:08:49 -0300 Subject: Update VERSION_COMPAT, add changes file for #4838. --- changes/VERSION_COMPAT | 3 ++- changes/handle-unicode-characters | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 changes/handle-unicode-characters diff --git a/changes/VERSION_COMPAT b/changes/VERSION_COMPAT index 1d5643f..03caa3e 100644 --- a/changes/VERSION_COMPAT +++ b/changes/VERSION_COMPAT @@ -9,4 +9,5 @@ # BEGIN DEPENDENCY LIST ------------------------- # leap.foo.bar>=x.y.z leap.soledad.client 0.5.0 # get_count_by_index - +leap.common 0.3.7 # get_email_charset +leap.keymanager 0.3.8 # openpgp.decrypt diff --git a/changes/handle-unicode-characters b/changes/handle-unicode-characters new file mode 100644 index 0000000..052c543 --- /dev/null +++ b/changes/handle-unicode-characters @@ -0,0 +1 @@ + o Handle correctly unicode characters in emails. Closes #4838. -- cgit v1.2.3