diff options
| author | Tomás Touceda <chiiph@leap.se> | 2014-01-17 15:55:09 -0300 | 
|---|---|---|
| committer | Tomás Touceda <chiiph@leap.se> | 2014-01-17 15:55:09 -0300 | 
| commit | bdbb8b00c9184c9942133c7968bbe4de3ff9be33 (patch) | |
| tree | 690ab7161f9f44b28ca255d3331a02e268656fac | |
| parent | 6235f30ed0fa1b26c473f3b8dbd65a05a13b0522 (diff) | |
| parent | 398bfb539215c74c4e77e9a64e7b70627caa6c53 (diff) | |
Merge remote-tracking branch 'refs/remotes/ivan/bug/handle-unicode-characters' into develop

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mail/changes/VERSION_COMPAT | 3 |
| -rw-r--r-- | mail/changes/handle-unicode-characters | 1 |
| -rw-r--r-- | mail/src/leap/mail/imap/fetch.py | 19 |
| -rw-r--r-- | mail/src/leap/mail/imap/messages.py | 4 |
| -rw-r--r-- | mail/src/leap/mail/utils.py | 101 |

5 files changed, 110 insertions, 18 deletions
| diff --git a/mail/changes/VERSION_COMPAT b/mail/changes/VERSION_COMPAT index 1d5643f9..03caa3eb 100644 --- a/mail/changes/VERSION_COMPAT +++ b/mail/changes/VERSION_COMPAT @@ -9,4 +9,5 @@  # BEGIN DEPENDENCY LIST -------------------------  # leap.foo.bar>=x.y.z  leap.soledad.client 0.5.0 # get_count_by_index - +leap.common 0.3.7 # get_email_charset +leap.keymanager 0.3.8 # openpgp.decrypt diff --git a/mail/changes/handle-unicode-characters b/mail/changes/handle-unicode-characters new file mode 100644 index 00000000..052c5438 --- /dev/null +++ b/mail/changes/handle-unicode-characters @@ -0,0 +1 @@ +  o Handle correctly unicode characters in emails. Closes #4838. diff --git a/mail/src/leap/mail/imap/fetch.py b/mail/src/leap/mail/imap/fetch.py index 604a2ea1..817ad6a2 100644 --- a/mail/src/leap/mail/imap/fetch.py +++ b/mail/src/leap/mail/imap/fetch.py @@ -18,9 +18,7 @@  Incoming mail fetcher.  """  import copy -import json  import logging -#import ssl  import threading  import time  import sys @@ -34,7 +32,6 @@ from StringIO import StringIO  from twisted.python import log  from twisted.internet import defer  from twisted.internet.task import LoopingCall -#from twisted.internet.threads import deferToThread  from zope.proxy import sameProxiedObjects  from leap.common import events as leap_events @@ -49,6 +46,7 @@ from leap.common.mail import get_email_charset  from leap.keymanager import errors as keymanager_errors  from leap.keymanager.openpgp import OpenPGPKey  from leap.mail.decorators import deferred +from leap.mail.utils import json_loads  from leap.soledad.client import Soledad  from leap.soledad.common.crypto import ENC_SCHEME_KEY, ENC_JSON_KEY @@ -321,7 +319,8 @@ class LeapIncomingMail(object):          """          log.msg('processing decrypted doc')          doc, data = msgtuple -        msg = json.loads(data) +        msg = json_loads(data) +          if not isinstance(msg, dict):              defer.returnValue(False)          if not 
msg.get(self.INCOMING_KEY, False): @@ -338,16 +337,15 @@ class LeapIncomingMail(object):          Tries to decrypt a gpg message if data looks like one.          :param data: the text to be decrypted. -        :type data: unicode +        :type data: str          :return: data, possibly descrypted.          :rtype: str          """ +        leap_assert_type(data, str)          log.msg('maybe decrypting doc') -        leap_assert_type(data, unicode)          # parse the original message          encoding = get_email_charset(data) -        data = data.encode(encoding)          msg = self._parser.parsestr(data)          # try to obtain sender public key @@ -420,13 +418,6 @@ class LeapIncomingMail(object):              # Bailing out!              return (msg, False) -        # decrypted successully, now fix encoding and parse -        try: -            decrdata = decrdata.encode(encoding) -        except (UnicodeEncodeError, UnicodeDecodeError) as e: -            logger.error("Unicode error {0}".format(e)) -            decrdata = decrdata.encode(encoding, 'replace') -          decrmsg = self._parser.parsestr(decrdata)          # remove original message's multipart/encrypted content-type          del(msg['content-type']) diff --git a/mail/src/leap/mail/imap/messages.py b/mail/src/leap/mail/imap/messages.py index 7a210099..d2c09504 100644 --- a/mail/src/leap/mail/imap/messages.py +++ b/mail/src/leap/mail/imap/messages.py @@ -532,8 +532,8 @@ class LeapMessage(fields, MailParser, MBoxParser):              if not charset:                  charset = self._get_charset(body)              try: -                body = body.decode(charset).encode(charset) -            except (UnicodeEncodeError, UnicodeDecodeError) as e: +                body = body.encode(charset) +            except UnicodeError as e:                  logger.error("Unicode error {0}".format(e))                  body = body.encode(charset, 'replace') diff --git a/mail/src/leap/mail/utils.py 
b/mail/src/leap/mail/utils.py index 2480efc8..93388d31 100644 --- a/mail/src/leap/mail/utils.py +++ b/mail/src/leap/mail/utils.py @@ -15,8 +15,10 @@  # You should have received a copy of the GNU General Public License  # along with this program.  If not, see <http://www.gnu.org/licenses/>.  """ -Small utilities. +Mail utilities.  """ +import json +import traceback  def first(things): @@ -27,3 +29,100 @@ def first(things):          return things[0]      except (IndexError, TypeError):          return None + + +class CustomJsonScanner(object): +    """ +    This class is a context manager definition used to monkey patch the default +    json string parsing behavior. +    The emails can have more than one encoding, so the `str` objects have more +    than one encoding and json does not support direct work with `str` +    (only `unicode`). +    """ + +    def _parse_string_str(self, s, idx, *args, **kwargs): +        """ +        Parses the string "s" starting at the point idx and returns an `str` +        object. Which basically means it works exactly the same as the regular +        JSON string parsing, except that it doesn't try to decode utf8. +        We need this because mail raw strings might have bytes in multiple +        encodings. + +        :param s: the string we want to parse +        :type s: str +        :param idx: the starting point for parsing +        :type idx: int + +        :returns: the parsed string and the index where the +                  string ends. +        :rtype: tuple (str, int) +        """ +        # NOTE: we just want to use this monkey patched version if we are +        # calling the loads from our custom method. Otherwise, we use the +        # json's default parser. 
+        monkey_patched = False +        for i in traceback.extract_stack(): +            # look for json_loads method in the call stack +            if i[2] == json_loads.__name__: +                monkey_patched = True +                break + +        if not monkey_patched: +            return self._orig_scanstring(s, idx, *args, **kwargs) + +        found = False +        end = s.find("\"", idx) +        while not found: +            try: +                if s[end-1] != "\\": +                    found = True +                else: +                    end = s.find("\"", end+1) +            except Exception: +                found = True +        return s[idx:end].decode("string-escape"), end+1 + +    def __enter__(self): +        """ +        Replace the json methods with the needed ones. +        Also make a backup to restore them later. +        """ +        # backup original values +        self._orig_make_scanner = json.scanner.make_scanner +        self._orig_scanstring = json.decoder.scanstring + +        # We need the make_scanner function to be the python one so we can +        # monkey_patch the json string parsing +        json.scanner.make_scanner = json.scanner.py_make_scanner + +        # And now we monkey patch the money method +        json.decoder.scanstring = self._parse_string_str + +    def __exit__(self, exc_type, exc_value, traceback): +        """ +        Restores the backuped methods. +        """ +        # restore original values +        json.scanner.make_scanner = self._orig_make_scanner +        json.decoder.scanstring = self._orig_scanstring + + +def json_loads(data): +    """ +    It works as json.loads but supporting multiple encodings in the same +    string and accepting an `str` parameter that won't be converted to unicode. 
+ +    :param data: the string to load the objects from +    :type data: str + +    :returns: the corresponding python object result of parsing 'data', this +              behaves similarly as json.loads, with the exception of that +              returns always `str` instead of `unicode`. +    """ +    obj = None +    with CustomJsonScanner(): +        # We need to use the cls parameter in order to trigger the code +        # that will let us control the string parsing method. +        obj = json.loads(data, cls=json.JSONDecoder) + +    return obj | 
