summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tomás Touceda <chiiph@leap.se> 2014-01-17 15:55:09 -0300
committer Tomás Touceda <chiiph@leap.se> 2014-01-17 15:55:09 -0300
commit 7e79576ce81ebfb055947e987355a43c8b050f0f (patch)
tree 52db8ce08c26d079823477fb7875a78a883a67cd
parent 6dee4ec9790f25335d18ec3b13b7df45d2a8c98f (diff)
parent 98fa323ef8220a6ca330972e45ee56e811c03f69 (diff)
Merge remote-tracking branch 'refs/remotes/ivan/bug/handle-unicode-characters' into develop
-rw-r--r-- changes/VERSION_COMPAT | 3
-rw-r--r-- changes/handle-unicode-characters | 1
-rw-r--r-- src/leap/mail/imap/fetch.py | 19
-rw-r--r-- src/leap/mail/imap/messages.py | 4
-rw-r--r-- src/leap/mail/utils.py | 101
5 files changed, 110 insertions, 18 deletions
diff --git a/changes/VERSION_COMPAT b/changes/VERSION_COMPAT
index 1d5643f..03caa3e 100644
--- a/changes/VERSION_COMPAT
+++ b/changes/VERSION_COMPAT
@@ -9,4 +9,5 @@
# BEGIN DEPENDENCY LIST -------------------------
# leap.foo.bar>=x.y.z
leap.soledad.client 0.5.0 # get_count_by_index
-
+leap.common 0.3.7 # get_email_charset
+leap.keymanager 0.3.8 # openpgp.decrypt
diff --git a/changes/handle-unicode-characters b/changes/handle-unicode-characters
new file mode 100644
index 0000000..052c543
--- /dev/null
+++ b/changes/handle-unicode-characters
@@ -0,0 +1 @@
+ o Handle correctly unicode characters in emails. Closes #4838.
diff --git a/src/leap/mail/imap/fetch.py b/src/leap/mail/imap/fetch.py
index 604a2ea..817ad6a 100644
--- a/src/leap/mail/imap/fetch.py
+++ b/src/leap/mail/imap/fetch.py
@@ -18,9 +18,7 @@
Incoming mail fetcher.
"""
import copy
-import json
import logging
-#import ssl
import threading
import time
import sys
@@ -34,7 +32,6 @@ from StringIO import StringIO
from twisted.python import log
from twisted.internet import defer
from twisted.internet.task import LoopingCall
-#from twisted.internet.threads import deferToThread
from zope.proxy import sameProxiedObjects
from leap.common import events as leap_events
@@ -49,6 +46,7 @@ from leap.common.mail import get_email_charset
from leap.keymanager import errors as keymanager_errors
from leap.keymanager.openpgp import OpenPGPKey
from leap.mail.decorators import deferred
+from leap.mail.utils import json_loads
from leap.soledad.client import Soledad
from leap.soledad.common.crypto import ENC_SCHEME_KEY, ENC_JSON_KEY
@@ -321,7 +319,8 @@ class LeapIncomingMail(object):
"""
log.msg('processing decrypted doc')
doc, data = msgtuple
- msg = json.loads(data)
+ msg = json_loads(data)
+
if not isinstance(msg, dict):
defer.returnValue(False)
if not msg.get(self.INCOMING_KEY, False):
@@ -338,16 +337,15 @@ class LeapIncomingMail(object):
Tries to decrypt a gpg message if data looks like one.
:param data: the text to be decrypted.
- :type data: unicode
+ :type data: str
:return: data, possibly descrypted.
:rtype: str
"""
+ leap_assert_type(data, str)
log.msg('maybe decrypting doc')
- leap_assert_type(data, unicode)
# parse the original message
encoding = get_email_charset(data)
- data = data.encode(encoding)
msg = self._parser.parsestr(data)
# try to obtain sender public key
@@ -420,13 +418,6 @@ class LeapIncomingMail(object):
# Bailing out!
return (msg, False)
- # decrypted successully, now fix encoding and parse
- try:
- decrdata = decrdata.encode(encoding)
- except (UnicodeEncodeError, UnicodeDecodeError) as e:
- logger.error("Unicode error {0}".format(e))
- decrdata = decrdata.encode(encoding, 'replace')
-
decrmsg = self._parser.parsestr(decrdata)
# remove original message's multipart/encrypted content-type
del(msg['content-type'])
diff --git a/src/leap/mail/imap/messages.py b/src/leap/mail/imap/messages.py
index 7a21009..d2c0950 100644
--- a/src/leap/mail/imap/messages.py
+++ b/src/leap/mail/imap/messages.py
@@ -532,8 +532,8 @@ class LeapMessage(fields, MailParser, MBoxParser):
if not charset:
charset = self._get_charset(body)
try:
- body = body.decode(charset).encode(charset)
- except (UnicodeEncodeError, UnicodeDecodeError) as e:
+ body = body.encode(charset)
+ except UnicodeError as e:
logger.error("Unicode error {0}".format(e))
body = body.encode(charset, 'replace')
diff --git a/src/leap/mail/utils.py b/src/leap/mail/utils.py
index 2480efc..93388d3 100644
--- a/src/leap/mail/utils.py
+++ b/src/leap/mail/utils.py
@@ -15,8 +15,10 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
-Small utilities.
+Mail utilities.
"""
+import json
+import traceback
def first(things):
@@ -27,3 +29,100 @@ def first(things):
return things[0]
except (IndexError, TypeError):
return None
+
+
+class CustomJsonScanner(object):
+ """
+ This class is a context manager definition used to monkey patch the default
+ json string parsing behavior.
+ The emails can have more than one encoding, so the `str` objects have more
+ than one encoding and json does not support direct work with `str`
+ (only `unicode`).
+ """
+
+ def _parse_string_str(self, s, idx, *args, **kwargs):
+ """
+ Parses the string "s" starting at the point idx and returns an `str`
+ object. Which basically means it works exactly the same as the regular
+ JSON string parsing, except that it doesn't try to decode utf8.
+ We need this because mail raw strings might have bytes in multiple
+ encodings.
+
+ :param s: the string we want to parse
+ :type s: str
+ :param idx: the starting point for parsing
+ :type idx: int
+
+ :returns: the parsed string and the index where the
+ string ends.
+ :rtype: tuple (str, int)
+ """
+ # NOTE: we just want to use this monkey patched version if we are
+ # calling the loads from our custom method. Otherwise, we use the
+ # json's default parser.
+ monkey_patched = False
+ for i in traceback.extract_stack():
+ # look for json_loads method in the call stack
+ if i[2] == json_loads.__name__:
+ monkey_patched = True
+ break
+
+ if not monkey_patched:
+ return self._orig_scanstring(s, idx, *args, **kwargs)
+
+ found = False
+ end = s.find("\"", idx)
+ while not found:
+ try:
+ if s[end-1] != "\\":
+ found = True
+ else:
+ end = s.find("\"", end+1)
+ except Exception:
+ found = True
+ return s[idx:end].decode("string-escape"), end+1
+
+ def __enter__(self):
+ """
+ Replace the json methods with the needed ones.
+ Also make a backup to restore them later.
+ """
+ # backup original values
+ self._orig_make_scanner = json.scanner.make_scanner
+ self._orig_scanstring = json.decoder.scanstring
+
+ # We need the make_scanner function to be the python one so we can
+ # monkey_patch the json string parsing
+ json.scanner.make_scanner = json.scanner.py_make_scanner
+
+ # And now we monkey patch the money method
+ json.decoder.scanstring = self._parse_string_str
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ """
+ Restores the backuped methods.
+ """
+ # restore original values
+ json.scanner.make_scanner = self._orig_make_scanner
+ json.decoder.scanstring = self._orig_scanstring
+
+
+def json_loads(data):
+ """
+ It works as json.loads but supporting multiple encodings in the same
+ string and accepting an `str` parameter that won't be converted to unicode.
+
+ :param data: the string to load the objects from
+ :type data: str
+
+ :returns: the corresponding python object result of parsing 'data', this
+ behaves similarly as json.loads, with the exception of that
+ returns always `str` instead of `unicode`.
+ """
+ obj = None
+ with CustomJsonScanner():
+ # We need to use the cls parameter in order to trigger the code
+ # that will let us control the string parsing method.
+ obj = json.loads(data, cls=json.JSONDecoder)
+
+ return obj