Restore the encoding of the messages. Fixes: #4956

We are still getting wrong output with Unicode characters, but this at least avoids breaking the fetch command.
author: Kali Kaneko <kali@leap.se> 2014-01-13 14:51:13 -0400
committer: Kali Kaneko <kali@leap.se> 2014-01-13 15:57:46 -0400
commit: 5adc6b66839b15c23980355774d8d24aba4918bd (patch)
tree: e9b1149e92257799fa3285e1af401b9b0a612f78
parent: 7a66627816733c207e8a8c16b769870079b23772 (diff)
1 files changed, 21 insertions, 26 deletions
diff --git a/src/leap/mail/imap/messages.py b/src/leap/mail/imap/messages.py
index a3fcd87..7b49c80 100644
--- a/src/leap/mail/imap/messages.py
+++ b/src/leap/mail/imap/messages.py
@@ -19,6 +19,7 @@ LeapMessage and MessageCollection.
 """
 import copy
 import logging
+import re
 import StringIO
 
 from collections import defaultdict, namedtuple
@@ -63,6 +64,10 @@ def lowerdict(_dict):
                 for key, value in _dict.items())
 
 
+CHARSET_PATTERN = r"""charset=([\w-]+)"""
+CHARSET_RE = re.compile(CHARSET_PATTERN, re.IGNORECASE)
+
+
 class MessagePart(object):
     """
     IMessagePart implementor.
@@ -140,18 +145,9 @@ class MessagePart(object):
             payload = str("")
 
         if payload:
-            #headers = self.getHeaders(True)
-            #headers = lowerdict(headers)
-            #content_type = headers.get('content-type', "")
             content_type = self._get_ctype_from_document(phash)
-            charset_split = content_type.split('charset=')
-            # XXX fuck all this, use a regex!
-            if len(charset_split) > 1:
-                charset = charset_split[1]
-                if charset:
-                    charset = charset.strip()
-            else:
-                charset = None
+            charset = first(CHARSET_RE.findall(content_type))
+            logger.debug("Got charset from header: %s" % (charset,))
             if not charset:
                 charset = self._get_charset(payload)
             try:
@@ -483,28 +479,27 @@ class LeapMessage(fields, MailParser, MBoxParser):
         :return: file-like object opened for reading
         :rtype: StringIO
         """
+        # TODO refactor with getBodyFile in MessagePart
         fd = StringIO.StringIO()
         bdoc = self._bdoc
         if bdoc:
-            body = str(self._bdoc.content.get(self.RAW_KEY, ""))
+            body = self._bdoc.content.get(self.RAW_KEY, "")
+            content_type = bdoc.content.get('content-type', "")
+            charset = first(CHARSET_RE.findall(content_type))
+            logger.debug("Got charset from header: %s" % (charset,))
+            if not charset:
+                charset = self._get_charset(body)
+            try:
+                body = body.decode(charset).encode(charset)
+            except (UnicodeEncodeError, UnicodeDecodeError) as e:
+                logger.error("Unicode error {0}".format(e))
+                body = body.encode(charset, 'replace')
+
+        # We are still returning funky characters from here.
         else:
             logger.warning("No BDOC found for message.")
             body = str("")
 
-        # XXX not needed, isn't it? ---- ivan?
-        #if bdoc:
-            #content_type = bdoc.content.get('content-type', "")
-            #charset = content_type.split('charset=')[1]
-            #if charset:
-                #charset = charset.strip()
-            #if not charset:
-                #charset = self._get_charset(body)
-            #try:
-                #body = str(body.encode(charset))
-            #except (UnicodeEncodeError, UnicodeDecodeError) as e:
-                #logger.error("Unicode error {0}".format(e))
-                #body = str(body.encode(charset, 'replace'))
-
         fd.write(body)
         fd.seek(0)
         return fd
author	Kali Kaneko <kali@leap.se>	2014-01-13 14:51:13 -0400
committer	Kali Kaneko <kali@leap.se>	2014-01-13 15:57:46 -0400
commit	5adc6b66839b15c23980355774d8d24aba4918bd (patch)
tree	e9b1149e92257799fa3285e1af401b9b0a612f78
parent	7a66627816733c207e8a8c16b769870079b23772 (diff)