summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Kali Kaneko <kali@leap.se> 2014-01-13 14:51:13 -0400
committer: Kali Kaneko <kali@leap.se> 2014-01-13 15:57:46 -0400
commit: 42b2a7d5dda807b48d6d08acd4de427979500f12 (patch)
tree: b0fdfab23a13a3351aad90b4e0f8c75737e18de3
parent: b9636aee88801640ba95fd3cc16cb571c68877f9 (diff)
Restore the encoding of the messages. Fixes: #4956
We are still getting wrong output with Unicode characters, but this at least avoids breaking the fetch command.
-rw-r--r-- mail/src/leap/mail/imap/messages.py 47
1 file changed, 21 insertions, 26 deletions
diff --git a/mail/src/leap/mail/imap/messages.py b/mail/src/leap/mail/imap/messages.py
index a3fcd872..7b49c807 100644
--- a/mail/src/leap/mail/imap/messages.py
+++ b/mail/src/leap/mail/imap/messages.py
@@ -19,6 +19,7 @@ LeapMessage and MessageCollection.
"""
import copy
import logging
+import re
import StringIO
from collections import defaultdict, namedtuple
@@ -63,6 +64,10 @@ def lowerdict(_dict):
for key, value in _dict.items())
+CHARSET_PATTERN = r"""charset=([\w-]+)"""
+CHARSET_RE = re.compile(CHARSET_PATTERN, re.IGNORECASE)
+
+
class MessagePart(object):
"""
IMessagePart implementor.
@@ -140,18 +145,9 @@ class MessagePart(object):
payload = str("")
if payload:
- #headers = self.getHeaders(True)
- #headers = lowerdict(headers)
- #content_type = headers.get('content-type', "")
content_type = self._get_ctype_from_document(phash)
- charset_split = content_type.split('charset=')
- # XXX fuck all this, use a regex!
- if len(charset_split) > 1:
- charset = charset_split[1]
- if charset:
- charset = charset.strip()
- else:
- charset = None
+ charset = first(CHARSET_RE.findall(content_type))
+ logger.debug("Got charset from header: %s" % (charset,))
if not charset:
charset = self._get_charset(payload)
try:
@@ -483,28 +479,27 @@ class LeapMessage(fields, MailParser, MBoxParser):
:return: file-like object opened for reading
:rtype: StringIO
"""
+ # TODO refactor with getBodyFile in MessagePart
fd = StringIO.StringIO()
bdoc = self._bdoc
if bdoc:
- body = str(self._bdoc.content.get(self.RAW_KEY, ""))
+ body = self._bdoc.content.get(self.RAW_KEY, "")
+ content_type = bdoc.content.get('content-type', "")
+ charset = first(CHARSET_RE.findall(content_type))
+ logger.debug("Got charset from header: %s" % (charset,))
+ if not charset:
+ charset = self._get_charset(body)
+ try:
+ body = body.decode(charset).encode(charset)
+ except (UnicodeEncodeError, UnicodeDecodeError) as e:
+ logger.error("Unicode error {0}".format(e))
+ body = body.encode(charset, 'replace')
+
+ # We are still returning funky characters from here.
else:
logger.warning("No BDOC found for message.")
body = str("")
- # XXX not needed, isn't it? ---- ivan?
- #if bdoc:
- #content_type = bdoc.content.get('content-type', "")
- #charset = content_type.split('charset=')[1]
- #if charset:
- #charset = charset.strip()
- #if not charset:
- #charset = self._get_charset(body)
- #try:
- #body = str(body.encode(charset))
- #except (UnicodeEncodeError, UnicodeDecodeError) as e:
- #logger.error("Unicode error {0}".format(e))
- #body = str(body.encode(charset, 'replace'))
-
fd.write(body)
fd.seek(0)
return fd