From 048418454631066426e53078740c009e729fd8ae Mon Sep 17 00:00:00 2001 From: Folker Bernitt Date: Mon, 12 Oct 2015 10:52:14 +0200 Subject: Log problem when ignoring characters - Issue #473 - Previous fix only fixes symptoms, but we want to be aware that this problem still exists --- service/pixelated/adapter/mailstore/body_parser.py | 9 ++++++++- service/test/unit/adapter/mailstore/test_body_parser.py | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'service') diff --git a/service/pixelated/adapter/mailstore/body_parser.py b/service/pixelated/adapter/mailstore/body_parser.py index 2193b8e8..a6017833 100644 --- a/service/pixelated/adapter/mailstore/body_parser.py +++ b/service/pixelated/adapter/mailstore/body_parser.py @@ -16,6 +16,9 @@ from email.parser import Parser import re +import logging + +logger = logging.getLogger(__name__) def _parse_charset_header(content_type_and_charset_header, default_charset='us-ascii'): @@ -56,6 +59,10 @@ class BodyParser(object): text += u'\n' encoded_text = text.encode(charset) if isinstance(self._content, unicode): - return encoded_text + self._content.encode(charset, 'ignore') + try: + return encoded_text + self._content.encode(charset) + except UnicodeError, e: + logger.warn('Failed to encode content for charset %s. Ignoring invalid chars: %s' % (charset, e)) + return encoded_text + self._content.encode(charset, 'ignore') else: return encoded_text + self._content diff --git a/service/test/unit/adapter/mailstore/test_body_parser.py b/service/test/unit/adapter/mailstore/test_body_parser.py index 3c2d17fb..9d58637c 100644 --- a/service/test/unit/adapter/mailstore/test_body_parser.py +++ b/service/test/unit/adapter/mailstore/test_body_parser.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU Affero General Public License # along with Pixelated. If not, see <http://www.gnu.org/licenses/>.
import unittest +from mock import patch from pixelated.adapter.mailstore.body_parser import BodyParser @@ -46,3 +47,11 @@ class BodyParserTest(unittest.TestCase): parser = BodyParser('dGVzdCB0ZXh0\n', content_type='text/plain', content_transfer_encoding='base64') self.assertEqual('test text', parser.parsed_content()) + + @patch('pixelated.adapter.mailstore.body_parser.logger') + def test_body_parser_logs_problems_and_then_ignores_invalid_chars(self, logger_mock): + data = u'unkown char: \ufffd' + parser = BodyParser(data, content_type='text/plain; charset=iso-8859-1', content_transfer_encoding='8bit') + + self.assertEqual(u'unkown char: ', parser.parsed_content()) + logger_mock.warn.assert_called_with(u'Failed to encode content for charset iso-8859-1. Ignoring invalid chars: \'latin-1\' codec can\'t encode character u\'\\ufffd\' in position 13: ordinal not in range(256)') -- cgit v1.2.3