From 41b34cc0d8bd6c2ae22547bc02845e68cab12c3b Mon Sep 17 00:00:00 2001 From: Kali Kaneko Date: Fri, 20 Feb 2015 16:01:57 -0400 Subject: cutoff for encoding detection --- client/changes/bug_cutoff-chardet-guessing | 1 + client/src/leap/soledad/client/api.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 client/changes/bug_cutoff-chardet-guessing diff --git a/client/changes/bug_cutoff-chardet-guessing b/client/changes/bug_cutoff-chardet-guessing new file mode 100644 index 00000000..9535a413 --- /dev/null +++ b/client/changes/bug_cutoff-chardet-guessing @@ -0,0 +1 @@ +- Fallback to utf-8 if confidence on chardet guessing is too low. diff --git a/client/src/leap/soledad/client/api.py b/client/src/leap/soledad/client/api.py index 88bb4969..b8409cbe 100644 --- a/client/src/leap/soledad/client/api.py +++ b/client/src/leap/soledad/client/api.py @@ -416,6 +416,10 @@ class Soledad(object): :return: A deferred whose callback will be invoked with a document. :rtype: twisted.internet.defer.Deferred """ + # TODO we probably should pass an optional "encoding" parameter to + # create_doc (and probably to put_doc too). There are cases (mail + # payloads for example) in which we already have the encoding in the + # headers, so we don't need to guess it. return self._defer( "create_doc", _convert_to_unicode(content), doc_id=doc_id) @@ -803,12 +807,17 @@ def _convert_to_unicode(content): :rtype: object """ + # Chardet doesn't guess very well with some smallish payloads. + # This parameter might need some empirical tweaking. + CUTOFF_CONFIDENCE = 0.90 + if isinstance(content, unicode): return content elif isinstance(content, str): + encoding = "utf-8" result = chardet.detect(content) - default = "utf-8" - encoding = result["encoding"] or default + if result["confidence"] > CUTOFF_CONFIDENCE: + encoding = result["encoding"] try: content = content.decode(encoding) except UnicodeError as e: -- cgit v1.2.3