diff options
| -rw-r--r-- | client/changes/bug_cutoff-chardet-guessing | 1 | ||||
| -rw-r--r-- | client/src/leap/soledad/client/api.py | 13 | 
2 files changed, 12 insertions, 2 deletions
diff --git a/client/changes/bug_cutoff-chardet-guessing b/client/changes/bug_cutoff-chardet-guessing new file mode 100644 index 00000000..9535a413 --- /dev/null +++ b/client/changes/bug_cutoff-chardet-guessing @@ -0,0 +1 @@ +- Fall back to utf-8 if confidence on chardet guessing is too low. diff --git a/client/src/leap/soledad/client/api.py b/client/src/leap/soledad/client/api.py index 88bb4969..b8409cbe 100644 --- a/client/src/leap/soledad/client/api.py +++ b/client/src/leap/soledad/client/api.py @@ -416,6 +416,10 @@ class Soledad(object):          :return: A deferred whose callback will be invoked with a document.          :rtype: twisted.internet.defer.Deferred          """ +        # TODO we probably should pass an optional "encoding" parameter to +        # create_doc (and probably to put_doc too). There are cases (mail +        # payloads for example) in which we already have the encoding in the +        # headers, so we don't need to guess it.          return self._defer(              "create_doc", _convert_to_unicode(content), doc_id=doc_id) @@ -803,12 +807,17 @@ def _convert_to_unicode(content):      :rtype: object      """ +    # Chardet doesn't guess very well with some smallish payloads. +    # This parameter might need some empirical tweaking. +    CUTOFF_CONFIDENCE = 0.90 +      if isinstance(content, unicode):          return content      elif isinstance(content, str): +        encoding = "utf-8"          result = chardet.detect(content) -        default = "utf-8" -        encoding = result["encoding"] or default +        if result["confidence"] > CUTOFF_CONFIDENCE: +            encoding = result["encoding"]          try:              content = content.decode(encoding)          except UnicodeError as e:  | 
