2 files changed, 12 insertions, 2 deletions
diff --git a/client/changes/bug_cutoff-chardet-guessing b/client/changes/bug_cutoff-chardet-guessing
new file mode 100644
index 00000000..9535a413
--- /dev/null
+++ b/client/changes/bug_cutoff-chardet-guessing
@@ -0,0 +1 @@
+- Fall back to utf-8 if chardet's confidence in its encoding guess is too low.
diff --git a/client/src/leap/soledad/client/api.py b/client/src/leap/soledad/client/api.py
index 88bb4969..b8409cbe 100644
--- a/client/src/leap/soledad/client/api.py
+++ b/client/src/leap/soledad/client/api.py
@@ -416,6 +416,10 @@ class Soledad(object):
         :return: A deferred whose callback will be invoked with a document.
         :rtype: twisted.internet.defer.Deferred
         """
+        # TODO we probably should pass an optional "encoding" parameter to
+        # create_doc (and probably to put_doc too). There are cases (mail
+        # payloads for example) in which we already have the encoding in the
+        # headers, so we don't need to guess it.
         return self._defer(
             "create_doc", _convert_to_unicode(content), doc_id=doc_id)
 
@@ -803,12 +807,17 @@ def _convert_to_unicode(content):
 
     :rtype: object
     """
+    # Chardet's guesses are unreliable for some small payloads, so a guess is
+    # only used above this confidence. The value may need empirical tweaking.
+    CUTOFF_CONFIDENCE = 0.90
+
     if isinstance(content, unicode):
         return content
     elif isinstance(content, str):
+        encoding = "utf-8"
         result = chardet.detect(content)
-        default = "utf-8"
-        encoding = result["encoding"] or default
+        if result["confidence"] > CUTOFF_CONFIDENCE:
+            encoding = result["encoding"]
         try:
             content = content.decode(encoding)
         except UnicodeError as e: