summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kali Kaneko <kali@leap.se> 2015-02-20 16:01:57 -0400
committer Kali Kaneko <kali@leap.se> 2015-02-20 16:01:57 -0400
commit 41b34cc0d8bd6c2ae22547bc02845e68cab12c3b (patch)
tree 1d0d45ac0824da8dbaeac5fcddef30f0d7d60516
parent 58256957af8329f49d983852063eeaec74179c4d (diff)
cutoff for encoding detection
-rw-r--r-- client/changes/bug_cutoff-chardet-guessing 1
-rw-r--r-- client/src/leap/soledad/client/api.py 13
2 files changed, 12 insertions, 2 deletions
diff --git a/client/changes/bug_cutoff-chardet-guessing b/client/changes/bug_cutoff-chardet-guessing
new file mode 100644
index 00000000..9535a413
--- /dev/null
+++ b/client/changes/bug_cutoff-chardet-guessing
@@ -0,0 +1 @@
+- Fallback to utf-8 if confidence on chardet guessing is too low.
diff --git a/client/src/leap/soledad/client/api.py b/client/src/leap/soledad/client/api.py
index 88bb4969..b8409cbe 100644
--- a/client/src/leap/soledad/client/api.py
+++ b/client/src/leap/soledad/client/api.py
@@ -416,6 +416,10 @@ class Soledad(object):
:return: A deferred whose callback will be invoked with a document.
:rtype: twisted.internet.defer.Deferred
"""
+ # TODO we probably should pass an optional "encoding" parameter to
+ # create_doc (and probably to put_doc too). There are cases (mail
+ # payloads for example) in which we already have the encoding in the
+ # headers, so we don't need to guess it.
return self._defer(
"create_doc", _convert_to_unicode(content), doc_id=doc_id)
@@ -803,12 +807,17 @@ def _convert_to_unicode(content):
:rtype: object
"""
+ # Chardet doesn't guess very well with some smallish payloads.
+ # This parameter might need some empirical tweaking.
+ CUTOFF_CONFIDENCE = 0.90
+
if isinstance(content, unicode):
return content
elif isinstance(content, str):
+ encoding = "utf-8"
result = chardet.detect(content)
- default = "utf-8"
- encoding = result["encoding"] or default
+ if result["confidence"] > CUTOFF_CONFIDENCE:
+ encoding = result["encoding"]
try:
content = content.decode(encoding)
except UnicodeError as e: