[bug] remove doc content conversion to unicode

Theoretically (until now), Soledad inherits from U1DB the behaviour of only accepting valid JSON for document contents. JSON documents only allow for unicode strings. Despite that, until now we had implemented lossy conversion to unicode to avoid encoding errors when dumping/loading JSON content. This allowed API users to pass non-unicode strings to Soledad, but caused the application to take more time because of the conversion. There were 2 problems with this: (1) conversion may take a long time and a lot of memory when converting large payloads; and (2) conversion was being done before deferring to the adbapi, and this was blocking the reactor. This commit completely removes the conversion to unicode, thus leaving the responsibility of unicode conversion to users of the Soledad API.
author: drebs <drebs@leap.se> 2016-05-01 13:34:33 -0300
committer: Kali Kaneko <kali@leap.se> 2016-06-06 19:58:50 -0400
commit: 06f3c80e848b14d3fff1a6edd2cd58f998b976db (patch)
tree: 8b4748aa73a3f9329c6bcf17f582c921bbce7d7d
parent: 9eac24ba91ccbbd335e2cf6d8f59c518659348e6 (diff)
2 files changed, 3 insertions, 45 deletions
diff --git a/client/changes/next-changelog.rst b/client/changes/next-changelog.rst
index 6e97386c..050d84be 100644
--- a/client/changes/next-changelog.rst
+++ b/client/changes/next-changelog.rst
@@ -16,6 +16,8 @@ Features
 Bugfixes
 ~~~~~~~~
 - `#1235 <https://leap.se/code/issues/1235>`_: Description for the fixed stuff corresponding with issue #1235.
+- Remove document content conversion to unicode. Users of API are responsible
+  for only passing valid JSON to Soledad for storage.
 - Bugfix without related issue number.
 
 Misc
diff --git a/client/src/leap/soledad/client/api.py b/client/src/leap/soledad/client/api.py
index 2477350e..d83291e7 100644
--- a/client/src/leap/soledad/client/api.py
+++ b/client/src/leap/soledad/client/api.py
@@ -35,10 +35,6 @@ import ssl
 import uuid
 import urlparse
 
-try:
-    import cchardet as chardet
-except ImportError:
-    import chardet
 from itertools import chain
 
 from StringIO import StringIO
@@ -357,7 +353,6 @@ class Soledad(object):
             also be updated.
         :rtype: twisted.internet.defer.Deferred
         """
-        doc.content = _convert_to_unicode(doc.content)
         return self._defer("put_doc", doc)
 
     def delete_doc(self, doc):
@@ -452,8 +447,7 @@ class Soledad(object):
         # create_doc (and probably to put_doc too). There are cases (mail
         # payloads for example) in which we already have the encoding in the
         # headers, so we don't need to guess it.
-        return self._defer(
-            "create_doc", _convert_to_unicode(content), doc_id=doc_id)
+        return self._defer("create_doc", content, doc_id=doc_id)
 
     def create_doc_from_json(self, json, doc_id=None):
         """
@@ -974,44 +968,6 @@ class Soledad(object):
         return self.create_doc(doc)
 
 
-def _convert_to_unicode(content):
-    """
-    Convert content to unicode (or all the strings in content).
-
-    NOTE: Even though this method supports any type, it will
-    currently ignore contents of lists, tuple or any other
-    iterable than dict. We don't need support for these at the
-    moment
-
-    :param content: content to convert
-    :type content: object
-
-    :rtype: object
-    """
-    # Chardet doesn't guess very well with some smallish payloads.
-    # This parameter might need some empirical tweaking.
-    CUTOFF_CONFIDENCE = 0.90
-
-    if isinstance(content, unicode):
-        return content
-    elif isinstance(content, str):
-        encoding = "utf-8"
-        result = chardet.detect(content)
-        if result["confidence"] > CUTOFF_CONFIDENCE:
-            encoding = result["encoding"]
-        try:
-            content = content.decode(encoding)
-        except UnicodeError as e:
-            logger.error("Unicode error: {0!r}. Using 'replace'".format(e))
-            content = content.decode(encoding, 'replace')
-        return content
-    else:
-        if isinstance(content, dict):
-            for key in content.keys():
-                content[key] = _convert_to_unicode(content[key])
-    return content
-
-
 def create_path_if_not_exists(path):
     try:
         if not os.path.isdir(path):
author	drebs <drebs@leap.se>	2016-05-01 13:34:33 -0300
committer	Kali Kaneko <kali@leap.se>	2016-06-06 19:58:50 -0400
commit	06f3c80e848b14d3fff1a6edd2cd58f998b976db (patch)
tree	8b4748aa73a3f9329c6bcf17f582c921bbce7d7d
parent	9eac24ba91ccbbd335e2cf6d8f59c518659348e6 (diff)