[feat] use couch _all_docs for get_docs() and get_all_docs()

The previous solution used concurrent GETs to the couch backend, run in a pool of threads, to implement the get_docs() and get_all_docs() CouchDatabase backend methods. This commit replaces those with a simpler implementation using the `_all_docs` CouchDB view API. It passes all needed IDs to the view and retrieves all documents with content in the same request. A comparison between both implementations shows an improvement of at least 15 times for large numbers of documents. The table below shows the time taken by get_all_docs() for different numbers of documents, comparing the threads implementation against the `_all_docs` implementation:

+-------+-----------------+------------------+-------------+
|       | threads         | _all_docs        | improvement |
+-------+-----------------+------------------+-------------+
| 10    | 0.0728030204773 | 0.00782012939453 | 9.3         |
| 100   | 0.609349966049  | 0.0377721786499  | 16.1        |
| 1000  | 5.86522197723   | 0.370730876923   | 15.8        |
| 10000 | 66.1713931561   | 3.61764383316    | 18.3        |
+-------+-----------------+------------------+-------------+
author: drebs <drebs@leap.se> 2016-07-31 10:37:35 -0300
committer: drebs <drebs@leap.se> 2016-08-01 21:09:05 -0300
commit: 3b237bb46743a93feed4bb6f3c839d72fc28df48 (patch)
tree: 7f09ba9d2954bbc94c659e2af497efb187789c8f /common/src
parent: 027d0b5f40944973807e1a4fc497c496e78b3eeb (diff)
1 files changed, 33 insertions, 26 deletions
diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py
index 9edbe380..d0c1a7ba 100644
--- a/common/src/leap/soledad/common/couch/__init__.py
+++ b/common/src/leap/soledad/common/couch/__init__.py
@@ -23,15 +23,12 @@ import json
 import re
 import uuid
 import binascii
-import time
-import functools
 
 
 from collections import defaultdict
 from StringIO import StringIO
 from urlparse import urljoin
 from contextlib import contextmanager
-from multiprocessing.pool import ThreadPool
 from threading import Lock
 
 
@@ -98,9 +95,6 @@ def couch_server(url):
     yield server
 
 
-THREAD_POOL = ThreadPool(20)
-
-
 def _get_gen_doc_id(gen):
     return 'gen-%s' % str(gen).zfill(10)
 
@@ -307,8 +301,8 @@ class CouchDatabase(object):
         """
 
         generation, _ = self.get_generation_info()
-        results = list(self.get_docs(self._database,
-                                     include_deleted=include_deleted))
+        results = list(
+            self._get_docs(None, True, include_deleted))
         return (generation, results)
 
     def get_docs(self, doc_ids, check_for_conflicts=True,
@@ -329,24 +323,37 @@ class CouchDatabase(object):
                  in matching doc_ids order.
         :rtype: iterable
         """
-        # Workaround for:
-        #
-        #   http://bugs.python.org/issue7980
-        #   https://leap.se/code/issues/5449
-        #
-        # python-couchdb uses time.strptime, which is not thread safe. In
-        # order to avoid the problem described on the issues above, we preload
-        # strptime here by evaluating the conversion of an arbitrary date.
-        # This will not be needed when/if we switch from python-couchdb to
-        # paisley.
-        time.strptime('Mar 8 1917', '%b %d %Y')
-        get_one = functools.partial(
-            self.get_doc, check_for_conflicts=check_for_conflicts)
-        docs = [THREAD_POOL.apply_async(get_one, [doc_id])
-                for doc_id in doc_ids]
-        for doc in docs:
-            doc = doc.get()
-            if not doc or not include_deleted and doc.is_tombstone():
+        return self._get_docs(doc_ids, check_for_conflicts, include_deleted)
+
+    def _get_docs(self, doc_ids, check_for_conflicts, include_deleted):
+        """
+        Use couch's `_all_docs` view to get the documents indicated in
+        `doc_ids`,
+
+        :param doc_ids: A list of document identifiers or None for all.
+        :type doc_ids: list
+        :param check_for_conflicts: If set to False, then the conflict check
+                                    will be skipped, and 'None' will be
+                                    returned instead of True/False.
+        :type check_for_conflicts: bool
+        :param include_deleted: If set to True, deleted documents will be
+                                returned with empty content. Otherwise deleted
+                                documents will not be included in the results.
+
+        :return: iterable giving the Document object for each document id
+                 in matching doc_ids order.
+        :rtype: iterable
+        """
+        params = {'include_docs': 'true', 'attachments': 'true'}
+        if doc_ids is not None:
+            params['keys'] = doc_ids
+        view = self._database.view("_all_docs", **params)
+        for row in view.rows:
+            result = row['doc']
+            doc = self.__parse_doc_from_couch(
+                result, result['_id'], check_for_conflicts=check_for_conflicts)
+            # filter out non-u1db or deleted documents
+            if not doc or (not include_deleted and doc.is_tombstone()):
                 continue
             yield doc
author	drebs <drebs@leap.se>	2016-07-31 10:37:35 -0300
committer	drebs <drebs@leap.se>	2016-08-01 21:09:05 -0300
commit	3b237bb46743a93feed4bb6f3c839d72fc28df48 (patch)
tree	7f09ba9d2954bbc94c659e2af497efb187789c8f /common/src
parent	027d0b5f40944973807e1a4fc497c496e78b3eeb (diff)