author     drebs <drebs@leap.se>    2016-07-31 10:37:35 -0300
committer  drebs <drebs@leap.se>    2016-08-01 21:09:05 -0300
commit     3b237bb46743a93feed4bb6f3c839d72fc28df48 (patch)
tree       7f09ba9d2954bbc94c659e2af497efb187789c8f /common/src
parent     027d0b5f40944973807e1a4fc497c496e78b3eeb (diff)
[feat] use couch _all_docs for get_docs() and get_all_docs()
The previous solution made use of concurrent gets to the couch backend in a pool of threads to implement the get_docs() and get_all_docs() CouchDatabase backend methods. This commit replaces those with a simpler implementation that uses the `_all_docs` couchdb view API. It passes all needed ids to the view and retrieves all documents with content in the same request. A comparison between both implementations shows an improvement of at least 15 times for a large number of documents.

The table below shows the time of the threads-based and the `_all_docs`-based implementations of get_all_docs() for different numbers of documents:

+-------+-----------------+------------------+-------------+
| docs  | threads         | _all_docs        | improvement |
+-------+-----------------+------------------+-------------+
| 10    | 0.0728030204773 | 0.00782012939453 | 9.3         |
| 100   | 0.609349966049  | 0.0377721786499  | 16.1        |
| 1000  | 5.86522197723   | 0.370730876923   | 15.8        |
| 10000 | 66.1713931561   | 3.61764383316    | 18.3        |
+-------+-----------------+------------------+-------------+
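For illustration (not part of the commit), the single-request pattern described above can be exercised directly with python-couchdb, the library this backend uses. The sketch below is only a minimal example: the server URL, database name and document ids are placeholders, and error handling is omitted.

    import couchdb

    # Placeholder server URL and database name, for illustration only.
    server = couchdb.Server('http://localhost:5984/')
    db = server['example-db']

    doc_ids = ['doc-1', 'doc-2', 'doc-3']  # placeholder ids

    # A single `_all_docs` request returns the content of all requested
    # documents, instead of one GET per document id.
    rows = db.view('_all_docs', keys=doc_ids, include_docs=True)
    docs = [row.doc for row in rows if row.doc is not None]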
Diffstat (limited to 'common/src')
-rw-r--r--  common/src/leap/soledad/common/couch/__init__.py  59
1 file changed, 33 insertions, 26 deletions
diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py
index 9edbe380..d0c1a7ba 100644
--- a/common/src/leap/soledad/common/couch/__init__.py
+++ b/common/src/leap/soledad/common/couch/__init__.py
@@ -23,15 +23,12 @@ import json
 import re
 import uuid
 import binascii
-import time
-import functools
 from collections import defaultdict
 from StringIO import StringIO
 from urlparse import urljoin
 from contextlib import contextmanager
-from multiprocessing.pool import ThreadPool
 from threading import Lock
@@ -98,9 +95,6 @@ def couch_server(url):
     yield server
-THREAD_POOL = ThreadPool(20)
-
-
 def _get_gen_doc_id(gen):
     return 'gen-%s' % str(gen).zfill(10)
@@ -307,8 +301,8 @@ class CouchDatabase(object):
"""
generation, _ = self.get_generation_info()
- results = list(self.get_docs(self._database,
- include_deleted=include_deleted))
+ results = list(
+ self._get_docs(None, True, include_deleted))
return (generation, results)
def get_docs(self, doc_ids, check_for_conflicts=True,
@@ -329,24 +323,37 @@ class CouchDatabase(object):
             in matching doc_ids order.
         :rtype: iterable
         """
-        # Workaround for:
-        #
-        # http://bugs.python.org/issue7980
-        # https://leap.se/code/issues/5449
-        #
-        # python-couchdb uses time.strptime, which is not thread safe. In
-        # order to avoid the problem described on the issues above, we preload
-        # strptime here by evaluating the conversion of an arbitrary date.
-        # This will not be needed when/if we switch from python-couchdb to
-        # paisley.
-        time.strptime('Mar 8 1917', '%b %d %Y')
-        get_one = functools.partial(
-            self.get_doc, check_for_conflicts=check_for_conflicts)
-        docs = [THREAD_POOL.apply_async(get_one, [doc_id])
-                for doc_id in doc_ids]
-        for doc in docs:
-            doc = doc.get()
-            if not doc or not include_deleted and doc.is_tombstone():
+        return self._get_docs(doc_ids, check_for_conflicts, include_deleted)
+
+    def _get_docs(self, doc_ids, check_for_conflicts, include_deleted):
+        """
+        Use couch's `_all_docs` view to get the documents indicated in
+        `doc_ids`,
+
+        :param doc_ids: A list of document identifiers or None for all.
+        :type doc_ids: list
+        :param check_for_conflicts: If set to False, then the conflict check
+                                    will be skipped, and 'None' will be
+                                    returned instead of True/False.
+        :type check_for_conflicts: bool
+        :param include_deleted: If set to True, deleted documents will be
+                                returned with empty content. Otherwise deleted
+                                documents will not be included in the results.
+
+        :return: iterable giving the Document object for each document id
+            in matching doc_ids order.
+        :rtype: iterable
+        """
+        params = {'include_docs': 'true', 'attachments': 'true'}
+        if doc_ids is not None:
+            params['keys'] = doc_ids
+        view = self._database.view("_all_docs", **params)
+        for row in view.rows:
+            result = row['doc']
+            doc = self.__parse_doc_from_couch(
+                result, result['_id'], check_for_conflicts=check_for_conflicts)
+            # filter out non-u1db or deleted documents
+            if not doc or (not include_deleted and doc.is_tombstone()):
                 continue
             yield doc
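As a usage note (not part of the patch), the new method keeps the generator interface of get_docs(), so callers iterate it as before. The hypothetical snippet below assumes an already constructed CouchDatabase instance and placeholder document ids.

    def fetch_some_docs(db):
        # 'db' stands for a CouchDatabase instance; the ids are placeholders.
        wanted = ['doc-1', 'doc-2']
        for doc in db.get_docs(wanted, check_for_conflicts=True,
                               include_deleted=False):
            # Deleted documents (tombstones) are skipped because
            # include_deleted is False.
            print doc.doc_id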