From 3b237bb46743a93feed4bb6f3c839d72fc28df48 Mon Sep 17 00:00:00 2001 From: drebs Date: Sun, 31 Jul 2016 10:37:35 -0300 Subject: [feat] use couch _all_docs for get_docs() and get_all_docs() The previous solution would make use of concurrent get's to couch backend in a pool of threads to implement the get_docs() and get_all_docs() CouchDatabase backend methods. This commit replaces those by a simpler implementation use the `_all_docs` couchdb view api. It passes all needed IDs to the view and r etrieves all documents with content in the same request. A comparison between both implementations shows an improvement of at least 15 times for large number of documents. The table below shows the time for different implementations of get_all_docs() for different number of documents and threads versus _all_docs implementation: +-------+-----------------+------------------+-------------+ | | threads | _all_docs | improvement | +-------+-----------------+------------------+-------------+ | 10 | 0.0728030204773 | 0.00782012939453 | 9.3 | | 100 | 0.609349966049 | 0.0377721786499 | 16.1 | | 1000 | 5.86522197723 | 0.370730876923 | 15.8 | | 10000 | 66.1713931561 | 3.61764383316 | 18.3 | +-------+-----------------+------------------+-------------+ --- common/src/leap/soledad/common/couch/__init__.py | 59 +++++++++++++----------- 1 file changed, 33 insertions(+), 26 deletions(-) (limited to 'common/src/leap/soledad') diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index 9edbe380..d0c1a7ba 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -23,15 +23,12 @@ import json import re import uuid import binascii -import time -import functools from collections import defaultdict from StringIO import StringIO from urlparse import urljoin from contextlib import contextmanager -from multiprocessing.pool import ThreadPool from threading import Lock @@ -98,9 +95,6 @@ def couch_server(url): yield server -THREAD_POOL = ThreadPool(20) - - def _get_gen_doc_id(gen): return 'gen-%s' % str(gen).zfill(10) @@ -307,8 +301,8 @@ class CouchDatabase(object): """ generation, _ = self.get_generation_info() - results = list(self.get_docs(self._database, - include_deleted=include_deleted)) + results = list( + self._get_docs(None, True, include_deleted)) return (generation, results) def get_docs(self, doc_ids, check_for_conflicts=True, @@ -329,24 +323,37 @@ class CouchDatabase(object): in matching doc_ids order. :rtype: iterable """ - # Workaround for: - # - # http://bugs.python.org/issue7980 - # https://leap.se/code/issues/5449 - # - # python-couchdb uses time.strptime, which is not thread safe. In - # order to avoid the problem described on the issues above, we preload - # strptime here by evaluating the conversion of an arbitrary date. - # This will not be needed when/if we switch from python-couchdb to - # paisley. - time.strptime('Mar 8 1917', '%b %d %Y') - get_one = functools.partial( - self.get_doc, check_for_conflicts=check_for_conflicts) - docs = [THREAD_POOL.apply_async(get_one, [doc_id]) - for doc_id in doc_ids] - for doc in docs: - doc = doc.get() - if not doc or not include_deleted and doc.is_tombstone(): + return self._get_docs(doc_ids, check_for_conflicts, include_deleted) + + def _get_docs(self, doc_ids, check_for_conflicts, include_deleted): + """ + Use couch's `_all_docs` view to get the documents indicated in + `doc_ids`, + + :param doc_ids: A list of document identifiers or None for all. + :type doc_ids: list + :param check_for_conflicts: If set to False, then the conflict check + will be skipped, and 'None' will be + returned instead of True/False. + :type check_for_conflicts: bool + :param include_deleted: If set to True, deleted documents will be + returned with empty content. Otherwise deleted + documents will not be included in the results. + + :return: iterable giving the Document object for each document id + in matching doc_ids order. + :rtype: iterable + """ + params = {'include_docs': 'true', 'attachments': 'true'} + if doc_ids is not None: + params['keys'] = doc_ids + view = self._database.view("_all_docs", **params) + for row in view.rows: + result = row['doc'] + doc = self.__parse_doc_from_couch( + result, result['_id'], check_for_conflicts=check_for_conflicts) + # filter out non-u1db or deleted documents + if not doc or (not include_deleted and doc.is_tombstone()): continue yield doc -- cgit v1.2.3