diff options
author | drebs <drebs@leap.se> | 2016-07-22 19:53:21 +0200 |
---|---|---|
committer | drebs <drebs@leap.se> | 2016-07-25 09:19:53 -0300 |
commit | 239273e765ffc2cf3f74dc2bdd6b7ac8a8acdccd (patch) | |
tree | 1b74f80737d71181b1017ff638f0a1fc5a4d7cd9 /common/src | |
parent | fda2acf0c8aaf123359470ced37f56e8223a3286 (diff) |
[feat] do not use couch views for sync metadata
When compared to plain couch document get, the use of the simplest view
functions takes around double the time, while the use of the simplest
list function can take more than 8 times:
get 100 docs:
total: 0.440337 secs
mean: 0.004403
query 100 views:
total: 0.911425 secs
mean: 0.009114
query 100 lists:
total: 3.711537 secs
mean: 0.037115
Besides that, the current implementation of sync metadata storage over
couch is dependent of timestamps of document puts, what can lead to
metadata corruption if the clock of the system is changed for any
reason.
Because of these reasons, we seek to change the implementation of
database metadata. This commit implements the storage of transaction log
data on couch documents with special ids, in the form "gen-xxxxxxxxxx",
where the x's are replaced by the generation index.
Each generation document holds a dictionary containing the generation,
doc_id and transaction_id for the changed document. For each modified
document, a generation document is inserted holding the transaction
metadata.
Diffstat (limited to 'common/src')
-rw-r--r-- | common/src/leap/soledad/common/couch/__init__.py | 121 |
1 files changed, 65 insertions, 56 deletions
diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index 21ffd036..6cad2b19 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -103,6 +103,10 @@ def couch_server(url): THREAD_POOL = ThreadPool(20) +def _get_gen_doc_id(gen): + return 'gen-%s' % str(gen).zfill(10) + + class CouchDatabase(object): """ Holds CouchDB related code. @@ -213,7 +217,7 @@ class CouchDatabase(object): Ensure that the design documents used by the backend exist on the couch database. """ - for ddoc_name in ['docs', 'syncs', 'transactions']: + for ddoc_name in ['docs', 'syncs']: try: self.json_from_resource(['_design'] + ddoc_name.split('/') + ['_info'], @@ -437,8 +441,6 @@ class CouchDatabase(object): result['_attachments']['u1db_conflicts']['data'])))) # store couch revision doc.couch_rev = result['_rev'] - # store transactions - doc.transactions = result['u1db_transactions'] return doc def _build_conflicts(self, doc_id, attached_conflicts): @@ -474,14 +476,11 @@ class CouchDatabase(object): """ if generation == 0: return '' - # query a couch list function - ddoc_path = [ - '_design', 'transactions', '_list', 'trans_id_for_gen', 'log' - ] - response = self.json_from_resource(ddoc_path, gen=generation) - if response == {}: + log = self._get_transaction_log(start=generation, end=generation) + if not log: raise InvalidGeneration - return response['transaction_id'] + _, _, trans_id = log[0] + return trans_id def get_replica_gen_and_trans_id(self, other_replica_uid): """ @@ -512,8 +511,8 @@ class CouchDatabase(object): 'transaction_id': '', } self._database.save(doc) - result = doc['generation'], doc['transaction_id'] - return result + gen, trans_id = doc['generation'], doc['transaction_id'] + return gen, trans_id def get_doc_conflicts(self, doc_id, couch_rev=None): """ @@ -581,12 +580,32 @@ class CouchDatabase(object): :return: The complete transaction log. :rtype: [(str, str)] """ - # query a couch view - ddoc_path = ['_design', 'transactions', '_view', 'log'] - response = self.json_from_resource(ddoc_path) - return map( - lambda row: (row['id'], row['value']), - response['rows']) + log = self._get_transaction_log() + return map(lambda i: (i[1], i[2]), log) + + def _get_gen_docs( + self, start=0, end=9999999999, descending=None, limit=None): + params = {} + if descending: + params['descending'] = 'true' + # honor couch way of traversing the view tree in reverse order + start, end = end, start + params['startkey'] = _get_gen_doc_id(start) + params['endkey'] = _get_gen_doc_id(end) + params['include_docs'] = 'true' + if limit: + params['limit'] = limit + view = self._database.view("_all_docs", **params) + return view.rows + + def _get_transaction_log(self, start=0, end=9999999999): + # get current gen and trans_id + rows = self._get_gen_docs(start=start, end=end) + log = [] + for row in rows: + doc = row['doc'] + log.append((doc['gen'], doc['doc_id'], doc['trans_id'])) + return log def whats_changed(self, old_generation=0): """ @@ -605,32 +624,18 @@ class CouchDatabase(object): changes first) :rtype: (int, str, [(str, int, str)]) """ - # query a couch list function - ddoc_path = [ - '_design', 'transactions', '_list', 'whats_changed', 'log' - ] - response = self.json_from_resource(ddoc_path, old_gen=old_generation) - results = map( - lambda row: - (row['generation'], row['doc_id'], row['transaction_id']), - response['transactions']) - results.reverse() - cur_gen = old_generation - seen = set() changes = [] - newest_trans_id = '' - for generation, doc_id, trans_id in results: + cur_generation, last_trans_id = self.get_generation_info() + relevant_tail = self._get_transaction_log(start=old_generation + 1) + seen = set() + generation = cur_generation + for _, doc_id, trans_id in reversed(relevant_tail): if doc_id not in seen: changes.append((doc_id, generation, trans_id)) seen.add(doc_id) - if changes: - cur_gen = changes[0][1] # max generation - newest_trans_id = changes[0][2] - changes.reverse() - else: - cur_gen, newest_trans_id = self.get_generation_info() - - return cur_gen, newest_trans_id, changes + generation -= 1 + changes.reverse() + return (cur_generation, last_trans_id, changes) def get_generation_info(self): """ @@ -641,10 +646,11 @@ class CouchDatabase(object): """ if self.batching and self.batch_generation: return self.batch_generation - # query a couch list function - ddoc_path = ['_design', 'transactions', '_list', 'generation', 'log'] - info = self.json_from_resource(ddoc_path) - return (info['generation'], info['transaction_id']) + rows = self._get_gen_docs(descending=True, limit=1) + if not rows: + return 0, '' + gen_doc = rows.pop()['doc'] + return gen_doc['gen'], gen_doc['trans_id'] def json_from_resource(self, ddoc_path, check_missing_ddoc=True, **kwargs): @@ -740,21 +746,25 @@ class CouchDatabase(object): 'length': len(conflicts), } parts.append(conflicts) - # store old transactions, if any - transactions = old_doc.transactions[:] if old_doc is not None else [] - # create a new transaction id and timestamp it so the transaction log - # is consistent when querying the database. - transactions.append( - # here we store milliseconds to keep consistent with javascript - # Date.prototype.getTime() which was used before inside a couchdb - # update handler. - (int(time.time() * 1000), - transaction_id)) + # add the gen document + while True: # TODO: add a lock, remove this while + try: + gen, _ = self.get_generation_info() + new_gen = gen + 1 + gen_doc = { + '_id': _get_gen_doc_id(new_gen), + 'gen': new_gen, + 'doc_id': doc.doc_id, + 'trans_id': transaction_id, + } + self._database.save(gen_doc) + break + except ResourceConflict: + pass # build the couch document couch_doc = { '_id': doc.doc_id, 'u1db_rev': doc.rev, - 'u1db_transactions': transactions, '_attachments': attachments, } # if we are updating a doc we have to add the couch doc revision @@ -786,7 +796,6 @@ class CouchDatabase(object): self.batch_docs[doc.doc_id] = couch_doc last_gen, last_trans_id = self.batch_generation self.batch_generation = (last_gen + 1, transaction_id) - return transactions[-1][1] def _new_resource(self, *path): """ |