From 81f97ec532a13dc57bf23a44dab3d44d12cc2ba4 Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Fri, 16 Sep 2016 04:13:26 -0300 Subject: [feature] get attachments as generator runs Instead of getting the attachments as the generator runs, get_docs will now get as needed. Also, deepcopy solves a memory issue where we were feeding the couchdb lib view with blobs while modifying it unintentionally. --- common/src/leap/soledad/common/couch/__init__.py | 32 ++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index 0f4102db..d751747d 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -20,6 +20,7 @@ import json +import copy import re import uuid import binascii @@ -337,14 +338,22 @@ class CouchDatabase(object): in matching doc_ids order. :rtype: iterable """ - params = {'include_docs': 'true', 'attachments': 'true'} + params = {'include_docs': 'true', 'attachments': 'false'} if doc_ids is not None: params['keys'] = doc_ids view = self._database.view("_all_docs", **params) for row in view.rows: - result = row['doc'] + result = copy.deepcopy(row['doc']) + attachment_file_names = result['_attachments'].keys() + result['_attachments'] = {} + for file_name in attachment_file_names: + result['_attachments'][file_name] = { + 'data': json.load( + self._database.get_attachment(result, file_name)) + } doc = self.__parse_doc_from_couch( - result, result['_id'], check_for_conflicts=check_for_conflicts) + result, result['_id'], + check_for_conflicts=check_for_conflicts, decode=False) # filter out non-u1db or deleted documents if not doc or (not include_deleted and doc.is_tombstone()): continue @@ -408,7 +417,7 @@ class CouchDatabase(object): self.batch_docs.clear() return rev - def __parse_doc_from_couch(self, result, doc_id, + def __parse_doc_from_couch(self, 
result, doc_id, decode=True, check_for_conflicts=False): # restrict to u1db documents if 'u1db_rev' not in result: @@ -418,19 +427,22 @@ class CouchDatabase(object): if '_attachments' not in result \ or 'u1db_content' not in result['_attachments']: doc.make_tombstone() - else: + elif decode: doc.content = json.loads( binascii.a2b_base64( result['_attachments']['u1db_content']['data'])) + else: + doc.content = result['_attachments']['u1db_content']['data'] # determine if there are conflicts if check_for_conflicts \ and '_attachments' in result \ and 'u1db_conflicts' in result['_attachments']: - doc.set_conflicts( - self._build_conflicts( - doc.doc_id, - json.loads(binascii.a2b_base64( - result['_attachments']['u1db_conflicts']['data'])))) + if decode: + conflicts = json.loads(binascii.a2b_base64( + result['_attachments']['u1db_conflicts']['data'])) + else: + conflicts = result['_attachments']['u1db_conflicts']['data'] + doc.set_conflicts(self._build_conflicts(doc.doc_id, conflicts)) # store couch revision doc.couch_rev = result['_rev'] return doc -- cgit v1.2.3 From ffe15f154541b6f929c569caf07560d117ad5efa Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Thu, 11 Aug 2016 11:34:47 -0300 Subject: [feature] use transactions on sync We were using 1 transaction per doc, which is bad. Reference: http://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite Code now uses 1 transaction for the whole sync. 
--- .../src/leap/soledad/common/l2db/backends/sqlite_backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py b/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py index d73c0d16..295f3132 100644 --- a/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py +++ b/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py @@ -505,12 +505,11 @@ class SQLiteDatabase(CommonBackend): def _put_doc_if_newer(self, doc, save_conflict, replica_uid=None, replica_gen=None, replica_trans_id=None): - with self._db_handle: - return super(SQLiteDatabase, self)._put_doc_if_newer( - doc, - save_conflict=save_conflict, - replica_uid=replica_uid, replica_gen=replica_gen, - replica_trans_id=replica_trans_id) + return super(SQLiteDatabase, self)._put_doc_if_newer( + doc, + save_conflict=save_conflict, + replica_uid=replica_uid, replica_gen=replica_gen, + replica_trans_id=replica_trans_id) def _add_conflict(self, c, doc_id, my_doc_rev, my_content): c.execute("INSERT INTO conflicts VALUES (?, ?, ?)", -- cgit v1.2.3 From 5d056170357acd0945899d7f0c40f530cbe816e0 Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Fri, 16 Sep 2016 19:33:06 -0300 Subject: [feature] server download stream from file object couchdb lib returns a file object representing the attachment. This commit dumps the read() call into the wsgi write() call. Doc representation uses 2 lines also, separating metadata from content. 
--- common/src/leap/soledad/common/couch/__init__.py | 5 ++--- common/src/leap/soledad/common/l2db/remote/http_app.py | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index d751747d..1a95e590 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -348,8 +348,7 @@ class CouchDatabase(object): result['_attachments'] = {} for file_name in attachment_file_names: result['_attachments'][file_name] = { - 'data': json.load( - self._database.get_attachment(result, file_name)) + 'data': self._database.get_attachment(result, file_name) } doc = self.__parse_doc_from_couch( result, result['_id'], @@ -432,7 +431,7 @@ class CouchDatabase(object): binascii.a2b_base64( result['_attachments']['u1db_content']['data'])) else: - doc.content = result['_attachments']['u1db_content']['data'] + doc._json = result['_attachments']['u1db_content']['data'] # determine if there are conflicts if check_for_conflicts \ and '_attachments' in result \ diff --git a/common/src/leap/soledad/common/l2db/remote/http_app.py b/common/src/leap/soledad/common/l2db/remote/http_app.py index 5cf6645e..a9680890 100644 --- a/common/src/leap/soledad/common/l2db/remote/http_app.py +++ b/common/src/leap/soledad/common/l2db/remote/http_app.py @@ -501,7 +501,9 @@ class HTTPResponder(object): self._write('\r\n') else: self._write(',\r\n') - self._write(json.dumps(entry)) + if type(entry) == dict: + entry = json.dumps(entry) + self._write(entry) def end_stream(self): "end stream (array)." -- cgit v1.2.3 From b774387754ecae77d3ae00de2a9e072cef2eb2e7 Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Sat, 17 Sep 2016 04:26:08 -0300 Subject: [feature] make reading attachments optional Will put a file object on doc json string if read_content is False, otherwise it will fetch and fill as usual. 
This is useful for improving server throughput on sync download stream by receiving a bulk-get without attachments and consuming the file-objects as they come. --- common/src/leap/soledad/common/backend.py | 4 +-- common/src/leap/soledad/common/couch/__init__.py | 42 ++++++++---------------- 2 files changed, 15 insertions(+), 31 deletions(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/backend.py b/common/src/leap/soledad/common/backend.py index f4f48f86..5c995d38 100644 --- a/common/src/leap/soledad/common/backend.py +++ b/common/src/leap/soledad/common/backend.py @@ -570,7 +570,7 @@ class SoledadBackend(CommonBackend): self._put_doc(cur_doc, doc) def get_docs(self, doc_ids, check_for_conflicts=True, - include_deleted=False): + include_deleted=False, read_content=True): """ Get the JSON content for many documents. @@ -588,7 +588,7 @@ class SoledadBackend(CommonBackend): :rtype: iterable """ return self._database.get_docs(doc_ids, check_for_conflicts, - include_deleted) + include_deleted, read_content) def _prune_conflicts(self, doc, doc_vcr): """ diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index 1a95e590..f19b0acb 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -296,31 +296,14 @@ class CouchDatabase(object): generation, _ = self.get_generation_info() results = list( - self._get_docs(None, True, include_deleted)) + self.get_docs(None, True, include_deleted)) return (generation, results) def get_docs(self, doc_ids, check_for_conflicts=True, - include_deleted=False): + include_deleted=False, read_content=True): """ Get the JSON content for many documents. - :param doc_ids: A list of document identifiers or None for all. - :type doc_ids: list - :param check_for_conflicts: If set to False, then the conflict check - will be skipped, and 'None' will be - returned instead of True/False.
- :type check_for_conflicts: bool - :param include_deleted: If set to True, deleted documents will be - returned with empty content. Otherwise deleted - documents will not be included in the results. - :return: iterable giving the Document object for each document id - in matching doc_ids order. - :rtype: iterable - """ - return self._get_docs(doc_ids, check_for_conflicts, include_deleted) - - def _get_docs(self, doc_ids, check_for_conflicts, include_deleted): - """ Use couch's `_all_docs` view to get the documents indicated in `doc_ids`, @@ -344,12 +327,12 @@ class CouchDatabase(object): view = self._database.view("_all_docs", **params) for row in view.rows: result = copy.deepcopy(row['doc']) - attachment_file_names = result['_attachments'].keys() - result['_attachments'] = {} - for file_name in attachment_file_names: - result['_attachments'][file_name] = { - 'data': self._database.get_attachment(result, file_name) - } + for file_name in result.get('_attachments', {}).keys(): + data = self._database.get_attachment(result, file_name) + if data: + if read_content: + data = data.read() + result['_attachments'][file_name] = {'data': data} doc = self.__parse_doc_from_couch( result, result['_id'], check_for_conflicts=check_for_conflicts, decode=False) @@ -416,8 +399,8 @@ class CouchDatabase(object): self.batch_docs.clear() return rev - def __parse_doc_from_couch(self, result, doc_id, decode=True, - check_for_conflicts=False): + def __parse_doc_from_couch(self, result, doc_id, + check_for_conflicts=False, decode=True): # restrict to u1db documents if 'u1db_rev' not in result: return None @@ -437,10 +420,11 @@ class CouchDatabase(object): and '_attachments' in result \ and 'u1db_conflicts' in result['_attachments']: if decode: - conflicts = json.loads(binascii.a2b_base64( - result['_attachments']['u1db_conflicts']['data'])) + conflicts = binascii.a2b_base64( + result['_attachments']['u1db_conflicts']['data']) else: conflicts = 
result['_attachments']['u1db_conflicts']['data'] + conflicts = json.loads(conflicts) doc.set_conflicts(self._build_conflicts(doc.doc_id, conflicts)) # store couch revision doc.couch_rev = result['_rev'] -- cgit v1.2.3 From 9ea98145abd130227b33d691b82dbcca76ef70de Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Thu, 22 Sep 2016 01:02:28 -0300 Subject: [feature] fix and enable batch Batching is now decided by the server; this commit enables it. --- common/src/leap/soledad/common/backend.py | 2 +- common/src/leap/soledad/common/couch/__init__.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/backend.py b/common/src/leap/soledad/common/backend.py index 5c995d38..4a29ca87 100644 --- a/common/src/leap/soledad/common/backend.py +++ b/common/src/leap/soledad/common/backend.py @@ -73,8 +73,8 @@ class SoledadBackend(CommonBackend): def batch_end(self): if not self.BATCH_SUPPORT: return - self.batching = False self._database.batch_end() + self.batching = False for name in self.after_batch_callbacks: self.after_batch_callbacks[name]() self.after_batch_callbacks = None diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index f19b0acb..6f233b26 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -658,7 +658,7 @@ class CouchDatabase(object): _, _, data = resource.get_json(**kwargs) return data - def _allocate_new_generation(self, doc_id, transaction_id): + def _allocate_new_generation(self, doc_id, transaction_id, save=True): """ Allocate a new generation number for a document modification. @@ -698,10 +698,12 @@ class CouchDatabase(object): DOC_ID_KEY: doc_id, TRANSACTION_ID_KEY: transaction_id, } - self._database.save(gen_doc) + if save: + self._database.save(gen_doc) break # succeeded allocating a new generation, proceed except ResourceConflict: pass # try again!
+ return gen_doc def save_document(self, old_doc, doc, transaction_id): """ @@ -780,6 +782,7 @@ class CouchDatabase(object): headers=envelope.headers) except ResourceConflict: raise RevisionConflict() + self._allocate_new_generation(doc.doc_id, transaction_id) else: for name, attachment in attachments.items(): del attachment['follows'] @@ -788,12 +791,12 @@ class CouchDatabase(object): attachment['data'] = binascii.b2a_base64( parts[index]).strip() couch_doc['_attachments'] = attachments + gen_doc = self._allocate_new_generation(doc.doc_id, transaction_id, save=False) self.batch_docs[doc.doc_id] = couch_doc + self.batch_docs[gen_doc['_id']] = gen_doc last_gen, last_trans_id = self.batch_generation self.batch_generation = (last_gen + 1, transaction_id) - self._allocate_new_generation(doc.doc_id, transaction_id) - def _new_resource(self, *path): """ Return a new resource for accessing a couch database. -- cgit v1.2.3 From 32d73ec50d6147d2511d6679bb12c17dc01210e4 Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Thu, 22 Sep 2016 05:03:59 -0300 Subject: [feature] batch based on payload size batch is slower than usual insert for a single doc, so, if a document exceeds the buffer, commit the batch (if any) and put the huge load by traditional insert. refactor coming. 
--- common/src/leap/soledad/common/couch/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'common') diff --git a/common/src/leap/soledad/common/couch/__init__.py b/common/src/leap/soledad/common/couch/__init__.py index 6f233b26..2e6f734e 100644 --- a/common/src/leap/soledad/common/couch/__init__.py +++ b/common/src/leap/soledad/common/couch/__init__.py @@ -791,7 +791,8 @@ class CouchDatabase(object): attachment['data'] = binascii.b2a_base64( parts[index]).strip() couch_doc['_attachments'] = attachments - gen_doc = self._allocate_new_generation(doc.doc_id, transaction_id, save=False) + gen_doc = self._allocate_new_generation( + doc.doc_id, transaction_id, save=False) self.batch_docs[doc.doc_id] = couch_doc self.batch_docs[gen_doc['_id']] = gen_doc last_gen, last_trans_id = self.batch_generation -- cgit v1.2.3 From 529dbdf27804f12da80907d25c412d10e9fa3763 Mon Sep 17 00:00:00 2001 From: Victor Shyba Date: Thu, 17 Nov 2016 01:33:04 -0300 Subject: [style] fix pep8 and confs Fixes setup.cfg, adding current exclude rules, simplified tox.ini to use setup.cfg and fixed all. 
--- common/src/leap/soledad/common/l2db/backends/sqlite_backend.py | 1 + common/src/leap/soledad/common/l2db/remote/http_app.py | 1 + 2 files changed, 2 insertions(+) (limited to 'common') diff --git a/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py b/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py index 295f3132..27db65af 100644 --- a/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py +++ b/common/src/leap/soledad/common/l2db/backends/sqlite_backend.py @@ -923,4 +923,5 @@ class SQLitePartialExpandDatabase(SQLiteDatabase): raw_doc = json.loads(doc) self._update_indexes(doc_id, raw_doc, getters, c) + SQLiteDatabase.register_implementation(SQLitePartialExpandDatabase) diff --git a/common/src/leap/soledad/common/l2db/remote/http_app.py b/common/src/leap/soledad/common/l2db/remote/http_app.py index a9680890..496274b2 100644 --- a/common/src/leap/soledad/common/l2db/remote/http_app.py +++ b/common/src/leap/soledad/common/l2db/remote/http_app.py @@ -194,6 +194,7 @@ class URLToResource(object): resource_cls = params.pop('resource_cls') return resource_cls, params + url_to_resource = URLToResource() -- cgit v1.2.3