From fd693f972cd987c4fddda25738239582153d2a7c Mon Sep 17 00:00:00 2001 From: Adam Kocoloski Date: Wed, 8 Dec 2010 15:48:52 +0000 Subject: Usort the infos during compaction to remove dupes, COUCHDB-968 This is not a bulletproof solution; it only removes dupes when they appear in the same batch of 1000 updates. However, for dupes that show up in _all_docs the probability of that happening is quite high. If the dupes are only in _changes a user may need to compact twice, once to get the dupes ordered together and a second time to remove them. A more complete solution would be to trigger the compaction in "retry" mode, but this is significantly slower. git-svn-id: https://svn.apache.org/repos/asf/couchdb/branches/1.1.x@1043461 13f79535-47bb-0310-9956-ffa450edef68 --- src/couchdb/couch_db_updater.erl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/couchdb/couch_db_updater.erl b/src/couchdb/couch_db_updater.erl index bbba5d4b..8630ff4e 100644 --- a/src/couchdb/couch_db_updater.erl +++ b/src/couchdb/couch_db_updater.erl @@ -775,7 +775,10 @@ copy_rev_tree_attachments(SrcDb, DestFd, Tree) -> end, Tree). -copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> +copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq0, Retry) -> + % COUCHDB-968, make sure we prune duplicates during compaction + InfoBySeq = lists:usort(fun(#doc_info{id=A}, #doc_info{id=B}) -> A =< B end, + InfoBySeq0), Ids = [Id || #doc_info{id=Id} <- InfoBySeq], LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids), -- cgit v1.2.3