diff options
author | Brad Anderson <brad@cloudant.com> | 2010-05-07 23:33:19 -0400 |
---|---|---|
committer | Brad Anderson <brad@cloudant.com> | 2010-05-09 22:56:25 -0400 |
commit | bdd612392c4ee759c95deeaccfa471983f4d3c28 (patch) | |
tree | 08ff96bc7ad10efad9722210fffda7be95713305 /src | |
parent | 7896702038b9b5c9adb3951a8196b198046783a2 (diff) |
work on create_db functionality, can now generate fullmap for a db based on its name, its config, and current mem3 nodes - BugzID 10007
Diffstat (limited to 'src')
-rw-r--r-- | src/mem3.erl | 3 | ||||
-rw-r--r-- | src/partitions.erl | 395 |
2 files changed, 81 insertions, 317 deletions
diff --git a/src/mem3.erl b/src/mem3.erl index b0105286..a95b5fb0 100644 --- a/src/mem3.erl +++ b/src/mem3.erl @@ -118,6 +118,7 @@ reset() -> %% @doc get the list of cluster nodes (according to membership module) %% This may differ from erlang:nodes() +%% Guaranteed to be in order of State's node list (1st elem in 3-tuple) nodes() -> gen_server:call(?SERVER, nodes). @@ -203,7 +204,7 @@ handle_call(reset, _From, #mem{args=Args} = State) -> %% nodes handle_call(nodes, _From, #mem{nodes=Nodes} = State) -> - {_,NodeList,_} = lists:unzip3(Nodes), + {_,NodeList,_} = lists:unzip3(lists:keysort(1, Nodes)), {reply, {ok, NodeList}, State}; %% gossip diff --git a/src/partitions.erl b/src/partitions.erl index 27d2a5a1..f029fedd 100644 --- a/src/partitions.erl +++ b/src/partitions.erl @@ -1,339 +1,102 @@ -%%%------------------------------------------------------------------- -%%% File: partitions.erl -%%% @author Cliff Moon <cliff@powerset.com> [http://www.powerset.com/] -%%% @copyright 2008 Cliff Moon -%%% @doc -%%% -%%% @end -%%% -%%% @since 2008-10-12 by Cliff Moon -%%%------------------------------------------------------------------- -module(partitions). --author('cliff@powerset.com'). +-author('brad@cloudant.com'). %% API --export([partition_range/1, create_partitions/2, create_partitions/3, - diff/2, pp_diff/1, int_to_partition/2, map_partitions/2, - join/3, leave/3, hash/1, hash_to_partition/2, item_to_nodepart/1, - shard_name/2, hash_to_hex/2]). +-export([fullmap/3, hash/1]). --define(RINGTOP, trunc(math:pow(2,160)-1)). % SHA-1 space +-define(RINGTOP, trunc(math:pow(2,160))). % SHA-1 space -include("../../couch/src/couch_db.hrl"). -include_lib("eunit/include/eunit.hrl"). -%% -ifdef(TEST). -%% -include("etest/partitions_test.erl"). -%% -endif. - %%==================================================================== %% API %%==================================================================== -partition_range(Q) -> - trunc( ?RINGTOP / math:pow(2,Q) ). % SHA-1 space / 2^Q - - -create_partitions(Q, Node) -> - create_partitions(Q, Node, []). - - -create_partitions(Q, Node, _Nodes) -> - fresh(trunc(math:pow(2,Q)), Node). - % map_partitions(Table, Nodes). - - -%% @spec map_partitions(Table::proplist(),Nodes::list()) -> proplist() -%% @doc maps partitions to nodes. The resulting list should be Dynomite format, -%% namely {Node,Part} -%% @end -map_partitions(Table, Nodes) -> - {_Nodes, Parts} = lists:unzip(Table), - do_map(Nodes, Parts). - - -%% @doc in case Hints is undefined, turn it into a list for clauses below. -join(Node, Table, undefined) -> - join(Node, Table, []); - -%% @spec join(node(), proplist(), list()) -> {ok, PartTable::proplist()} | -%% {error, Error} -%% @doc given a node, current partition table, and hints, this function returns -%% the new partition table -join(Node, Table, Hints) -> - {NodeList, Parts} = lists:unzip(Table), - OtherNodes = lists:delete(Node, NodeList), - OtherDistinctNodes = lists:usort(OtherNodes), - %% quick check to see if we have more nodes than partitions - if - length(Parts) == length(OtherDistinctNodes) -> - {error, "Too many nodes vs partitions", Table}; - true -> - AlreadyPresent = length(NodeList) - length(OtherNodes), - Nodes = lists:usort(NodeList), - PartCountToTake = trunc(length(Parts) / (length(Nodes) + 1)), - %% calcs done, let's steal some partitions - {HintsTaken, NewTable} = steal_hints(Node, Table, Hints), - if - PartCountToTake - AlreadyPresent - HintsTaken > 0 -> - steal_partitions(Node, OtherDistinctNodes, NewTable, - PartCountToTake - AlreadyPresent - HintsTaken); - true -> - %% no partitions to take - {ok, NewTable} - end - end. - - -%% TODO: implement me -leave(_Node, Table, _Hints) -> - Table. - - -diff(From, To) when length(From) =/= length(To) -> - {error, badlength, "Cannot diff partition maps with different length"}; - -diff(From, To) -> - diff(sort_for_diff(From), sort_for_diff(To), []). - - -pp_diff(Diff) -> - lists:map( - fun({F,T,Part}) -> {F,T,showroom_utils:int_to_hexstr(Part)} end, - Diff). - +%% @doc build a full partition map +fullmap(DbName, Nodes, Options) -> + {N,Q} = db_init_constants(Options), + NewNodes = ordered_nodes(DbName, Nodes), + Pmap = pmap(Q, NewNodes), + int_fullmap(N, Pmap, NewNodes). %% @spec hash(term()) -> Digest::binary() -%% @doc Showroom uses SHA-1 as its hash +%% @doc uses SHA-1 as its hash hash(Item) -> crypto:sha(term_to_binary(Item)). - -%% @spec hash_to_partition(binary(), integer()) -> integer() -%% @doc given a hashed value and Q, return the partition -hash_to_partition(Hash, Q) -> - HashInt = hash_int(Hash), - Size = partition_range(Q), - Factor = (HashInt div Size), - Rem = (HashInt rem Size), - if - Rem > 0 -> Factor * Size; - true -> ((Factor-1) * Size) - end. - - -hash_to_hex(Hash, Q) -> - Part = hash_to_partition(Hash, Q), - showroom_utils:int_to_hexstr(Part). - - -%% @doc given an int and a list of partitions, get the first part greater -%% than Int. Used for a hex part being turned back into an int. -int_to_partition(Int, Parts) -> - Rem = lists:dropwhile(fun(E) -> E < Int end, lists:sort(Parts)), - case Rem of - [] -> 0; % wrap-around-ring case (back to 0) - [H|_T] -> H - end. - - -%% @spec item_to_nodepart(bin()) -> {Node::node(),Part::integer()} -%% @doc given a raw item, return the node/partition/shard -%% name based on consistent hashing -item_to_nodepart(Item) when is_binary(Item) -> - Q = list_to_integer(couch_config:get("cluster","q")), - Hash = hash(?b2l(Item)), - Part = hash_to_partition(Hash, Q), - {ok, Table} = membership2:partitions(), - lists:keyfind(Part, 2, Table); - -item_to_nodepart(Item) -> - item_to_nodepart(term_to_binary(Item)). - - -%% @spec shard_name(integer(), binary()) -> binary() -%% @doc create shard name -shard_name(Part, DbName) -> - PartHex = ?l2b(showroom_utils:int_to_hexstr(Part)), - <<"x", PartHex/binary, "/", DbName/binary, "_", PartHex/binary>>. - %%==================================================================== %% Internal functions %%==================================================================== -%% @doc Create a brand new table. The size and seednode are specified; -%% initially all partitions are owned by the seednode. If NumPartitions -%% is not much larger than the intended eventual number of -%% participating nodes, then performance will suffer. -%% from http://code.google.com/p/distributerl (trunk revision 4) chash:fresh/2 -%% @spec fresh(NumPartitions :: integer(), SeedNode :: node()) -> table() -fresh(NumPartitions, SeedNode) -> - Increment = ?RINGTOP div NumPartitions, - [{SeedNode, IndexAsInt} || IndexAsInt <- lists:seq(0,(?RINGTOP-1),Increment)]. - - -%% @spec steal_hints(node(), proplist(), list( integer() )) -> -%% {integer(), proplist()} -%% @doc move the partitions listed in Hints over to the new owner, Node -steal_hints(Node, Table, Hints) -> - steal_hints(Node, Table, Hints, 0). - - -%% @doc recursive workhorse for hints mechanism, Acc is tracking how many -%% hints/partitions were successfully moved to a new Node. -%% @end -steal_hints(_Node, Table, [], Acc) -> - {Acc, Table}; - -steal_hints(Node, Table, [Hint|RestHints], Acc) -> - {Status, NewTable} = swap_node_for_part(Node, Hint, Table), - Acc1 = case Status of - ok -> Acc+1; - _ -> Acc - end, - steal_hints(Node, NewTable, RestHints, Acc1). - - -%% @doc take a part from one of the other nodes based on most # of parts per -%% node. -%% @end -%% TODO: This fun does list ops on the Table each time through. Inefficient? -%% Hopefully not, due to small Table sizes -steal_partitions(_Node, _OtherNodes, Table, 0) -> - {ok, Table}; -steal_partitions(Node, OtherNodes, Table, Count) -> - %% first, get a list of OtherNodes and their partition counts - NPCountFun = fun(N) -> - L = proplists:get_all_values(N, Table), - {N, length(lists:delete(undefined, L))} - end, - NPCounts = lists:reverse(lists:keysort(2,lists:map(NPCountFun, OtherNodes))), - %% grab the node that has the most partitions - [{TakeFrom, _PartsCount}|_RestOfTable] = NPCounts, - %% get the highest # partition of the TakeFrom node - TakeFromParts = lists:reverse(lists:sort(proplists:get_all_values(TakeFrom, - Table))), - [Part|_RestOfParts] = TakeFromParts, - {ok, NewTable} = swap_node_for_part(Node, Part, Table), - steal_partitions(Node, OtherNodes, NewTable, Count-1). - - -%% @doc Make Node the owner of the partition beginning at Part. -%% from http://code.google.com/p/distributerl (trunk revision 4) chash:update/3 -swap_node_for_part(Node, Part, Table) -> - case lists:keymember(Part, 2, Table) of - true -> - GapList = [{N,P} || {N,P} <- Table, P /= Part], - {A, B} = lists:partition(fun({_,K1}) -> K1 < Part end, GapList), - {ok, A ++ [{Node, Part}] ++ B}; - false -> - showroom_log:message(info, - "'~p' partition was not found in partition table", [Part]), - {noswap, Table} - end. - - -%% @doc get the difference between two FullPMaps -%% lists need to be sorted by part, then node -diff([], [], Results) -> - lists:reverse(remove_dupes(Results)); - -diff([{Node,Part,_}|PartsA], [{Node,Part,_}|PartsB], Results) -> - diff(PartsA, PartsB, Results); - -diff([{NodeA,Part,_}|PartsA], [{NodeB,Part,_}|PartsB], Results) -> - diff(PartsA, PartsB, [{NodeA,NodeB,Part}|Results]). - - -%% @doc sorts the full map for diff/3. This may change to get more accurate -%% diff w/o dupes -sort_for_diff(FullMap) -> - lists:keysort(2,lists:sort(FullMap)). - - -remove_dupes(Diff) -> - {_,_,AllParts} = lists:unzip3(Diff), - Parts = lists:usort(AllParts), - remove_dupes_from_part(Parts, Diff, []). - - -%% @doc ex: take [{a,b,1},{b,c,1}] diff and make it [{a,c,1}] so we don't go -%% moving unnecessary shard files. 'Move partition 1 from a to b and -%% then move partition 1 from b to c' is unnecessary. Just move it a to c. -remove_dupes_from_part([], _Diff, Acc) -> - Acc; - -remove_dupes_from_part([Part|Rest], Diff, Acc) -> - PartData = lists:filter(fun({_,_,P}) -> P =:= Part end, Diff), - NewPartData = process_part_data(Part, PartData, PartData, PartData), - remove_dupes_from_part(Rest, Diff, lists:concat([NewPartData, Acc])). - - -%% for one partition of the full diff, remove the dupes -process_part_data(_Part, _PartData, [], Acc) -> - Acc; - -process_part_data(Part, PartData, [{From,To,_Part}|Rest], Acc) -> - case proplists:lookup(To, PartData) of - {To, NewTo, _Part} -> - - Remove1 = proplists:delete(To, PartData), - Remove2 = proplists:delete(From, Remove1), - NewPartData = [{From, NewTo, Part}|Remove2], - %?debugFmt("~nFrom : ~p~nTo : ~p~nNewTo: ~p~n" - % "Remove1: ~p~nRemove2: ~p~n" - % "NewPartData: ~p~n" - % , [From, To, NewTo, Remove1, Remove2, NewPartData]), - process_part_data(Part, NewPartData, Rest, NewPartData); - none -> - process_part_data(Part, PartData, Rest, Acc) - end. - - -% %% @doc from dynomite -% diff([], [], Results) -> -% lists:reverse(Results); - -% diff([{Node,Part}|PartsA], [{Node,Part}|PartsB], Results) -> -% diff(PartsA, PartsB, Results); - -% diff([{NodeA,Part}|PartsA], [{NodeB,Part}|PartsB], Results) -> -% diff(PartsA, PartsB, [{NodeA,NodeB,Part}|Results]). - - -%% @doc does Node/Partition mapping based on Amazon Dynamo paper, -%% section 6.2, strategy 3, more or less -%% http://www.allthingsdistributed.com/2007/10/amazons_dynamo.html -%% @end -do_map([Node|RestNodes], Parts) -> - Max = length(Parts) / length([Node|RestNodes]), - do_map(Node, RestNodes, Parts, [], 1, Max). - - -%% return final mapped list -do_map(_,_,[],Mapped, _, _) -> - lists:keysort(1, Mapped); - -%% finish off last node, Cnt & Max no longer needed -do_map(Node, [], [Part|RestParts], Mapped, _, _) -> - do_map(Node, [], RestParts, [{Node, Part}|Mapped], 0,0); - -%% workhorse clause, iterates through parts, until Cnt > Max, then advances to -%% next node, wash, rinse, repeat -do_map(Node, [NextNode|RestNodes], [Part|RestParts], Mapped, Cnt, Max) -> - case Cnt > Max of - true -> - do_map(NextNode, RestNodes, RestParts, [{Node, Part}|Mapped], - 1, Max); - false -> - do_map(Node, [NextNode|RestNodes], RestParts, [{Node, Part}|Mapped], - Cnt+1, Max) - end. - - -%% TODO: other guards +%% @doc get cluster constants from options or config +db_init_constants(Options) -> + {const(n, Options), const(q, Options)}. + +%% @doc get individual constant +const(Const, Options) -> + ListResult = case couch_util:get_value(Const, Options) of + undefined -> couch_config:get("cluster", atom_to_list(Const)); + Val -> Val + end, + list_to_integer(ListResult). + +%% @doc hash the dbname, and return the corresponding node for seeding a ring +seednode(DbName, Nodes) -> + Hash = hash(DbName), + HashInt = hash_int(Hash), + Size = partition_range(length(Nodes)), + Factor = (HashInt div Size), + lists:nth(Factor+1, Nodes). + +%% @doc take the list of nodes, and rearrange it, starting with the node that +%% results from hashing the Term +ordered_nodes(Term, Nodes) -> + SeedNode = seednode(Term, Nodes), + {A, B} = lists:splitwith(fun(N) -> N /= SeedNode end, Nodes), + lists:append(B,A). + +%% @doc create a partition map [{node(),part{}|_} +pmap(NumPartitions, Nodes) -> + Increment = ?RINGTOP div NumPartitions + 1, + Parts = lists:seq(0,(?RINGTOP),Increment), + make_map(Nodes, Nodes, Parts, []). + +%% @doc create a full map, which is a pmap with N-1 replication partner nodes +%% added per partition +int_fullmap(N, Pmap, Nodes) -> + Full = lists:foldl(fun({Node,Part}, AccIn) -> + Partners = partners(N, Node, Nodes, Part), + lists:append([ [{Node,Part}], Partners, AccIn]) + end, [], Pmap), + lists:reverse(Full). + +partners(N, Node, Nodes, Part) -> + {A, [Node|B]} = lists:splitwith(fun(Nd) -> Nd /= Node end, Nodes), + Nodes1 = lists:append(B,A), + Partners = lists:sublist(Nodes1, N-1), % N-1 replication partner nodes + lists:map(fun(Partner) -> {Partner, Part} end, Partners). + + +%% @doc turn hash into an integer hash_int(Hash) when is_binary(Hash) -> - <<IndexAsInt:160/integer>> = Hash, - IndexAsInt; + <<IndexAsInt:160/integer>> = Hash, + IndexAsInt; hash_int(Hash) when is_integer(Hash) -> - Hash. + Hash. + +%% @doc size of one partition in the ring +partition_range(Q) -> + trunc( ?RINGTOP / Q ). % SHA-1 space / Q + +%% @doc assign nodes to each of the partitions. When you run out of nodes, +%% start at the beginning of the node list again. +%% The provided node list starts with the seed node (seednode fun) +make_map(_,_,[], Acc) -> + lists:keysort(2,Acc); +make_map(AllNodes, [], Parts, Acc) -> + % start back at beginning of node list + make_map(AllNodes, AllNodes, Parts, Acc); +make_map(AllNodes, [Node|RestNodes], [Part|RestParts], Acc) -> + % add a node/part combo to the Acc + make_map(AllNodes, RestNodes, RestParts, [{Node,Part}|Acc]). |