summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorBrad Anderson <brad@cloudant.com>2010-05-07 23:33:19 -0400
committerBrad Anderson <brad@cloudant.com>2010-05-09 22:56:25 -0400
commitbdd612392c4ee759c95deeaccfa471983f4d3c28 (patch)
tree08ff96bc7ad10efad9722210fffda7be95713305 /src
parent7896702038b9b5c9adb3951a8196b198046783a2 (diff)
work on create_db functionality, can now generate fullmap for a db based on its name, its config, and current mem3 nodes - BugzID 10007
Diffstat (limited to 'src')
-rw-r--r--src/mem3.erl3
-rw-r--r--src/partitions.erl395
2 files changed, 81 insertions, 317 deletions
diff --git a/src/mem3.erl b/src/mem3.erl
index b0105286..a95b5fb0 100644
--- a/src/mem3.erl
+++ b/src/mem3.erl
@@ -118,6 +118,7 @@ reset() ->
%% @doc get the list of cluster nodes (according to membership module)
%% This may differ from erlang:nodes()
+%% Guaranteed to be in order of State's node list (1st elem in 3-tuple)
nodes() ->
gen_server:call(?SERVER, nodes).
@@ -203,7 +204,7 @@ handle_call(reset, _From, #mem{args=Args} = State) ->
%% nodes
handle_call(nodes, _From, #mem{nodes=Nodes} = State) ->
- {_,NodeList,_} = lists:unzip3(Nodes),
+ {_,NodeList,_} = lists:unzip3(lists:keysort(1, Nodes)),
{reply, {ok, NodeList}, State};
%% gossip
diff --git a/src/partitions.erl b/src/partitions.erl
index 27d2a5a1..f029fedd 100644
--- a/src/partitions.erl
+++ b/src/partitions.erl
@@ -1,339 +1,102 @@
-%%%-------------------------------------------------------------------
-%%% File: partitions.erl
-%%% @author Cliff Moon <cliff@powerset.com> [http://www.powerset.com/]
-%%% @copyright 2008 Cliff Moon
-%%% @doc
-%%%
-%%% @end
-%%%
-%%% @since 2008-10-12 by Cliff Moon
-%%%-------------------------------------------------------------------
-module(partitions).
--author('cliff@powerset.com').
+-author('brad@cloudant.com').
%% API
--export([partition_range/1, create_partitions/2, create_partitions/3,
- diff/2, pp_diff/1, int_to_partition/2, map_partitions/2,
- join/3, leave/3, hash/1, hash_to_partition/2, item_to_nodepart/1,
- shard_name/2, hash_to_hex/2]).
+-export([fullmap/3, hash/1]).
--define(RINGTOP, trunc(math:pow(2,160)-1)). % SHA-1 space
+-define(RINGTOP, trunc(math:pow(2,160))). % SHA-1 space
-include("../../couch/src/couch_db.hrl").
-include_lib("eunit/include/eunit.hrl").
-%% -ifdef(TEST).
-%% -include("etest/partitions_test.erl").
-%% -endif.
-
%%====================================================================
%% API
%%====================================================================
-partition_range(Q) ->
- trunc( ?RINGTOP / math:pow(2,Q) ). % SHA-1 space / 2^Q
-
-
-create_partitions(Q, Node) ->
- create_partitions(Q, Node, []).
-
-
-create_partitions(Q, Node, _Nodes) ->
- fresh(trunc(math:pow(2,Q)), Node).
- % map_partitions(Table, Nodes).
-
-
-%% @spec map_partitions(Table::proplist(),Nodes::list()) -> proplist()
-%% @doc maps partitions to nodes. The resulting list should be Dynomite format,
-%% namely {Node,Part}
-%% @end
-map_partitions(Table, Nodes) ->
- {_Nodes, Parts} = lists:unzip(Table),
- do_map(Nodes, Parts).
-
-
-%% @doc in case Hints is undefined, turn it into a list for clauses below.
-join(Node, Table, undefined) ->
- join(Node, Table, []);
-
-%% @spec join(node(), proplist(), list()) -> {ok, PartTable::proplist()} |
-%% {error, Error}
-%% @doc given a node, current partition table, and hints, this function returns
-%% the new partition table
-join(Node, Table, Hints) ->
- {NodeList, Parts} = lists:unzip(Table),
- OtherNodes = lists:delete(Node, NodeList),
- OtherDistinctNodes = lists:usort(OtherNodes),
- %% quick check to see if we have more nodes than partitions
- if
- length(Parts) == length(OtherDistinctNodes) ->
- {error, "Too many nodes vs partitions", Table};
- true ->
- AlreadyPresent = length(NodeList) - length(OtherNodes),
- Nodes = lists:usort(NodeList),
- PartCountToTake = trunc(length(Parts) / (length(Nodes) + 1)),
- %% calcs done, let's steal some partitions
- {HintsTaken, NewTable} = steal_hints(Node, Table, Hints),
- if
- PartCountToTake - AlreadyPresent - HintsTaken > 0 ->
- steal_partitions(Node, OtherDistinctNodes, NewTable,
- PartCountToTake - AlreadyPresent - HintsTaken);
- true ->
- %% no partitions to take
- {ok, NewTable}
- end
- end.
-
-
-%% TODO: implement me
-leave(_Node, Table, _Hints) ->
- Table.
-
-
-diff(From, To) when length(From) =/= length(To) ->
- {error, badlength, "Cannot diff partition maps with different length"};
-
-diff(From, To) ->
- diff(sort_for_diff(From), sort_for_diff(To), []).
-
-
-pp_diff(Diff) ->
- lists:map(
- fun({F,T,Part}) -> {F,T,showroom_utils:int_to_hexstr(Part)} end,
- Diff).
-
+%% @doc build a full partition map
+fullmap(DbName, Nodes, Options) ->
+ {N,Q} = db_init_constants(Options),
+ NewNodes = ordered_nodes(DbName, Nodes),
+ Pmap = pmap(Q, NewNodes),
+ int_fullmap(N, Pmap, NewNodes).
%% @spec hash(term()) -> Digest::binary()
-%% @doc Showroom uses SHA-1 as its hash
+%% @doc uses SHA-1 as its hash
hash(Item) ->
crypto:sha(term_to_binary(Item)).
-
-%% @spec hash_to_partition(binary(), integer()) -> integer()
-%% @doc given a hashed value and Q, return the partition
-hash_to_partition(Hash, Q) ->
- HashInt = hash_int(Hash),
- Size = partition_range(Q),
- Factor = (HashInt div Size),
- Rem = (HashInt rem Size),
- if
- Rem > 0 -> Factor * Size;
- true -> ((Factor-1) * Size)
- end.
-
-
-hash_to_hex(Hash, Q) ->
- Part = hash_to_partition(Hash, Q),
- showroom_utils:int_to_hexstr(Part).
-
-
-%% @doc given an int and a list of partitions, get the first part greater
-%% than Int. Used for a hex part being turned back into an int.
-int_to_partition(Int, Parts) ->
- Rem = lists:dropwhile(fun(E) -> E < Int end, lists:sort(Parts)),
- case Rem of
- [] -> 0; % wrap-around-ring case (back to 0)
- [H|_T] -> H
- end.
-
-
-%% @spec item_to_nodepart(bin()) -> {Node::node(),Part::integer()}
-%% @doc given a raw item, return the node/partition/shard
-%% name based on consistent hashing
-item_to_nodepart(Item) when is_binary(Item) ->
- Q = list_to_integer(couch_config:get("cluster","q")),
- Hash = hash(?b2l(Item)),
- Part = hash_to_partition(Hash, Q),
- {ok, Table} = membership2:partitions(),
- lists:keyfind(Part, 2, Table);
-
-item_to_nodepart(Item) ->
- item_to_nodepart(term_to_binary(Item)).
-
-
-%% @spec shard_name(integer(), binary()) -> binary()
-%% @doc create shard name
-shard_name(Part, DbName) ->
- PartHex = ?l2b(showroom_utils:int_to_hexstr(Part)),
- <<"x", PartHex/binary, "/", DbName/binary, "_", PartHex/binary>>.
-
%%====================================================================
%% Internal functions
%%====================================================================
-%% @doc Create a brand new table. The size and seednode are specified;
-%% initially all partitions are owned by the seednode. If NumPartitions
-%% is not much larger than the intended eventual number of
-%% participating nodes, then performance will suffer.
-%% from http://code.google.com/p/distributerl (trunk revision 4) chash:fresh/2
-%% @spec fresh(NumPartitions :: integer(), SeedNode :: node()) -> table()
-fresh(NumPartitions, SeedNode) ->
- Increment = ?RINGTOP div NumPartitions,
- [{SeedNode, IndexAsInt} || IndexAsInt <- lists:seq(0,(?RINGTOP-1),Increment)].
-
-
-%% @spec steal_hints(node(), proplist(), list( integer() )) ->
-%% {integer(), proplist()}
-%% @doc move the partitions listed in Hints over to the new owner, Node
-steal_hints(Node, Table, Hints) ->
- steal_hints(Node, Table, Hints, 0).
-
-
-%% @doc recursive workhorse for hints mechanism, Acc is tracking how many
-%% hints/partitions were successfully moved to a new Node.
-%% @end
-steal_hints(_Node, Table, [], Acc) ->
- {Acc, Table};
-
-steal_hints(Node, Table, [Hint|RestHints], Acc) ->
- {Status, NewTable} = swap_node_for_part(Node, Hint, Table),
- Acc1 = case Status of
- ok -> Acc+1;
- _ -> Acc
- end,
- steal_hints(Node, NewTable, RestHints, Acc1).
-
-
-%% @doc take a part from one of the other nodes based on most # of parts per
-%% node.
-%% @end
-%% TODO: This fun does list ops on the Table each time through. Inefficient?
-%% Hopefully not, due to small Table sizes
-steal_partitions(_Node, _OtherNodes, Table, 0) ->
- {ok, Table};
-steal_partitions(Node, OtherNodes, Table, Count) ->
- %% first, get a list of OtherNodes and their partition counts
- NPCountFun = fun(N) ->
- L = proplists:get_all_values(N, Table),
- {N, length(lists:delete(undefined, L))}
- end,
- NPCounts = lists:reverse(lists:keysort(2,lists:map(NPCountFun, OtherNodes))),
- %% grab the node that has the most partitions
- [{TakeFrom, _PartsCount}|_RestOfTable] = NPCounts,
- %% get the highest # partition of the TakeFrom node
- TakeFromParts = lists:reverse(lists:sort(proplists:get_all_values(TakeFrom,
- Table))),
- [Part|_RestOfParts] = TakeFromParts,
- {ok, NewTable} = swap_node_for_part(Node, Part, Table),
- steal_partitions(Node, OtherNodes, NewTable, Count-1).
-
-
-%% @doc Make Node the owner of the partition beginning at Part.
-%% from http://code.google.com/p/distributerl (trunk revision 4) chash:update/3
-swap_node_for_part(Node, Part, Table) ->
- case lists:keymember(Part, 2, Table) of
- true ->
- GapList = [{N,P} || {N,P} <- Table, P /= Part],
- {A, B} = lists:partition(fun({_,K1}) -> K1 < Part end, GapList),
- {ok, A ++ [{Node, Part}] ++ B};
- false ->
- showroom_log:message(info,
- "'~p' partition was not found in partition table", [Part]),
- {noswap, Table}
- end.
-
-
-%% @doc get the difference between two FullPMaps
-%% lists need to be sorted by part, then node
-diff([], [], Results) ->
- lists:reverse(remove_dupes(Results));
-
-diff([{Node,Part,_}|PartsA], [{Node,Part,_}|PartsB], Results) ->
- diff(PartsA, PartsB, Results);
-
-diff([{NodeA,Part,_}|PartsA], [{NodeB,Part,_}|PartsB], Results) ->
- diff(PartsA, PartsB, [{NodeA,NodeB,Part}|Results]).
-
-
-%% @doc sorts the full map for diff/3. This may change to get more accurate
-%% diff w/o dupes
-sort_for_diff(FullMap) ->
- lists:keysort(2,lists:sort(FullMap)).
-
-
-remove_dupes(Diff) ->
- {_,_,AllParts} = lists:unzip3(Diff),
- Parts = lists:usort(AllParts),
- remove_dupes_from_part(Parts, Diff, []).
-
-
-%% @doc ex: take [{a,b,1},{b,c,1}] diff and make it [{a,c,1}] so we don't go
-%% moving unnecessary shard files. 'Move partition 1 from a to b and
-%% then move partition 1 from b to c' is unnecessary. Just move it a to c.
-remove_dupes_from_part([], _Diff, Acc) ->
- Acc;
-
-remove_dupes_from_part([Part|Rest], Diff, Acc) ->
- PartData = lists:filter(fun({_,_,P}) -> P =:= Part end, Diff),
- NewPartData = process_part_data(Part, PartData, PartData, PartData),
- remove_dupes_from_part(Rest, Diff, lists:concat([NewPartData, Acc])).
-
-
-%% for one partition of the full diff, remove the dupes
-process_part_data(_Part, _PartData, [], Acc) ->
- Acc;
-
-process_part_data(Part, PartData, [{From,To,_Part}|Rest], Acc) ->
- case proplists:lookup(To, PartData) of
- {To, NewTo, _Part} ->
-
- Remove1 = proplists:delete(To, PartData),
- Remove2 = proplists:delete(From, Remove1),
- NewPartData = [{From, NewTo, Part}|Remove2],
- %?debugFmt("~nFrom : ~p~nTo : ~p~nNewTo: ~p~n"
- % "Remove1: ~p~nRemove2: ~p~n"
- % "NewPartData: ~p~n"
- % , [From, To, NewTo, Remove1, Remove2, NewPartData]),
- process_part_data(Part, NewPartData, Rest, NewPartData);
- none ->
- process_part_data(Part, PartData, Rest, Acc)
- end.
-
-
-% %% @doc from dynomite
-% diff([], [], Results) ->
-% lists:reverse(Results);
-
-% diff([{Node,Part}|PartsA], [{Node,Part}|PartsB], Results) ->
-% diff(PartsA, PartsB, Results);
-
-% diff([{NodeA,Part}|PartsA], [{NodeB,Part}|PartsB], Results) ->
-% diff(PartsA, PartsB, [{NodeA,NodeB,Part}|Results]).
-
-
-%% @doc does Node/Partition mapping based on Amazon Dynamo paper,
-%% section 6.2, strategy 3, more or less
-%% http://www.allthingsdistributed.com/2007/10/amazons_dynamo.html
-%% @end
-do_map([Node|RestNodes], Parts) ->
- Max = length(Parts) / length([Node|RestNodes]),
- do_map(Node, RestNodes, Parts, [], 1, Max).
-
-
-%% return final mapped list
-do_map(_,_,[],Mapped, _, _) ->
- lists:keysort(1, Mapped);
-
-%% finish off last node, Cnt & Max no longer needed
-do_map(Node, [], [Part|RestParts], Mapped, _, _) ->
- do_map(Node, [], RestParts, [{Node, Part}|Mapped], 0,0);
-
-%% workhorse clause, iterates through parts, until Cnt > Max, then advances to
-%% next node, wash, rinse, repeat
-do_map(Node, [NextNode|RestNodes], [Part|RestParts], Mapped, Cnt, Max) ->
- case Cnt > Max of
- true ->
- do_map(NextNode, RestNodes, RestParts, [{Node, Part}|Mapped],
- 1, Max);
- false ->
- do_map(Node, [NextNode|RestNodes], RestParts, [{Node, Part}|Mapped],
- Cnt+1, Max)
- end.
-
-
-%% TODO: other guards
+%% @doc get cluster constants from options or config
+db_init_constants(Options) ->
+ {const(n, Options), const(q, Options)}.
+
+%% @doc get individual constant
+const(Const, Options) ->
+ ListResult = case couch_util:get_value(Const, Options) of
+ undefined -> couch_config:get("cluster", atom_to_list(Const));
+ Val -> Val
+ end,
+ list_to_integer(ListResult).
+
+%% @doc hash the dbname, and return the corresponding node for seeding a ring
+seednode(DbName, Nodes) ->
+ Hash = hash(DbName),
+ HashInt = hash_int(Hash),
+ Size = partition_range(length(Nodes)),
+ Factor = (HashInt div Size),
+ lists:nth(Factor+1, Nodes).
+
+%% @doc take the list of nodes, and rearrange it, starting with the node that
+%% results from hashing the Term
+ordered_nodes(Term, Nodes) ->
+ SeedNode = seednode(Term, Nodes),
+ {A, B} = lists:splitwith(fun(N) -> N /= SeedNode end, Nodes),
+ lists:append(B,A).
+
+%% @doc create a partition map [{node(),part{}|_}
+pmap(NumPartitions, Nodes) ->
+ Increment = ?RINGTOP div NumPartitions + 1,
+ Parts = lists:seq(0,(?RINGTOP),Increment),
+ make_map(Nodes, Nodes, Parts, []).
+
+%% @doc create a full map, which is a pmap with N-1 replication partner nodes
+%% added per partition
+int_fullmap(N, Pmap, Nodes) ->
+ Full = lists:foldl(fun({Node,Part}, AccIn) ->
+ Partners = partners(N, Node, Nodes, Part),
+ lists:append([ [{Node,Part}], Partners, AccIn])
+ end, [], Pmap),
+ lists:reverse(Full).
+
+partners(N, Node, Nodes, Part) ->
+ {A, [Node|B]} = lists:splitwith(fun(Nd) -> Nd /= Node end, Nodes),
+ Nodes1 = lists:append(B,A),
+ Partners = lists:sublist(Nodes1, N-1), % N-1 replication partner nodes
+ lists:map(fun(Partner) -> {Partner, Part} end, Partners).
+
+
+%% @doc turn hash into an integer
hash_int(Hash) when is_binary(Hash) ->
- <<IndexAsInt:160/integer>> = Hash,
- IndexAsInt;
+ <<IndexAsInt:160/integer>> = Hash,
+ IndexAsInt;
hash_int(Hash) when is_integer(Hash) ->
- Hash.
+ Hash.
+
+%% @doc size of one partition in the ring
+partition_range(Q) ->
+ trunc( ?RINGTOP / Q ). % SHA-1 space / Q
+
+%% @doc assign nodes to each of the partitions. When you run out of nodes,
+%% start at the beginning of the node list again.
+%% The provided node list starts with the seed node (seednode fun)
+make_map(_,_,[], Acc) ->
+ lists:keysort(2,Acc);
+make_map(AllNodes, [], Parts, Acc) ->
+ % start back at beginning of node list
+ make_map(AllNodes, AllNodes, Parts, Acc);
+make_map(AllNodes, [Node|RestNodes], [Part|RestParts], Acc) ->
+ % add a node/part combo to the Acc
+ make_map(AllNodes, RestNodes, RestParts, [{Node,Part}|Acc]).