From 7860f047be25f99c4d1e83f9456c7eb245ad37dc Mon Sep 17 00:00:00 2001 From: "Damien F. Katz" Date: Sat, 30 May 2009 02:43:52 +0000 Subject: Test and fix for infinite loops in view_servers, fix for crashed OS processes causing leaked erlang processes and fix for view server crashing when view group process terminates. git-svn-id: https://svn.apache.org/repos/asf/couchdb/trunk@780165 13f79535-47bb-0310-9956-ffa450edef68 --- src/couchdb/Makefile.am | 10 ++++-- src/couchdb/couch.app.tpl.in | 2 +- src/couchdb/couch_os_process.erl | 63 ++++++++++++++++--------------------- src/couchdb/couch_query_servers.erl | 63 ++++++++++++++++++++++++------------- src/couchdb/couch_server_sup.erl | 4 ++- src/couchdb/couch_view.erl | 2 +- src/couchdb/couch_view_group.erl | 7 +++-- src/couchdb/priv/couchspawnkillable | 20 ++++++++++++ 8 files changed, 104 insertions(+), 67 deletions(-) create mode 100644 src/couchdb/priv/couchspawnkillable (limited to 'src/couchdb') diff --git a/src/couchdb/Makefile.am b/src/couchdb/Makefile.am index 7105300b..e19a7539 100644 --- a/src/couchdb/Makefile.am +++ b/src/couchdb/Makefile.am @@ -13,9 +13,11 @@ ICU_LOCAL_FLAGS = $(ICU_LOCAL_CFLAGS) $(ICU_LOCAL_LDFLAGS) # devdocdir = $(localdocdir)/developer/couchdb -couchprivlibdir = $(localerlanglibdir)/couch-$(version)/priv/lib -couchincludedir = $(localerlanglibdir)/couch-$(version)/include -couchebindir = $(localerlanglibdir)/couch-$(version)/ebin +couchlibdir = $(localerlanglibdir)/couch-$(version) +couchprivdir = $(couchlibdir)/priv +couchprivlibdir = $(couchlibdir)/priv/lib +couchincludedir = $(couchlibdir)/include +couchebindir = $(couchlibdir)/ebin couchprivlib_LTLIBRARIES = couch_erl_driver.la couch_erl_driver_la_SOURCES = couch_erl_driver.c @@ -33,6 +35,8 @@ couchinclude_DATA = couch_db.hrl couchebin_DATA = $(compiled_files) +couchpriv_SCRIPTS = priv/couchspawnkillable + # dist_devdoc_DATA = $(doc_base) $(doc_modules) CLEANFILES = $(compiled_files) $(doc_base) diff --git a/src/couchdb/couch.app.tpl.in b/src/couchdb/couch.app.tpl.in index e0100cb4..259bac6c 100644 --- a/src/couchdb/couch.app.tpl.in +++ b/src/couchdb/couch.app.tpl.in @@ -1,4 +1,4 @@ -{application,couch, +{application,couchdb, [{description,"@package_name@"}, {vsn,"@version@"}, {modules,[couch_btree, diff --git a/src/couchdb/couch_os_process.erl b/src/couchdb/couch_os_process.erl index 9a1fcb0c..75937eb8 100644 --- a/src/couchdb/couch_os_process.erl +++ b/src/couchdb/couch_os_process.erl @@ -14,7 +14,7 @@ -behaviour(gen_server). -export([start_link/1, start_link/2, start_link/3, stop/1]). --export([set_timeout/2, write/2, read/1, prompt/2, async/3]). +-export([set_timeout/2, prompt/2]). -export([writeline/2, readline/1, writejson/2, readjson/1]). -export([init/1, terminate/2, handle_call/3, handle_cast/2, handle_info/2, code_change/3]). @@ -27,7 +27,7 @@ port, writer, reader, - timeout + timeout=5000 }). start_link(Command) -> @@ -44,24 +44,15 @@ stop(Pid) -> set_timeout(Pid, TimeOut) when is_integer(TimeOut) -> ok = gen_server:call(Pid, {set_timeout, TimeOut}). -write(Pid, Data) -> - gen_server:call(Pid, {write, Data}). - -read(Pid) -> - gen_server:call(Pid, read). - prompt(Pid, Data) -> case gen_server:call(Pid, {prompt, Data}, infinity) of {ok, Result} -> Result; - {error, Error} -> + Error -> ?LOG_DEBUG("OS Process Error ~p",[Error]), throw(Error) end. -async(Pid, Data, CallBack) -> - gen_server:cast(Pid, {async, Data, CallBack}). - % Utility functions for reading and writing % in custom functions writeline(OsProc, Data) when is_record(OsProc, os_proc) -> @@ -103,17 +94,31 @@ readjson(OsProc) when is_record(OsProc, os_proc) -> Result end. + % gen_server API init([Command, Options, PortOptions]) -> - BaseTimeOut = list_to_integer(couch_config:get( - "couchdb", "os_process_timeout", "5000")), + case code:priv_dir(couch) of + {error, bad_name} -> + % small hack, in dev mode "app" is couchdb. Fixing requires renaming + % src/couch to src/couch. Not really worth the hassle.-Damien + PrivDir = code:priv_dir(couchdb); + PrivDir -> ok + end, + Spawnkiller = filename:join(PrivDir, "couchspawnkillable"), BaseProc = #os_proc{ command=Command, - port=open_port({spawn, Command}, PortOptions), + port=open_port({spawn, Spawnkiller ++ " " ++ Command}, PortOptions), writer=fun writejson/2, - reader=fun readjson/1, - timeout=BaseTimeOut + reader=fun readjson/1 }, + KillCmd = readline(BaseProc), + Pid = self(), + spawn(fun() -> + % this ensure the real os process is killed when this process dies. + erlang:monitor(process, Pid), + receive _ -> ok end, + os:cmd(?b2l(KillCmd)) + end), OsProc = lists:foldl(fun(Opt, Proc) -> case Opt of @@ -127,36 +132,22 @@ init([Command, Options, PortOptions]) -> end, BaseProc, Options), {ok, OsProc}. -terminate(_Reason, #os_proc{port=nil}) -> - ok; terminate(_Reason, #os_proc{port=Port}) -> catch port_close(Port), ok. handle_call({set_timeout, TimeOut}, _From, OsProc) -> {reply, ok, OsProc#os_proc{timeout=TimeOut}}; -handle_call({write, Data}, _From, OsProc) -> - Writer = OsProc#os_proc.writer, - {reply, Writer(OsProc, Data), OsProc}; -handle_call(read, _From, OsProc) -> - Reader = OsProc#os_proc.reader, - {reply, Reader(OsProc), OsProc}; handle_call({prompt, Data}, _From, OsProc) -> #os_proc{writer=Writer, reader=Reader} = OsProc, - Writer(OsProc, Data), - Result = try Reader(OsProc) of - Ok -> {ok, Ok} + try + Writer(OsProc, Data), + {reply, {ok, Reader(OsProc)}, OsProc} catch throw:OsError -> - {error, OsError} - end, - {reply, Result, OsProc}. + {stop, normal, OsError, OsProc} + end. -handle_cast({async, Data, CallBack}, OsProc) -> - #os_proc{writer=Writer, reader=Reader} = OsProc, - Writer(OsProc, Data), - CallBack(Reader(OsProc)), - {noreply, OsProc}; handle_cast(stop, OsProc) -> {stop, normal, OsProc}; handle_cast(Msg, OsProc) -> diff --git a/src/couchdb/couch_query_servers.erl b/src/couchdb/couch_query_servers.erl index 7c82abe7..ef2bde3b 100644 --- a/src/couchdb/couch_query_servers.erl +++ b/src/couchdb/couch_query_servers.erl @@ -118,7 +118,7 @@ recombine_reduce_results([<<"_", _/binary>>|RedSrcs], OsResults, [BRes|BuiltinRe recombine_reduce_results([_OsFun|RedSrcs], [OsR|OsResults], BuiltinResults, Acc) -> recombine_reduce_results(RedSrcs, OsResults, BuiltinResults, [OsR|Acc]). -os_reduce(Lang, [], KVs) -> +os_reduce(_Lang, [], _KVs) -> {ok, []}; os_reduce(Lang, OsRedSrcs, KVs) -> Pid = get_os_process(Lang), @@ -130,7 +130,7 @@ os_reduce(Lang, OsRedSrcs, KVs) -> end, {ok, OsResults}. -builtin_reduce(_Re, [], KVs, Acc) -> +builtin_reduce(_Re, [], _KVs, Acc) -> {ok, lists:reverse(Acc)}; builtin_reduce(Re, [<<"_sum">>|BuiltinReds], KVs, Acc) -> Sum = builtin_sum_rows(KVs), @@ -242,6 +242,7 @@ init([]) -> lists:foreach(fun({Lang, Command}) -> true = ets:insert(Langs, {?l2b(Lang), Command}) end, couch_config:get("query_servers")), + process_flag(trap_exit, true), {ok, {Langs, PidLangs, Pids, InUse}}. terminate(_Reason, _Server) -> @@ -255,13 +256,15 @@ handle_call({get_proc, Lang}, _From, {Langs, PidLangs, Pids, InUse}=Server) -> add_value(PidLangs, Pid, Lang), rem_from_list(Pids, Lang, Pid), add_to_list(InUse, Lang, Pid), - QueryConfig = get_query_server_config(), - true = couch_os_process:prompt(Pid, [<<"reset">>, QueryConfig]), - {reply, Pid, Server}; + {reply, {recycled, Pid, get_query_server_config()}, Server}; _ -> - {ok, Pid} = new_process(Langs, Lang), - add_to_list(InUse, Lang, Pid), - {reply, Pid, Server} + case (catch new_process(Langs, Lang)) of + {ok, Pid} -> + add_to_list(InUse, Lang, Pid), + {reply, {new, Pid}, Server}; + Error -> + {reply, Error, Server} + end end; handle_call({ret_proc, Lang, Pid}, _From, {_, _, Pids, InUse}=Server) -> % Along with max process limit, here we should check @@ -273,22 +276,20 @@ handle_call({ret_proc, Lang, Pid}, _From, {_, _, Pids, InUse}=Server) -> handle_cast(_Whatever, Server) -> {noreply, Server}. -handle_info({'EXIT', Pid, Status}, {Langs, PidLangs, Pids, InUse}) -> +handle_info({'EXIT', Pid, Status}, {_, PidLangs, Pids, InUse}=Server) -> case ets:lookup(PidLangs, Pid) of [{Pid, Lang}] -> case Status of normal -> ok; _ -> ?LOG_DEBUG("Linked process died abnormally: ~p (reason: ~p)", [Pid, Status]) end, - {ok, { - Langs, - rem_value(PidLangs, Pid), - rem_from_list(Pids, Lang, Pid), - rem_from_list(InUse, Lang, Pid) - }}; + rem_value(PidLangs, Pid), + catch rem_from_list(Pids, Lang, Pid), + catch rem_from_list(InUse, Lang, Pid), + {noreply, Server}; [] -> ?LOG_DEBUG("Unknown linked process died: ~p (reason: ~p)", [Pid, Status]), - {ok, {Langs, PidLangs, Pids, InUse}} + {stop, Status, Server} end. code_change(_OldVsn, State, _Extra) -> @@ -302,27 +303,45 @@ get_query_server_config() -> {[{<<"reduce_limit">>, ReduceLimit}]}. new_process(Langs, Lang) -> - Proc = case ets:lookup(Langs, Lang) of [{Lang, Command}] -> couch_os_process:start_link(Command); _ -> - throw({unknown_query_language, Lang}) - end, - Proc. + {unknown_query_language, Lang} + end. get_os_process(Lang) -> - gen_server:call(couch_query_servers, {get_proc, Lang}). + case gen_server:call(couch_query_servers, {get_proc, Lang}) of + {new, Pid} -> + couch_os_process:set_timeout(Pid, list_to_integer(couch_config:get( + "couchdb", "os_process_timeout", "5000"))), + link(Pid), + Pid; + {recycled, Pid, QueryConfig} -> + case (catch couch_os_process:prompt(Pid, [<<"reset">>, QueryConfig])) of + true -> + couch_os_process:set_timeout(Pid, list_to_integer(couch_config:get( + "couchdb", "os_process_timeout", "5000"))), + link(Pid), + Pid; + _ -> + catch couch_os_process:stop(Pid), + get_os_process(Lang) + end; + Error -> + throw(Error) + end. ret_os_process(Lang, Pid) -> true = gen_server:call(couch_query_servers, {ret_proc, Lang, Pid}), + catch unlink(Pid), ok. add_value(Tid, Key, Value) -> true = ets:insert(Tid, {Key, Value}). rem_value(Tid, Key) -> - true = ets:insert(Tid, Key). + true = ets:delete(Tid, Key). add_to_list(Tid, Key, Value) -> case ets:lookup(Tid, Key) of diff --git a/src/couchdb/couch_server_sup.erl b/src/couchdb/couch_server_sup.erl index 344073a3..e59286f3 100644 --- a/src/couchdb/couch_server_sup.erl +++ b/src/couchdb/couch_server_sup.erl @@ -32,8 +32,10 @@ start_link(IniFiles) -> end. restart_core_server() -> + supervisor:terminate_child(couch_secondary_services, couch_server), supervisor:terminate_child(couch_primary_services, couch_server), - supervisor:restart_child(couch_primary_services, couch_server). + supervisor:restart_child(couch_primary_services, couch_server), + supervisor:restart_child(couch_secondary_services, couch_server). couch_config_start_link_wrapper(IniFiles, FirstConfigPid) -> case is_process_alive(FirstConfigPid) of diff --git a/src/couchdb/couch_view.erl b/src/couchdb/couch_view.erl index b74fb47d..301ea8b7 100644 --- a/src/couchdb/couch_view.erl +++ b/src/couchdb/couch_view.erl @@ -260,7 +260,7 @@ handle_call({start_group_server, DbName, GroupId}, _From, #server{root_dir=Root} case ets:lookup(group_servers_by_name, {DbName, GroupId}) of [] -> ?LOG_DEBUG("Spawning new group server for view group ~s in database ~s.", [GroupId, DbName]), - case couch_view_group:start_link({view, Root, DbName, GroupId}) of + case (catch couch_view_group:start_link({view, Root, DbName, GroupId})) of {ok, NewPid} -> add_to_ets(NewPid, DbName, GroupId), {reply, {ok, NewPid}, Server}; diff --git a/src/couchdb/couch_view_group.erl b/src/couchdb/couch_view_group.erl index 28679927..0ab1077e 100644 --- a/src/couchdb/couch_view_group.erl +++ b/src/couchdb/couch_view_group.erl @@ -274,9 +274,10 @@ handle_info({'DOWN',_,_,_,_}, State) -> {stop, normal, reply_all(State, shutdown)}. -terminate(Reason, State) -> - reply_all(State, Reason), - couch_util:terminate_linked(Reason), +terminate(Reason, #group_state{updater_pid=Update, compactor_pid=Compact}=S) -> + reply_all(S, Reason), + catch exit(Update, Reason), + catch exit(Compact, Reason), ok. code_change(_OldVsn, State, _Extra) -> diff --git a/src/couchdb/priv/couchspawnkillable b/src/couchdb/priv/couchspawnkillable new file mode 100644 index 00000000..3d85ea3c --- /dev/null +++ b/src/couchdb/priv/couchspawnkillable @@ -0,0 +1,20 @@ +#! /bin/sh -e + +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +# The purpose of this script is to echo an OS specific command before launching +# the actual process. This provides a way for Erlang to hard-kill its external +# processes. + +echo "kill -9 $$" +exec $* \ No newline at end of file -- cgit v1.2.3