shutdown existing socket when eaddrinuse hit during socket process restart
andymck committed Aug 11, 2021
1 parent 5698251 commit 244bd9b
Showing 1 changed file with 25 additions and 42 deletions.
src/grpcbox_socket.erl (67 changes: 25 additions & 42 deletions)
@@ -16,70 +16,52 @@
 start_link(Pool, ListenOpts, AcceptorOpts) ->
     gen_server:start_link(?MODULE, [Pool, ListenOpts, AcceptorOpts], []).
 
+
 %% gen_server api
 
 init([Pool, ListenOpts, PoolOpts]) ->
+    %% Trapping exit so can close socket in terminate/2
+    _ = process_flag(trap_exit, true),
     Port = maps:get(port, ListenOpts, 8080),
     IPAddress = maps:get(ip, ListenOpts, {0, 0, 0, 0}),
     AcceptorPoolSize = maps:get(size, PoolOpts, 10),
     SocketOpts = maps:get(socket_options, ListenOpts, [{reuseaddr, true},
                                                        {nodelay, true},
                                                        {reuseaddr, true},
                                                        {backlog, 32768},
                                                        {keepalive, true}]),
-    %% Trapping exit so can close socket in terminate/2
-    _ = process_flag(trap_exit, true),
     Opts = [{active, false}, {mode, binary}, {packet, raw}, {ip, IPAddress} | SocketOpts],
     case gen_tcp:listen(Port, Opts) of
         {ok, Socket} ->
             %% acceptor could close the socket if there is a problem
             MRef = monitor(port, Socket),
-            {ok, _} = grpcbox_pool:accept_socket(Pool, Socket, AcceptorPoolSize),
+            grpcbox_pool:accept_socket(Pool, Socket, AcceptorPoolSize),
             {ok, {Socket, MRef}};
-        {error, eaddrinuse} ->
+        {error, eaddrinuse} = Error ->
             %% our desired port is already in use
-            %% its likely this grpcbox_socket server is restarting
+            %% it's likely this grpcbox_socket server has been killed (for reasons unknown) and is restarting
             %% previously it would have bound to the port before passing control to our acceptor pool
             %% the socket remains open
             %% in the restart scenario, the socket process would attempt to bind again
             %% to the port and then stop, the sup would keep restarting it
             %% and we would end up breaching the restart strategy of the parent sup
             %% eventually taking down the entire tree
             %% result of which is we have no active listener and grpcbox is effectively down
-            %% so now if we hit eaddrinuse, we check if our acceptor pool is already the
-            %% controlling process, if so we reuse the port from its state and
-            %% allow grpcbox_socket to start cleanly
+            %% so now if we hit eaddrinuse, we check if our acceptor pool is using it
+            %% if so we close the port here and stop this process
+            %% NOTE: issuing stop in init won't trigger terminate, so we can't rely on
+            %% the socket being closed there
+            %% This allows the sup to restart things cleanly
+            %% We could try to reuse the existing port rather than closing it
+            %% but side effects were encountered there, so we deliberately avoid that
 
-            %% NOTE: acceptor_pool has a grace period for connections before it terminates
-            %% grpcbox_pool sets this to a default of 5 secs
-            %% this needs considered when deciding on related supervisor restart strategies
-            %% AND keep in mind the acceptor pool will continue accepting new connections
-            %% during this grace period
 
-            %% Other possible fixes here include changing the grpcbox_services_sup from its
-            %% rest_for_one to a one_for_all strategy. This ensures the pool and thus the
-            %% current controlling process of the socket is terminated
-            %% and allows things to restart cleanly if grpcbox_socket dies
-            %% the disadvantage there however is we will drop all existing grpc connections
 
-            %% Another possible fix is to play with the restart strategy intensity and periods
-            %% and ensure the top level sup doesnt get breached but...
-            %% a requirement will be to ensure the grpcbox_service_sup forces a restart
-            %% of grpcbox_pool and therefore the acceptor_pool process
-            %% as only by doing that will be free up the socket and allow grpcbox_socket to rebind
-            %% thus we end up terminating any existing grpc connections
 
-            %% Yet another possible fix is to move the cleanup of closing the socket
-            %% out of grpcbox_socket's terminate and into acceptor_pool's terminate
-            %% that however puts two way co-ordination between two distinct libs
-            %% which is far from ideal and in addition will also result in existing grpc connection
-            %% being dropped
 
-            %% my view is, if at all possible, its better to restart the grpcbox_socket process without
-            %% impacting existing connections, the fix below allows for that, albeit a lil messy
-            %% there is most likely a better solution to all of this, TODO: revisit
+
             %% get the current sockets in use by the acceptor pool
-            %% if one is bound to our target port then reuse
+            %% if one is bound to our target port then close it
             %% need to allow for possibility of multiple services, each with its own socket
             %% so we need to identify our interested socket via port number
             PoolSockets = grpcbox_pool:pool_sockets(Pool),
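
Editor's note: the failure mode described in the comments above is easy to reproduce outside grpcbox. A minimal sketch (not part of the commit; the module name and port number are arbitrary), showing that a live listener makes a second bind fail with eaddrinuse even with reuseaddr set, and that closing the stale socket is what lets a rebind succeed:

-module(eaddrinuse_demo).
-export([run/0]).

run() ->
    Port = 18080,
    Opts = [{active, false}, {mode, binary}, {reuseaddr, true}],
    {ok, Stale} = gen_tcp:listen(Port, Opts),
    %% reuseaddr only allows rebinding over TIME_WAIT remnants; it does not
    %% permit a second live listener on the same port
    {error, eaddrinuse} = gen_tcp:listen(Port, Opts),
    ok = gen_tcp:close(Stale),
    %% with the stale socket closed, the rebind succeeds, which is the effect
    %% the new init/1 relies on across a supervisor restart
    {ok, Fresh} = gen_tcp:listen(Port, Opts),
    ok = gen_tcp:close(Fresh).
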
@@ -89,15 +71,15 @@ init([Pool, ListenOpts, PoolOpts]) ->
                                                        {ok, Socket};
                                                   (_, Acc) ->
                                                        Acc
-                                               end, {error, eaddrinuse}, PoolSockets),
+                                               end, socket_not_found, PoolSockets),
             case MaybeHaveExistingSocket of
                 {ok, Socket} ->
-                    MRef = monitor(port, Socket),
-                    {ok, {Socket, MRef}};
-                {error, Reason} ->
-                    {stop, Reason}
-            end;
-        {error, Reason}->
+                    gen_tcp:close(Socket);
+                socket_not_found ->
+                    noop
+            end,
+            Error;
+        {error, Reason} ->
             {stop, Reason}
     end.

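Editor's note: two details in the hunk above are easy to miss. First, returning Error (that is, {error, eaddrinuse}) instead of {ok, State} from init/1 fails the start: gen_server:start_link/3 does not return {ok, Pid}, so the supervisor records a failed child start and retries under its restart intensity, by which point the stale socket has already been closed. Second, the collapsed region above hides the lists:foldl/3 that locates the pool's socket bound to our port. A sketch of that lookup under assumed names (find_listen_socket/2 is illustrative, not part of the commit), where PoolSockets is the list returned by grpcbox_pool:pool_sockets/1:

find_listen_socket(Port, PoolSockets) ->
    lists:foldl(fun(Socket, socket_not_found) ->
                        %% inet:port/1 returns {ok, Port} for a bound socket
                        case inet:port(Socket) of
                            {ok, Port} -> {ok, Socket};
                            _ -> socket_not_found
                        end;
                   (_, Acc) ->
                        %% a socket was already found; carry it through
                        Acc
                end, socket_not_found, PoolSockets).
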
@@ -115,10 +97,11 @@ handle_info(_, State) ->
 code_change(_, State, _) ->
     {ok, State}.
 
-terminate(_, {Socket, MRef}) ->
+terminate(_Reason, {Socket, MRef}) ->
     %% Socket may already be down but need to ensure it is closed to avoid
     %% eaddrinuse error on restart
+    %% this takes care of that, unless of course this process is killed...
     case demonitor(MRef, [flush, info]) of
         true -> gen_tcp:close(Socket);
         false -> ok
-    end.
+    end.
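
Editor's note: in terminate/2, demonitor(MRef, [flush, info]) doubles as a liveness check. It returns true when the monitor is still armed, meaning the socket never went down and must be closed here, and false when a 'DOWN' message already fired and was flushed, meaning the socket is gone and there is nothing left to close. A standalone sketch of both branches (illustrative only; listening on port 0 asks the OS for any free port):

demo() ->
    {ok, L1} = gen_tcp:listen(0, [{active, false}]),
    MRef1 = monitor(port, L1),
    ok = gen_tcp:close(L1),
    %% the socket is dead and its 'DOWN' was queued, so a flush is needed
    %% and demonitor returns false: nothing to close
    false = demonitor(MRef1, [flush, info]),
    {ok, L2} = gen_tcp:listen(0, [{active, false}]),
    MRef2 = monitor(port, L2),
    %% the monitor is still live, so demonitor returns true and we clean up
    true = demonitor(MRef2, [flush, info]),
    ok = gen_tcp:close(L2).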
