From b80e5ef191912068f7b05c26e19a1be5211a0c01 Mon Sep 17 00:00:00 2001 From: tnasu Date: Tue, 1 Feb 2022 18:59:50 +0900 Subject: [PATCH] rpc: Add experimental config params to allow for subscription buffer size control (tm v0.34.x) (#7230) A workaround for #6729. Add parameters to control buffer sizes for event subscription RPC clients. On some networks, buffering causes clients to be dropped and/or events to be lost. For additional context, see the discussion on #7188. - Add experimental_subscription_buffer_size config parameter - Add experimental_websocket_write_buffer_size config parameter - Add experimental_close_on_slow_client config parameter Co-authored-by: M. J. Fromberger Co-authored-by: Thane Thomson --- config/config.go | 40 +++++++++++++++++++++++++++++++ config/toml.go | 27 +++++++++++++++++++++ libs/pubsub/subscription.go | 2 +- node/node.go | 1 + proxy/mocks/app_conn_consensus.go | 3 +-- proxy/mocks/app_conn_mempool.go | 3 +-- rpc/core/events.go | 24 +++++++++++++------ state/mocks/evidence_pool.go | 3 +-- state/mocks/store.go | 6 ++--- statesync/mocks/state_provider.go | 3 +-- 10 files changed, 92 insertions(+), 20 deletions(-) diff --git a/config/config.go b/config/config.go index cdbf87d52..6512faccd 100644 --- a/config/config.go +++ b/config/config.go @@ -57,6 +57,9 @@ var ( defaultNodeKeyPath = filepath.Join(defaultConfigDir, defaultNodeKeyName) defaultAddrBookPath = filepath.Join(defaultConfigDir, defaultAddrBookName) + + minSubscriptionBufferSize = 100 + defaultSubscriptionBufferSize = 200 ) // Config defines the top level configuration for an Ostracon node @@ -387,6 +390,29 @@ type RPCConfig struct { // to the estimated maximum number of broadcast_tx_commit calls per block. MaxSubscriptionsPerClient int `mapstructure:"max_subscriptions_per_client"` + // The number of events that can be buffered per subscription before + // returning `ErrOutOfCapacity`. + SubscriptionBufferSize int `mapstructure:"experimental_subscription_buffer_size"` + + // The maximum number of responses that can be buffered per WebSocket + // client. If clients cannot read from the WebSocket endpoint fast enough, + // they will be disconnected, so increasing this parameter may reduce the + // chances of them being disconnected (but will cause the node to use more + // memory). + // + // Must be at least the same as `SubscriptionBufferSize`, otherwise + // connections may be dropped unnecessarily. + WebSocketWriteBufferSize int `mapstructure:"experimental_websocket_write_buffer_size"` + + // If a WebSocket client cannot read fast enough, at present we may + // silently drop events instead of generating an error or disconnecting the + // client. + // + // Enabling this parameter will cause the WebSocket connection to be closed + // instead if it cannot read fast enough, allowing for greater + // predictability in subscription behaviour. + CloseOnSlowClient bool `mapstructure:"experimental_close_on_slow_client"` + // How long to wait for a tx to be committed during /broadcast_tx_commit // WARNING: Using a value larger than 'WriteTimeout' will result in increasing the // global HTTP write timeout, which applies to all connections and endpoints. @@ -439,7 +465,9 @@ func DefaultRPCConfig() *RPCConfig { MaxSubscriptionClients: 100, MaxSubscriptionsPerClient: 5, + SubscriptionBufferSize: defaultSubscriptionBufferSize, TimeoutBroadcastTxCommit: 10 * time.Second, + WebSocketWriteBufferSize: defaultSubscriptionBufferSize, MaxBodyBytes: int64(1000000), // 1MB MaxHeaderBytes: 1 << 20, // same as the net/http default @@ -482,6 +510,18 @@ func (cfg *RPCConfig) ValidateBasic() error { if cfg.MaxSubscriptionsPerClient < 0 { return errors.New("max_subscriptions_per_client can't be negative") } + if cfg.SubscriptionBufferSize < minSubscriptionBufferSize { + return fmt.Errorf( + "experimental_subscription_buffer_size must be >= %d", + minSubscriptionBufferSize, + ) + } + if cfg.WebSocketWriteBufferSize < cfg.SubscriptionBufferSize { + return fmt.Errorf( + "experimental_websocket_write_buffer_size must be >= experimental_subscription_buffer_size (%d)", + cfg.SubscriptionBufferSize, + ) + } if cfg.TimeoutBroadcastTxCommit < 0 { return errors.New("timeout_broadcast_tx_commit can't be negative") } diff --git a/config/toml.go b/config/toml.go index 8abc9a5dd..0bca78e60 100644 --- a/config/toml.go +++ b/config/toml.go @@ -229,6 +229,33 @@ max_subscription_clients = {{ .RPC.MaxSubscriptionClients }} # the estimated # maximum number of broadcast_tx_commit calls per block. max_subscriptions_per_client = {{ .RPC.MaxSubscriptionsPerClient }} +# Experimental parameter to specify the maximum number of events a node will +# buffer, per subscription, before returning an error and closing the +# subscription. Must be set to at least 100, but higher values will accommodate +# higher event throughput rates (and will use more memory). +experimental_subscription_buffer_size = {{ .RPC.SubscriptionBufferSize }} + +# Experimental parameter to specify the maximum number of RPC responses that +# can be buffered per WebSocket client. If clients cannot read from the +# WebSocket endpoint fast enough, they will be disconnected, so increasing this +# parameter may reduce the chances of them being disconnected (but will cause +# the node to use more memory). +# +# Must be at least the same as "experimental_subscription_buffer_size", +# otherwise connections could be dropped unnecessarily. This value should +# ideally be somewhat higher than "experimental_subscription_buffer_size" to +# accommodate non-subscription-related RPC responses. +experimental_websocket_write_buffer_size = {{ .RPC.WebSocketWriteBufferSize }} + +# If a WebSocket client cannot read fast enough, at present we may +# silently drop events instead of generating an error or disconnecting the +# client. +# +# Enabling this experimental parameter will cause the WebSocket connection to +# be closed instead if it cannot read fast enough, allowing for greater +# predictability in subscription behaviour. +experimental_close_on_slow_client = {{ .RPC.CloseOnSlowClient }} + # How long to wait for a tx to be committed during /broadcast_tx_commit. # WARNING: Using a value larger than 'WriteTimeout' will result in increasing the # global HTTP write timeout, which applies to all connections and endpoints. diff --git a/libs/pubsub/subscription.go b/libs/pubsub/subscription.go index 816ec5e9e..094b4a98f 100644 --- a/libs/pubsub/subscription.go +++ b/libs/pubsub/subscription.go @@ -12,7 +12,7 @@ var ( // ErrOutOfCapacity is returned by Err when a client is not pulling messages // fast enough. Note the client's subscription will be terminated. - ErrOutOfCapacity = errors.New("client is not pulling messages fast enough") + ErrOutOfCapacity = errors.New("internal subscription event buffer is out of capacity") ) // A Subscription represents a client subscription for a particular query and diff --git a/node/node.go b/node/node.go index 681d2b62f..f2e309f04 100644 --- a/node/node.go +++ b/node/node.go @@ -1115,6 +1115,7 @@ func (n *Node) startRPC() ([]net.Listener, error) { } }), rpcserver.ReadLimit(config.MaxBodyBytes), + rpcserver.WriteChanCapacity(n.config.RPC.WebSocketWriteBufferSize), ) wm.SetLogger(wmLogger) mux.HandleFunc("/websocket", wm.WebsocketHandler) diff --git a/proxy/mocks/app_conn_consensus.go b/proxy/mocks/app_conn_consensus.go index a21beed72..1e9bf67ec 100644 --- a/proxy/mocks/app_conn_consensus.go +++ b/proxy/mocks/app_conn_consensus.go @@ -4,9 +4,8 @@ package mocks import ( abcicli "github.com/line/ostracon/abci/client" - mock "github.com/stretchr/testify/mock" - types "github.com/line/ostracon/abci/types" + mock "github.com/stretchr/testify/mock" ) // AppConnConsensus is an autogenerated mock type for the AppConnConsensus type diff --git a/proxy/mocks/app_conn_mempool.go b/proxy/mocks/app_conn_mempool.go index 24f19bcef..86e59cb40 100644 --- a/proxy/mocks/app_conn_mempool.go +++ b/proxy/mocks/app_conn_mempool.go @@ -4,9 +4,8 @@ package mocks import ( abcicli "github.com/line/ostracon/abci/client" - mock "github.com/stretchr/testify/mock" - types "github.com/line/ostracon/abci/types" + mock "github.com/stretchr/testify/mock" ) // AppConnMempool is an autogenerated mock type for the AppConnMempool type diff --git a/rpc/core/events.go b/rpc/core/events.go index 07568fd63..c5cb163fd 100644 --- a/rpc/core/events.go +++ b/rpc/core/events.go @@ -2,6 +2,7 @@ package core import ( "context" + "errors" "fmt" "time" @@ -11,11 +12,6 @@ import ( rpctypes "github.com/line/ostracon/rpc/jsonrpc/types" ) -const ( - // Buffer on the Ostracon (server) side to allow some slowness in clients. - subBufferSize = 100 -) - // Subscribe for events via WebSocket. // More: https://docs.tendermint.com/master/rpc/#/Websocket/subscribe func Subscribe(ctx *rpctypes.Context, query string) (*ctypes.ResultSubscribe, error) { @@ -37,11 +33,13 @@ func Subscribe(ctx *rpctypes.Context, query string) (*ctypes.ResultSubscribe, er subCtx, cancel := context.WithTimeout(ctx.Context(), SubscribeTimeout) defer cancel() - sub, err := env.EventBus.Subscribe(subCtx, addr, q, subBufferSize) + sub, err := env.EventBus.Subscribe(subCtx, addr, q, env.Config.SubscriptionBufferSize) if err != nil { return nil, err } + closeIfSlow := env.Config.CloseOnSlowClient + // Capture the current ID, since it can change in the future. subscriptionID := ctx.JSONReq.ID go func() { @@ -57,6 +55,18 @@ func Subscribe(ctx *rpctypes.Context, query string) (*ctypes.ResultSubscribe, er if err := ctx.WSConn.WriteRPCResponse(writeCtx, resp); err != nil { env.Logger.Info("Can't write response (slow client)", "to", addr, "subscriptionID", subscriptionID, "err", err) + + if closeIfSlow { + var ( + err = errors.New("subscription was cancelled (reason: slow client)") + resp = rpctypes.RPCServerError(subscriptionID, err) + ) + if !ctx.WSConn.TryWriteRPCResponse(resp) { + env.Logger.Info("Can't write response (slow client)", + "to", addr, "subscriptionID", subscriptionID, "err", err) + } + return + } } case <-sub.Cancelled(): if sub.Err() != tmpubsub.ErrUnsubscribed { @@ -70,7 +80,7 @@ func Subscribe(ctx *rpctypes.Context, query string) (*ctypes.ResultSubscribe, er err = fmt.Errorf("subscription was cancelled (reason: %s)", reason) resp = rpctypes.RPCServerError(subscriptionID, err) ) - if ok := ctx.WSConn.TryWriteRPCResponse(resp); !ok { + if !ctx.WSConn.TryWriteRPCResponse(resp) { env.Logger.Info("Can't write response (slow client)", "to", addr, "subscriptionID", subscriptionID, "err", err) } diff --git a/state/mocks/evidence_pool.go b/state/mocks/evidence_pool.go index 44217d27e..379a0077e 100644 --- a/state/mocks/evidence_pool.go +++ b/state/mocks/evidence_pool.go @@ -3,10 +3,9 @@ package mocks import ( - mock "github.com/stretchr/testify/mock" state "github.com/line/ostracon/state" - types "github.com/line/ostracon/types" + mock "github.com/stretchr/testify/mock" ) // EvidencePool is an autogenerated mock type for the EvidencePool type diff --git a/state/mocks/store.go b/state/mocks/store.go index bf12cd27a..028242fdb 100644 --- a/state/mocks/store.go +++ b/state/mocks/store.go @@ -4,12 +4,10 @@ package mocks import ( ostraconstate "github.com/line/ostracon/proto/ostracon/state" + types "github.com/line/ostracon/proto/ostracon/types" + state "github.com/line/ostracon/state" ostracontypes "github.com/line/ostracon/types" mock "github.com/stretchr/testify/mock" - - state "github.com/line/ostracon/state" - - types "github.com/line/ostracon/proto/ostracon/types" ) // Store is an autogenerated mock type for the Store type diff --git a/statesync/mocks/state_provider.go b/statesync/mocks/state_provider.go index b27a263bf..a0b9f5de5 100644 --- a/statesync/mocks/state_provider.go +++ b/statesync/mocks/state_provider.go @@ -5,10 +5,9 @@ package mocks import ( context "context" - mock "github.com/stretchr/testify/mock" state "github.com/line/ostracon/state" - types "github.com/line/ostracon/types" + mock "github.com/stretchr/testify/mock" ) // StateProvider is an autogenerated mock type for the StateProvider type