Skip to content

Commit

Permalink
NetworkTransport make pipelining configurable and default to max 2 in…
Browse files Browse the repository at this point in the history
… flight. (#541)

* NetworkTransport make pipelining configurable and default to max 2 inflight

* Add PR link

* Fix net transport test

* Use constants instead of magic numbers to express the semantics

* Doc comment tweaks
  • Loading branch information
banks committed Apr 21, 2023
1 parent 34bcd7c commit 8fdc4ce
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 80 deletions.
93 changes: 81 additions & 12 deletions net_transport.go
Expand Up @@ -28,9 +28,11 @@ const (
// DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport.
DefaultTimeoutScale = 256 * 1024 // 256KB

// rpcMaxPipeline controls the maximum number of outstanding
// AppendEntries RPC calls.
rpcMaxPipeline = 128
// DefaultMaxRPCsInFlight is the default value used for pipelining configuration
// if a zero value is passed. See https://github.com/hashicorp/raft/pull/541
// for rationale. Note, if this is changed we should update the doc comments
// below for NetworkTransportConfig.MaxRPCsInFlight.
DefaultMaxRPCsInFlight = 2

// connReceiveBufferSize is the size of the buffer we will use for reading RPC requests into
// on followers
Expand All @@ -39,6 +41,16 @@ const (
// connSendBufferSize is the size of the buffer we will use for sending RPC request data from
// the leader to followers.
connSendBufferSize = 256 * 1024 // 256KB

// minInFlightForPipelining is a property of our current pipelining
// implementation and must not be changed unless we change the invariants of
// that implementation. Roughly speaking even with a zero-length in-flight
// buffer we still allow 2 requests to be in-flight before we block because we
// only block after sending and the receiving go-routine always unblocks the
// chan right after first send. This is a constant just to provide context
// rather than a magic number in a few places we have to check invariants to
// avoid panics etc.
minInFlightForPipelining = 2
)

var (
Expand Down Expand Up @@ -76,7 +88,8 @@ type NetworkTransport struct {

logger hclog.Logger

maxPool int
maxPool int
maxInFlight int

serverAddressProvider ServerAddressProvider

Expand Down Expand Up @@ -108,6 +121,39 @@ type NetworkTransportConfig struct {
// MaxPool controls how many connections we will pool
MaxPool int

// MaxRPCsInFlight controls the pipelining "optimization" when replicating
// entries to followers.
//
// Setting this to 1 explicitly disables pipelining since no overlapping of
// request processing is allowed. If set to 1 the pipelining code path is
// skipped entirely and every request is entirely synchronous.
//
// If zero is set (or left as default), DefaultMaxRPCsInFlight is used which
// is currently 2. A value of 2 overlaps the preparation and sending of the
// next request while waiting for the previous response, but avoids additional
// queuing.
//
// Historically this was internally fixed at (effectively) 130 however
// performance testing has shown that in practice the pipelining optimization
// combines badly with batching and actually has a very large negative impact
// on commit latency when throughput is high, whilst having very little
// benefit on latency or throughput in any other case! See
// [#541](https://github.com/hashicorp/raft/pull/541) for more analysis of the
// performance impacts.
//
// Increasing this beyond 2 is likely to be beneficial only in very
// high-latency network conditions. HashiCorp doesn't recommend using our own
// products this way.
//
// To maintain the behavior from before version 1.4.1 exactly, set this to
// 130. The old internal constant was 128 but was used directly as a channel
// buffer size. Since we send before blocking on the channel and unblock the
// channel as soon as the receiver is done with the earliest outstanding
// request, even an unbuffered channel (buffer=0) allows one request to be
// sent while waiting for the previous one (i.e. 2 inflight). So the old
// buffer actually allowed 130 RPCs to be inflight at once.
MaxRPCsInFlight int

// Timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
// the timeout by (SnapshotSize / TimeoutScale).
Timeout time.Duration
Expand Down Expand Up @@ -162,11 +208,17 @@ func NewNetworkTransportWithConfig(
Level: hclog.DefaultLevel,
})
}
maxInFlight := config.MaxRPCsInFlight
if maxInFlight == 0 {
// Default zero value
maxInFlight = DefaultMaxRPCsInFlight
}
trans := &NetworkTransport{
connPool: make(map[ServerAddress][]*netConn),
consumeCh: make(chan RPC),
logger: config.Logger,
maxPool: config.MaxPool,
maxInFlight: maxInFlight,
shutdownCh: make(chan struct{}),
stream: config.Stream,
timeout: config.Timeout,
Expand Down Expand Up @@ -379,14 +431,20 @@ func (n *NetworkTransport) returnConn(conn *netConn) {
// AppendEntriesPipeline returns an interface that can be used to pipeline
// AppendEntries requests to the given target server.
func (n *NetworkTransport) AppendEntriesPipeline(id ServerID, target ServerAddress) (AppendPipeline, error) {
	// Below the minimum in-flight threshold there is no request overlap to
	// exploit, so pipelining is effectively disabled. Signal that to the
	// caller so it falls back to plain synchronous AppendEntries RPCs.
	if n.maxInFlight < minInFlightForPipelining {
		return nil, ErrPipelineReplicationNotSupported
	}

	// Obtain a (possibly pooled) connection to the target.
	conn, err := n.getConnFromAddressProvider(id, target)
	if err != nil {
		return nil, err
	}

	// Wrap the connection in a pipeline bounded by the configured
	// in-flight limit.
	return newNetPipeline(n, conn, n.maxInFlight), nil
}

// AppendEntries implements the Transport interface.
Expand Down Expand Up @@ -720,14 +778,25 @@ func sendRPC(conn *netConn, rpcType uint8, args interface{}) error {
return nil
}

// newNetPipeline is used to construct a netPipeline from a given
// transport and connection.
func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline {
// newNetPipeline is used to construct a netPipeline from a given transport and
// connection. It is a bug to ever call this with maxInFlight less than 2
// (minInFlightForPipelining) and will cause a panic.
func newNetPipeline(trans *NetworkTransport, conn *netConn, maxInFlight int) *netPipeline {
if maxInFlight < minInFlightForPipelining {
// Shouldn't happen (tm) since we validate this in the one call site and
// skip pipelining if it's lower.
panic("pipelining makes no sense if maxInFlight < 2")
}
n := &netPipeline{
conn: conn,
trans: trans,
doneCh: make(chan AppendFuture, rpcMaxPipeline),
inprogressCh: make(chan *appendFuture, rpcMaxPipeline),
conn: conn,
trans: trans,
// The buffer size is 2 less than the configured max because we send before
// waiting on the channel and the decode routine unblocks the channel as
// soon as it's waiting on the first request. So a zero-buffered channel
// still allows 1 request to be sent even while decode is still waiting for
// a response from the previous one. i.e. two are inflight at the same time.
inprogressCh: make(chan *appendFuture, maxInFlight-2),
doneCh: make(chan AppendFuture, maxInFlight-2),
shutdownCh: make(chan struct{}),
}
go n.decodeResponses()
Expand Down

0 comments on commit 8fdc4ce

Please sign in to comment.