Skip to content

Commit

Permalink
kv: add raft.commands.pending metric
Browse files Browse the repository at this point in the history
Fixes cockroachdb#123259.

This commit adds a new metric called `raft.commands.pending`. This
metric tracks the number of Raft commands that the leaseholders on a
node are tracking as in-flight. These commands will be periodically
reproposed until they are applied or until they fail, either
unequivocally or ambiguously.

Release note: None
  • Loading branch information
nvanbenschoten committed Apr 29, 2024
1 parent 37e0fc5 commit 60ec815
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@
<tr><td>STORAGE</td><td>queue.tsmaintenance.process.failure</td><td>Number of replicas which failed processing in the time series maintenance queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>queue.tsmaintenance.process.success</td><td>Number of replicas successfully processed by the time series maintenance queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>queue.tsmaintenance.processingnanos</td><td>Nanoseconds spent processing replicas in the time series maintenance queue</td><td>Processing Time</td><td>COUNTER</td><td>NANOSECONDS</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>raft.commands.pending</td><td>Number of Raft commands proposed and pending.<br/><br/>The number of Raft commands that the leaseholders are tracking as in-flight.<br/>These commands will be periodically reproposed until they are applied or until<br/>they fail, either unequivocally or ambiguously.</td><td>Commands</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>raft.commands.proposed</td><td>Number of Raft commands proposed.<br/><br/>The number of proposals and all kinds of reproposals made by leaseholders. This<br/>metric approximates the number of commands submitted through Raft.</td><td>Commands</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>raft.commands.reproposed.new-lai</td><td>Number of Raft commands re-proposed with a newer LAI.<br/><br/>The number of Raft commands that leaseholders re-proposed with a modified LAI.<br/>Such re-proposals happen for commands that are committed to Raft out of intended<br/>order, and hence can not be applied as is.</td><td>Commands</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>raft.commands.reproposed.unchanged</td><td>Number of Raft commands re-proposed without modification.<br/><br/>The number of Raft commands that leaseholders re-proposed without modification.<br/>Such re-proposals happen for commands that are not committed/applied within a<br/>timeout, and have a high chance of being dropped.</td><td>Commands</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ func (r *Replica) QuotaReleaseQueueLen() int {
return len(r.mu.quotaReleaseQueue)
}

func (r *Replica) NumPendingProposals() int {
func (r *Replica) NumPendingProposals() int64 {
r.mu.RLock()
defer r.mu.RUnlock()
return r.numPendingProposalsRLocked()
Expand Down
12 changes: 12 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1316,6 +1316,16 @@ order, and hence can not be applied as is.`,
Measurement: "Commands",
Unit: metric.Unit_COUNT,
}
metaRaftCommandsPending = metric.Metadata{
Name: "raft.commands.pending",
Help: `Number of Raft commands proposed and pending.
The number of Raft commands that the leaseholders are tracking as in-flight.
These commands will be periodically reproposed until they are applied or until
they fail, either unequivocally or ambiguously.`,
Measurement: "Commands",
Unit: metric.Unit_COUNT,
}
metaRaftCommandsApplied = metric.Metadata{
Name: "raft.commandsapplied",
Help: `Number of Raft commands applied.
Expand Down Expand Up @@ -2710,6 +2720,7 @@ type StoreMetrics struct {
RaftCommandsProposed *metric.Counter
RaftCommandsReproposed *metric.Counter
RaftCommandsReproposedLAI *metric.Counter
RaftCommandsPending *metric.Gauge
RaftCommandsApplied *metric.Counter
RaftLogCommitLatency metric.IHistogram
RaftCommandCommitLatency metric.IHistogram
Expand Down Expand Up @@ -3425,6 +3436,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RaftCommandsProposed: metric.NewCounter(metaRaftCommandsProposed),
RaftCommandsReproposed: metric.NewCounter(metaRaftCommandsReproposed),
RaftCommandsReproposedLAI: metric.NewCounter(metaRaftCommandsReproposedLAI),
RaftCommandsPending: metric.NewGauge(metaRaftCommandsPending),
RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied),
RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/replica_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ type ReplicaMetrics struct {
RaftLogTooLarge bool
BehindCount int64
PausedFollowerCount int64
RaftProposalCount int64
SlowRaftProposalCount int64

QuotaPoolPercentUsed int64 // [0,100]
Expand Down Expand Up @@ -110,6 +111,7 @@ func (r *Replica) Metrics(
qpUsed: qpUsed,
qpCapacity: qpCap,
paused: r.mu.pausedFollowers,
raftProposalCount: r.numPendingProposalsRLocked(),
slowRaftProposalCount: r.mu.slowProposalCount,
}

Expand Down Expand Up @@ -137,6 +139,7 @@ type calcReplicaMetricsInput struct {
raftLogSizeTrusted bool
qpUsed, qpCapacity int64 // quota pool used and capacity bytes
paused map[roachpb.ReplicaID]struct{}
raftProposalCount int64
slowRaftProposalCount int64
}

Expand Down Expand Up @@ -192,6 +195,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
BehindCount: leaderBehindCount,
PausedFollowerCount: leaderPausedFollowerCount,
RaftProposalCount: d.raftProposalCount,
SlowRaftProposalCount: d.slowRaftProposalCount,
QuotaPoolPercentUsed: calcQuotaPoolPercentUsed(d.qpUsed, d.qpCapacity),
LatchMetrics: d.latchMetrics,
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,8 @@ func checkReplicationChangeAllowed(
return nil
}

func (r *Replica) numPendingProposalsRLocked() int {
return len(r.mu.proposals) + r.mu.proposalBuf.AllocatedIdx()
func (r *Replica) numPendingProposalsRLocked() int64 {
return int64(len(r.mu.proposals) + r.mu.proposalBuf.AllocatedIdx())
}

// hasPendingProposalsRLocked is part of the quiescer interface.
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -3182,6 +3182,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
behindCount int64
pausedFollowerCount int64
ioOverload float64
raftProposalCount int64
slowRaftProposalCount int64

locks int64
Expand Down Expand Up @@ -3260,6 +3261,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
}
}
pausedFollowerCount += metrics.PausedFollowerCount
raftProposalCount += metrics.RaftProposalCount
slowRaftProposalCount += metrics.SlowRaftProposalCount
behindCount += metrics.BehindCount
loadStats := rep.loadStats.Stats()
Expand Down Expand Up @@ -3322,6 +3324,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
s.metrics.RaftPausedFollowerCount.Update(pausedFollowerCount)
s.metrics.IOOverload.Update(ioOverload)
s.metrics.RaftCommandsPending.Update(raftProposalCount)
s.metrics.SlowRaftRequests.Update(slowRaftProposalCount)

var averageLockHoldDurationNanos int64
Expand Down

0 comments on commit 60ec815

Please sign in to comment.