Added new commit statistics metrics #10993

Open · wants to merge 3 commits into base: main
96 changes: 96 additions & 0 deletions fdbclient/Schemas.cpp
@@ -280,6 +280,102 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"p99":0.0,
"p99.9":0.0
},
"commit_batch_transactions":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_batch_bytes":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_batching_waiting":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_preresolution_latency":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_resolution_latency":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_postresolution_latency":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_tlog_logging_latency":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"commit_reply_latency":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"grv_latency_bands":{
"$map": 1
},
26 changes: 22 additions & 4 deletions fdbserver/CommitProxyServer.actor.cpp
@@ -912,6 +912,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
r->value().emplace_back(versionReply.resolverChangesVersion, it.dest);
}

pProxyCommitData->stats.commitPreresolutionLatency.addMeasurement(now() - startTime);
//TraceEvent("ProxyGotVer", pProxyContext->dbgid).detail("Commit", commitVersion).detail("Prev", prevVersion);

if (debugID.present()) {
@@ -1053,7 +1054,10 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
std::vector<ResolveTransactionBatchReply> resolutionResp = wait(getAll(replies));
self->resolution.swap(*const_cast<std::vector<ResolveTransactionBatchReply>*>(&resolutionResp));

self->pProxyCommitData->stats.resolutionDist->sampleSeconds(g_network->timer_monotonic() - resolutionStart);
double resolutionDuration = g_network->timer_monotonic() - resolutionStart;

self->pProxyCommitData->stats.commitResolutionLatency.addMeasurement(resolutionDuration);
self->pProxyCommitData->stats.resolutionDist->sampleSeconds(resolutionDuration);
if (self->debugID.present()) {
g_traceBatch.addEvent(
"CommitDebug", self->debugID.get().first(), "CommitProxyServer.commitBatch.AfterResolution");
@@ -2326,7 +2330,10 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
}
}

pProxyCommitData->stats.processingMutationDist->sampleSeconds(g_network->timer_monotonic() - postResolutionQueuing);
double postResolutionEnd = g_network->timer_monotonic();

pProxyCommitData->stats.commitPostresolutionLatency.addMeasurement(postResolutionEnd - postResolutionStart);
pProxyCommitData->stats.processingMutationDist->sampleSeconds(postResolutionEnd - postResolutionQueuing);
return Void();
}

@@ -2365,7 +2372,11 @@ ACTOR Future<Void> transactionLogging(CommitBatchContext* self) {
pProxyCommitData->txsPopVersions.emplace_back(self->commitVersion, self->msg.popTo);
}
pProxyCommitData->logSystem->popTxs(self->msg.popTo);
pProxyCommitData->stats.tlogLoggingDist->sampleSeconds(g_network->timer_monotonic() - tLoggingStart);

double tLoggingDuration = g_network->timer_monotonic() - tLoggingStart;

pProxyCommitData->stats.commitTLogLoggingLatency.addMeasurement(tLoggingDuration);
pProxyCommitData->stats.tlogLoggingDist->sampleSeconds(tLoggingDuration);
return Void();
}

@@ -2485,6 +2496,7 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
// TODO: filter if pipelined with large commit
const double duration = endTime - tr.requestTime();
pProxyCommitData->stats.commitLatencySample.addMeasurement(duration);
pProxyCommitData->stats.commitBatchingWaiting.addMeasurement(self->startTime - tr.requestTime());
if (pProxyCommitData->latencyBandConfig.present()) {
bool filter = self->maxTransactionBytes >
pProxyCommitData->latencyBandConfig.get().commitConfig.maxCommitBytes.orDefault(
@@ -2549,7 +2561,11 @@ pProxyCommitData->commitBatchesMemBytesCount -= self->currentBatchMemBytesCount
pProxyCommitData->commitBatchesMemBytesCount -= self->currentBatchMemBytesCount;
ASSERT_ABORT(pProxyCommitData->commitBatchesMemBytesCount >= 0);
wait(self->releaseFuture);
pProxyCommitData->stats.replyCommitDist->sampleSeconds(g_network->timer_monotonic() - replyStart);

double replyDuration = g_network->timer_monotonic() - replyStart;

pProxyCommitData->stats.commitReplyLatency.addMeasurement(replyDuration);
pProxyCommitData->stats.replyCommitDist->sampleSeconds(replyDuration);
return Void();
}

@@ -2571,6 +2587,8 @@ ACTOR Future<Void> commitBatch(ProxyCommitData* self,
context.pProxyCommitData->lastVersionTime = context.startTime;
++context.pProxyCommitData->stats.commitBatchIn;
context.setupTraceBatch();
context.pProxyCommitData->stats.commitBatchBytes.addMeasurement(context.currentBatchMemBytesCount);
context.pProxyCommitData->stats.commitBatchTransactions.addMeasurement(context.trs.size());

/////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined
/// and *should* be available by now (unless empty commit); ordered; currently atomic but could yield)
58 changes: 54 additions & 4 deletions fdbserver/Status.actor.cpp
@@ -669,6 +669,46 @@ struct RolesInfo {
if (commitBatchingWindowSize.size()) {
obj["commit_batching_window_size"] = addLatencyStatistics(commitBatchingWindowSize);
}

TraceEventFields const& commitBatchTransactions = metrics.at("CommitBatchTransactions");
if (commitBatchTransactions.size()) {
obj["commit_batch_transactions"] = addLatencyStatistics(commitBatchTransactions);
Contributor

Since this modified status json, you need to update documentation/sphinx/source/mr-status-json-schemas.rst.inc and fdbclient/Schemas.cpp as well.

Collaborator Author

I added the new elements to fdbclient/Schemas.cpp, but there are no cluster.processes..roles objects in documentation/sphinx/source/mr-status-json-schemas.rst.inc at all.

I can add descriptions of the new metrics, but I cannot describe all the other elements of roles.
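
For illustration only: based on the stanza added to fdbclient/Schemas.cpp in this PR, an entry for one of the eight new metrics in documentation/sphinx/source/mr-status-json-schemas.rst.inc would presumably take the same shape. The exact nesting under the roles array in that file is an assumption; this is a sketch, not a committed change:

   "commit_batch_transactions":{
      "count":0,
      "min":0.0,
      "max":0.0,
      "median":0.0,
      "mean":0.0,
      "p25":0.0,
      "p90":0.0,
      "p95":0.0,
      "p99":0.0,
      "p99.9":0.0
   },

The remaining new fields (commit_batch_bytes, commit_batching_waiting, commit_preresolution_latency, commit_resolution_latency, commit_postresolution_latency, commit_tlog_logging_latency, commit_reply_latency) would follow the same pattern, and the result can be cross-checked against the output of fdbcli's status json on a cluster running this build.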

}

TraceEventFields const& commitBatchBytes = metrics.at("CommitBatchBytes");
if (commitBatchBytes.size()) {
obj["commit_batch_bytes"] = addLatencyStatistics(commitBatchBytes);
}

TraceEventFields const& commitBatchingWaiting = metrics.at("CommitBatchingWaiting");
if (commitBatchingWaiting.size()) {
obj["commit_batching_waiting"] = addLatencyStatistics(commitBatchingWaiting);
}

TraceEventFields const& commitPreresolutionLatency = metrics.at("CommitPreresolutionLatency");
if (commitPreresolutionLatency.size()) {
obj["commit_preresolution_latency"] = addLatencyStatistics(commitPreresolutionLatency);
}

TraceEventFields const& commitResolutionLatency = metrics.at("CommitResolutionLatency");
if (commitResolutionLatency.size()) {
obj["commit_resolution_latency"] = addLatencyStatistics(commitResolutionLatency);
}

TraceEventFields const& commitPostresolutionLatency = metrics.at("CommitPostresolutionLatency");
if (commitPostresolutionLatency.size()) {
obj["commit_postresolution_latency"] = addLatencyStatistics(commitPostresolutionLatency);
}

TraceEventFields const& commitTLogLoggingLatency = metrics.at("CommitTLogLoggingLatency");
if (commitTLogLoggingLatency.size()) {
obj["commit_tlog_logging_latency"] = addLatencyStatistics(commitTLogLoggingLatency);
}

TraceEventFields const& commitReplyLatency = metrics.at("CommitReplyLatency");
if (commitReplyLatency.size()) {
obj["commit_reply_latency"] = addLatencyStatistics(commitReplyLatency);
}
} catch (Error& e) {
if (e.code() != error_code_attribute_not_found) {
throw e;
@@ -2017,10 +2057,20 @@ ACTOR static Future<std::vector<std::pair<TLogInterface, EventMap>>> getTLogsAnd
ACTOR static Future<std::vector<std::pair<CommitProxyInterface, EventMap>>> getCommitProxiesAndMetrics(
Reference<AsyncVar<ServerDBInfo>> db,
std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
std::vector<std::pair<CommitProxyInterface, EventMap>> results = wait(getServerMetrics(
db->get().client.commitProxies,
address_workers,
std::vector<std::string>{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize" }));
std::vector<std::pair<CommitProxyInterface, EventMap>> results =
wait(getServerMetrics(db->get().client.commitProxies,
address_workers,
std::vector<std::string>{ "CommitLatencyMetrics",
"CommitLatencyBands",
"CommitBatchingWindowSize",
"CommitBatchTransactions",
"CommitBatchBytes",
"CommitBatchingWaiting",
"CommitPreresolutionLatency",
"CommitResolutionLatency",
"CommitPostresolutionLatency",
"CommitTLogLoggingLatency",
"CommitReplyLatency" }));

return results;
}
45 changes: 45 additions & 0 deletions fdbserver/include/fdbserver/ProxyCommitData.actor.h
@@ -79,6 +79,19 @@ struct ProxyStats {

LatencySample commitBatchingWindowSize;

// Number of transactions in the batch
LatencySample commitBatchTransactions;
// Total size, in bytes, of the transactions in the batch
LatencySample commitBatchBytes;
// How long transactions waited in the batch before commitBatch()
// started processing them
LatencySample commitBatchingWaiting;
LatencySample commitPreresolutionLatency;
LatencySample commitResolutionLatency;
LatencySample commitPostresolutionLatency;
LatencySample commitTLogLoggingLatency;
LatencySample commitReplyLatency;

LatencySample computeLatency;

Future<Void> logger;
@@ -154,6 +167,38 @@ struct ProxyStats {
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitBatchTransactions("CommitBatchTransactions",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitBatchBytes("CommitBatchBytes",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitBatchingWaiting("CommitBatchingWaiting",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitPreresolutionLatency("CommitPreresolutionLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitResolutionLatency("CommitResolutionLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitPostresolutionLatency("CommitPostresolutionLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitTLogLoggingLatency("CommitTLogLoggingLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
commitReplyLatency("CommitReplyLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
computeLatency("ComputeLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,