Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

direct_failure_detector: increase ping timeout and make it tunable #18443

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions db/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"\n"
"Related information: Failure detection and recovery")
, failure_detector_timeout_in_ms(this, "failure_detector_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 20 * 1000, "Maximum time between two successful echo message before gossip mark a node down in milliseconds.\n")
, direct_failure_detector_ping_timeout_in_ms(this, "direct_failure_detector_ping_timeout_in_ms", value_status::Used, 600, "Duration after which the direct failure detector aborts a ping message, so the next ping can start.\n"
"Note: this failure detector is used by Raft, and is different from gossiper's failure detector (configured by `failure_detector_timeout_in_ms`).\n")
/**
* @Group Performance tuning properties
* @GroupDescription Tuning performance and system resource utilization, including commit log, compaction, memory, disk I/O, CPU, reads, and writes.
Expand Down
1 change: 1 addition & 0 deletions db/config.hh
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ public:
named_value<bool> snapshot_before_compaction;
named_value<uint32_t> phi_convict_threshold;
named_value<uint32_t> failure_detector_timeout_in_ms;
named_value<uint32_t> direct_failure_detector_ping_timeout_in_ms;
named_value<sstring> commitlog_sync;
named_value<uint32_t> commitlog_segment_size_in_mb;
named_value<uint32_t> schema_commitlog_segment_size_in_mb;
Expand Down
19 changes: 10 additions & 9 deletions direct_failure_detector/failure_detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ struct failure_detector::impl {
clock& _clock;

clock::interval_t _ping_period;
clock::interval_t _ping_timeout;

// Number of workers on each shard.
// We use this to decide where to create new workers (we pick a shard with the smallest number of workers).
Expand Down Expand Up @@ -138,7 +139,7 @@ struct failure_detector::impl {
// The unregistering process requires cross-shard operations which we perform on this fiber.
future<> _destroy_subscriptions = make_ready_future<>();

impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period);
impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
~impl();

// Inform update_endpoint_fiber() about an added/removed endpoint.
Expand Down Expand Up @@ -174,12 +175,14 @@ struct failure_detector::impl {
future<> mark(listener* l, pinger::endpoint_id ep, bool alive);
};

failure_detector::failure_detector(pinger& pinger, clock& clock, clock::interval_t ping_period)
: _impl(std::make_unique<impl>(*this, pinger, clock, ping_period))
failure_detector::failure_detector(
pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
: _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
{}

failure_detector::impl::impl(failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period)
: _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period) {
failure_detector::impl::impl(
failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
: _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
if (this_shard_id() != 0) {
return;
}
Expand Down Expand Up @@ -536,11 +539,9 @@ future<> endpoint_worker::ping_fiber() noexcept {
auto start = clock.now();
auto next_ping_start = start + _fd._ping_period;

// A ping should take significantly less time than _ping_period, but we give it a multiple of ping_period before it times out
// just in case of transient network partitions.
// However, if there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
auto timeout = start + _fd._ping_timeout;
// If there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
// the listener (mark it as dead).
auto timeout = start + 3 * _fd._ping_period;
for (auto& [threshold, l]: _fd._listeners_liveness) {
if (l.endpoint_liveness[_id].alive && last_response + threshold < timeout) {
timeout = last_response + threshold;
Expand Down
14 changes: 7 additions & 7 deletions direct_failure_detector/failure_detector.hh
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ public:

// Every endpoint in the detected set will be periodically pinged every `ping_period`,
// assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
// before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
// will start immediately.
//
// `ping_period` should be chosen so that during normal operation, a ping takes significantly
// less time than `ping_period` (preferably at least an order of magnitude less).
// before it's aborted (up to `ping_timeout`), in which case the next ping will start immediately.
//
// The passed-in value must be the same on every shard.
clock::interval_t ping_period
clock::interval_t ping_period,

// Duration after which a ping is aborted, so that next ping can be started
// (pings are sent sequentially).
clock::interval_t ping_timeout
);

~failure_detector();
Expand All @@ -147,7 +147,7 @@ public:
// The listener stops being called when the returned subscription is destroyed.
// The subscription must be destroyed before service is stopped.
//
// `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
// `threshold` should be significantly larger than `ping_timeout`, preferably at least an order of magnitude larger.
//
// Different listeners may use different thresholds, depending on the use case:
// some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important
Expand Down
3 changes: 2 additions & 1 deletion main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1367,7 +1367,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
supervisor::notify("starting direct failure detector service");
fd.start(
std::ref(fd_pinger), std::ref(fd_clock),
service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count()).get();

auto stop_fd = defer_verbose_shutdown("direct_failure_detector", [] {
fd.stop().get();
Expand Down
2 changes: 1 addition & 1 deletion service/raft/raft_group_registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ seastar::future<> raft_group_registry::start() {
// then to send VoteRequest messages.
init_rpc_verbs();

direct_fd_clock::base::duration threshold{std::chrono::seconds{1}};
direct_fd_clock::base::duration threshold{std::chrono::seconds{2}};
if (const auto ms = utils::get_local_injector().inject_parameter<int64_t>("raft-group-registry-fd-threshold-in-ms"); ms) {
threshold = direct_fd_clock::base::duration{std::chrono::milliseconds{*ms}};
}
Expand Down
3 changes: 2 additions & 1 deletion test/lib/cql_test_env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,8 @@ class single_node_cql_env : public cql_test_env {
service::direct_fd_clock fd_clock;
_fd.start(
std::ref(_fd_pinger), std::ref(fd_clock),
service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
service::direct_fd_clock::base::duration{std::chrono::milliseconds{600}}.count()).get();

auto stop_fd = defer([this] {
_fd.stop().get();
Expand Down
2 changes: 1 addition & 1 deletion test/raft/failure_detector_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ SEASTAR_TEST_CASE(failure_detector_test) {
test_pinger pinger;
test_clock clock;
sharded<direct_failure_detector::failure_detector> fd;
co_await fd.start(std::ref(pinger), std::ref(clock), 10);
co_await fd.start(std::ref(pinger), std::ref(clock), 10, 30);

test_listener l1, l2;
auto sub1 = co_await fd.local().register_listener(l1, 95);
Expand Down
3 changes: 2 additions & 1 deletion test/raft/randomized_nemesis_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1421,14 +1421,15 @@ class raft_server {
future<> start() {
// TODO: make it adjustable
static const raft::logical_clock::duration fd_ping_period = 10_t;
static const raft::logical_clock::duration fd_ping_timeout = 30_t;

assert(!_started);
_started = true;

// _fd_service must be started before raft server,
// because as soon as raft server is started, it may start adding endpoints to the service.
// _fd_service is using _server's RPC, but not until the first endpoint is added.
co_await _fd_service->start(std::ref(*_fd_pinger), std::ref(*_fd_clock), fd_ping_period.count());
co_await _fd_service->start(std::ref(*_fd_pinger), std::ref(*_fd_clock), fd_ping_period.count(), fd_ping_timeout.count());
_fd_subscription.emplace(co_await _fd_service->local().register_listener(*_fd_listener, _fd_convict_threshold.count()));
co_await _server->start();
}
Expand Down
1 change: 1 addition & 0 deletions test/topology_custom/test_raft_no_quorum.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
# loop inside do_on_leader_with_retries.

config = {
'direct_failure_detector_ping_timeout_in_ms': 300,
'error_injections_at_startup': [
{
'name': 'group0-raft-op-timeout-in-ms',
Expand Down