scylladb · kbr-scylla · Apr 26, 2024
diff --git a/db/config.cc b/db/config.cc
@@ -515,6 +515,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
         "\n"
         "Related information: Failure detection and recovery")
     , failure_detector_timeout_in_ms(this, "failure_detector_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 20 * 1000, "Maximum time between two successful echo message before gossip mark a node down in milliseconds.\n")
+    , direct_failure_detector_ping_timeout_in_ms(this, "direct_failure_detector_ping_timeout_in_ms", value_status::Used, 600, "Duration after which the direct failure detector aborts a ping message, so the next ping can start.\n"
+        "Note: this failure detector is used by Raft, and is different from gossiper's failure detector (configured by `failure_detector_timeout_in_ms`).\n")
     /**
     * @Group Performance tuning properties
     * @GroupDescription Tuning performance and system resource utilization, including commit log, compaction, memory, disk I/O, CPU, reads, and writes.

diff --git a/db/config.hh b/db/config.hh
@@ -211,6 +211,7 @@ public:
     named_value<bool> snapshot_before_compaction;
     named_value<uint32_t> phi_convict_threshold;
     named_value<uint32_t> failure_detector_timeout_in_ms;
+    named_value<uint32_t> direct_failure_detector_ping_timeout_in_ms;
     named_value<sstring> commitlog_sync;
     named_value<uint32_t> commitlog_segment_size_in_mb;
     named_value<uint32_t> schema_commitlog_segment_size_in_mb;

diff --git a/direct_failure_detector/failure_detector.cc b/direct_failure_detector/failure_detector.cc
@@ -96,6 +96,7 @@ struct failure_detector::impl {
     clock& _clock;
 
     clock::interval_t _ping_period;
+    clock::interval_t _ping_timeout;
 
     // Number of workers on each shard.
     // We use this to decide where to create new workers (we pick a shard with the smallest number of workers).
@@ -138,7 +139,7 @@ struct failure_detector::impl {
     // The unregistering process requires cross-shard operations which we perform on this fiber.
     future<> _destroy_subscriptions = make_ready_future<>();
 
-    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period);
+    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
     ~impl();
 
     // Inform update_endpoint_fiber() about an added/removed endpoint.
@@ -174,12 +175,14 @@ struct failure_detector::impl {
     future<> mark(listener* l, pinger::endpoint_id ep, bool alive);
 };
 
-failure_detector::failure_detector(pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period))
+failure_detector::failure_detector(
+    pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
 {}
 
-failure_detector::impl::impl(failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period) {
+failure_detector::impl::impl(
+    failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
     if (this_shard_id() != 0) {
         return;
     }
@@ -536,11 +539,9 @@ future<> endpoint_worker::ping_fiber() noexcept {
         auto start = clock.now();
         auto next_ping_start = start + _fd._ping_period;
 
-        // A ping should take significantly less time than _ping_period, but we give it a multiple of ping_period before it times out
-        // just in case of transient network partitions.
-        // However, if there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
+        auto timeout = start + _fd._ping_timeout;
+        // If there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
         // the listener (mark it as dead).
-        auto timeout = start + 3 * _fd._ping_period;
         for (auto& [threshold, l]: _fd._listeners_liveness) {
             if (l.endpoint_liveness[_id].alive && last_response + threshold < timeout) {
                 timeout = last_response + threshold;

diff --git a/direct_failure_detector/failure_detector.hh b/direct_failure_detector/failure_detector.hh
@@ -120,14 +120,14 @@ public:
 
         // Every endpoint in the detected set will be periodically pinged every `ping_period`,
         // assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
-        // before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
-        // will start immediately.
-        //
-        // `ping_period` should be chosen so that during normal operation, a ping takes significantly
-        // less time than `ping_period` (preferably at least an order of magnitude less).
+        // before it's aborted (up to `ping_timeout`), in which case the next ping will start immediately.
         //
         // The passed-in value must be the same on every shard.
-        clock::interval_t ping_period
+        clock::interval_t ping_period,
+
+        // Duration after which a ping is aborted, so that next ping can be started
+        // (pings are sent sequentially).
+        clock::interval_t ping_timeout
     );
 
     ~failure_detector();
@@ -147,7 +147,7 @@ public:
     // The listener stops being called when the returned subscription is destroyed.
     // The subscription must be destroyed before service is stopped.
     //
-    // `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
+    // `threshold` should be significantly larger than `ping_timeout`, preferably at least an order of magnitude larger.
     //
     // Different listeners may use different thresholds, depending on the use case:
     // some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important

diff --git a/main.cc b/main.cc
@@ -1367,7 +1367,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
             supervisor::notify("starting direct failure detector service");
             fd.start(
                 std::ref(fd_pinger), std::ref(fd_clock),
-                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count()).get();
 
             auto stop_fd = defer_verbose_shutdown("direct_failure_detector", [] {
                 fd.stop().get();

diff --git a/service/raft/raft_group_registry.cc b/service/raft/raft_group_registry.cc
@@ -290,7 +290,7 @@ seastar::future<> raft_group_registry::start() {
     // then to send VoteRequest messages.
     init_rpc_verbs();
 
-    direct_fd_clock::base::duration threshold{std::chrono::seconds{1}};
+    direct_fd_clock::base::duration threshold{std::chrono::seconds{2}};
     if (const auto ms = utils::get_local_injector().inject_parameter<int64_t>("raft-group-registry-fd-threshold-in-ms"); ms) {
         threshold = direct_fd_clock::base::duration{std::chrono::milliseconds{*ms}};
     }

diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc
@@ -715,7 +715,8 @@ class single_node_cql_env : public cql_test_env {
             service::direct_fd_clock fd_clock;
             _fd.start(
                 std::ref(_fd_pinger), std::ref(fd_clock),
-                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{600}}.count()).get();
 
             auto stop_fd = defer([this] {
                 _fd.stop().get();

diff --git a/test/raft/failure_detector_test.cc b/test/raft/failure_detector_test.cc
@@ -128,7 +128,7 @@ SEASTAR_TEST_CASE(failure_detector_test) {
     test_pinger pinger;
     test_clock clock;
     sharded<direct_failure_detector::failure_detector> fd;
-    co_await fd.start(std::ref(pinger), std::ref(clock), 10);
+    co_await fd.start(std::ref(pinger), std::ref(clock), 10, 30);
 
     test_listener l1, l2;
     auto sub1 = co_await fd.local().register_listener(l1, 95);

diff --git a/test/raft/randomized_nemesis_test.cc b/test/raft/randomized_nemesis_test.cc
@@ -1421,14 +1421,15 @@ class raft_server {
     future<> start() {
         // TODO: make it adjustable
         static const raft::logical_clock::duration fd_ping_period = 10_t;
+        static const raft::logical_clock::duration fd_ping_timeout = 30_t;
 
         assert(!_started);
         _started = true;
 
         // _fd_service must be started before raft server,
         // because as soon as raft server is started, it may start adding endpoints to the service.
         // _fd_service is using _server's RPC, but not until the first endpoint is added.
-        co_await _fd_service->start(std::ref(*_fd_pinger), std::ref(*_fd_clock), fd_ping_period.count());
+        co_await _fd_service->start(std::ref(*_fd_pinger), std::ref(*_fd_clock), fd_ping_period.count(), fd_ping_timeout.count());
         _fd_subscription.emplace(co_await _fd_service->local().register_listener(*_fd_listener, _fd_convict_threshold.count()));
         co_await _server->start();
     }

diff --git a/test/topology_custom/test_raft_no_quorum.py b/test/topology_custom/test_raft_no_quorum.py
@@ -40,6 +40,7 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
     # loop inside do_on_leader_with_retries.
 
     config = {
+        'direct_failure_detector_ping_timeout_in_ms': 300,
         'error_injections_at_startup': [
             {
                 'name': 'group0-raft-op-timeout-in-ms',