From 2102766926d25baf8b353bdbfaf865437747a6b3 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev Date: Wed, 27 Mar 2024 21:39:11 +0300 Subject: [PATCH] [Backport 2.14][PLAT-13249] Change clock skew alert threshold to 250 Summary: To detect clock skew issues before TServer starts crashing, we're changing the threshold to 250ms. Original diff: https://phorge.dev.yugabyte.com/D33582 Test Plan: Tested manually. Reviewers: vbansal Reviewed By: vbansal Subscribers: yugaware, sanketh Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D33588 --- .../yw/common/alerts/AlertConfigurationWriter.java | 5 +++++ .../postgres/V336__Clock_Skew_Alert_Update.sql | 12 ++++++++++++ managed/src/main/resources/reference.conf | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 managed/src/main/resources/db/migration/default/postgres/V336__Clock_Skew_Alert_Update.sql diff --git a/managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationWriter.java b/managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationWriter.java index bede163b11a..45251899990 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationWriter.java +++ b/managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationWriter.java @@ -132,6 +132,11 @@ private SyncResult syncDefinition(UUID definitionUuid) { || !configuration.isActive()) { swamperHelper.removeAlertDefinition(definitionUuid); requiresReload.set(true); + if (definition != null) { + // Don't want to retry inactive definitions. 
+ definition.setConfigWritten(true); + alertDefinitionService.save(definition); + } return SyncResult.REMOVED; } if (definition.isConfigWritten()) { diff --git a/managed/src/main/resources/db/migration/default/postgres/V336__Clock_Skew_Alert_Update.sql b/managed/src/main/resources/db/migration/default/postgres/V336__Clock_Skew_Alert_Update.sql new file mode 100644 index 00000000000..7888546524c --- /dev/null +++ b/managed/src/main/resources/db/migration/default/postgres/V336__Clock_Skew_Alert_Update.sql @@ -0,0 +1,12 @@ +-- Copyright (c) YugaByte, Inc. + +--- Update default threshold +update alert_configuration +set thresholds = replace(thresholds, + '"SEVERE":{"condition":"GREATER_THAN","threshold":500.0}', + '"SEVERE":{"condition":"GREATER_THAN","threshold":250.0}') +where template = 'CLOCK_SKEW'; + +-- Recreate alert definition with new threshold +update alert_definition set config_written = false where configuration_uuid IN + (select uuid from alert_configuration where template = 'CLOCK_SKEW'); diff --git a/managed/src/main/resources/reference.conf b/managed/src/main/resources/reference.conf index 0b9dc71e62c..49bc633eefa 100644 --- a/managed/src/main/resources/reference.conf +++ b/managed/src/main/resources/reference.conf @@ -66,7 +66,7 @@ yb { # Alerts thresholds alert { # Value of maximum allowed clock skew before an alert is generated (in ms). - max_clock_skew_ms = 500 + max_clock_skew_ms = 250 # Value of maximum allowed replication lag before an alert is generated (in ms). replication_lag_ms = 180000 # Value of maximum allowed percents of used memory on nodes.