From 69257ed15209bbfb49fd75c3e459dfcb6a4acb76 Mon Sep 17 00:00:00 2001 From: Richard Mahlberg Date: Wed, 7 Jul 2021 16:51:45 +0200 Subject: [PATCH] Support cascade replication in get_stats_replication_delays When monitoring a cluster with cascade replication, the current query in "get_stats_replication_delays" fails. We can instead monitor the diff using pg_last_wal_receive_lsn; this will emit metrics showing how much replication delay the replica has compared to the upstream node (but not relative to the primary instance of the cluster). --- postgresql_metrics/postgres_queries.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/postgresql_metrics/postgres_queries.py b/postgresql_metrics/postgres_queries.py index ea825ed..7898008 100644 --- a/postgresql_metrics/postgres_queries.py +++ b/postgresql_metrics/postgres_queries.py @@ -221,6 +221,10 @@ def get_replication_delays(conn): sql = ("SELECT client_addr, " "pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff " "FROM public.pg_stat_repl") + if is_in_recovery(conn): + # pg_current_xlog_location cannot be called in a replica + # use pg_last_xlog_receive_location for monitoring cascade replication + sql = sql.replace("pg_current_xlog_location", "pg_last_xlog_receive_location") if conn.server_version >= 100000: # PostgreSQL 10 and higher sql = sql.replace('_xlog', '_wal') sql = sql.replace('_location', '_lsn') @@ -273,3 +277,7 @@ def get_wal_receiver_status(conn): host = CONNINFO_HOST_RE.search(conn_info).groupdict().get('host', 'UNKNOWN') host_replication_status.append((host, status)) return host_replication_status + + +def is_in_recovery(conn): + return query(conn, "SELECT pg_is_in_recovery()")[0][0]