From 69257ed15209bbfb49fd75c3e459dfcb6a4acb76 Mon Sep 17 00:00:00 2001
From: Richard Mahlberg <richard@spotify.com>
Date: Wed, 7 Jul 2021 16:51:45 +0200
Subject: [PATCH] Support cascade replication in get_stats_replication_delays

When monitoring a cluster with cascading replication, the current query
used by "get_stats_replication_delays" fails on a replica, because the
current WAL write location cannot be queried while the server is in
recovery. We can instead compute the diff using pg_last_wal_receive_lsn.
This will emit metrics showing how much replication delay the replica has
compared to the upstream node (but not relative to the primary instance
of the cluster).
---
 postgresql_metrics/postgres_queries.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/postgresql_metrics/postgres_queries.py b/postgresql_metrics/postgres_queries.py
index ea825ed..7898008 100644
--- a/postgresql_metrics/postgres_queries.py
+++ b/postgresql_metrics/postgres_queries.py
@@ -221,6 +221,10 @@ def get_replication_delays(conn):
     sql = ("SELECT client_addr, "
            "pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "
            "FROM public.pg_stat_repl")
+    if is_in_recovery(conn):
+        # pg_current_xlog_location cannot be called on a replica (server in recovery);
+        # use pg_last_xlog_receive_location instead to monitor cascading replication.
+        sql = sql.replace("pg_current_xlog_location", "pg_last_xlog_receive_location")
     if conn.server_version >= 100000: # PostgreSQL 10 and higher
         sql = sql.replace('_xlog', '_wal')
         sql = sql.replace('_location', '_lsn')
@@ -273,3 +277,7 @@ def get_wal_receiver_status(conn):
         host = CONNINFO_HOST_RE.search(conn_info).groupdict().get('host', 'UNKNOWN')
         host_replication_status.append((host, status))
     return host_replication_status
+
+
+def is_in_recovery(conn):  # result of PostgreSQL's pg_is_in_recovery() for this connection
+    return query(conn, "SELECT pg_is_in_recovery()")[0][0]  # first column of the first row