Skip to content

Commit

Permalink
Merge pull request #26 from andreasscherman/add-vacuum-metrics
Browse files Browse the repository at this point in the history
Add mxid and xid metrics
  • Loading branch information
dflemstr committed Mar 11, 2021
2 parents 6c16a8b + e39c822 commit 3c5397a
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 35 deletions.
13 changes: 10 additions & 3 deletions README.md
Expand Up @@ -217,14 +217,21 @@ Called once per your Postgres cluster.
per each slave. If the slave and the master are in synchronous state,
the replication delay is zero.


### Database Local Directory Based (Global) Metrics

* **get_stats_wal_file_amount**:
This graph shows the number of files in your database cluster's WAL log
directory (pg_wal or pg_xlog). If the WAL file amount suddenly starts to
increase, you probably have issues with your WAL archiving process, which
might lead to the disk filling up and your database cluster crashing.

* **get_xid_remaining_ratio, get_multixact_remaining_ratio, get_multixact_members_remaining_ratio**:
These metrics show the corresponding remaining % of transaction ids ("xid"), multixact ids ("mxid"),
and multixact members that are available for Postgres to use before exhaustion.
Useful for ensuring that vacuuming is working as intended for your Postgres instance.

* **get_multixact_members_per_mxid**:
This metric emits the number of multixact members there are per multixact ID. A larger number means
that multixact member exhaustion will happen sooner (as can
be seen in **get_multixact_members_remaining_ratio**).


## Short Overview of Python Modules
Expand Down
14 changes: 6 additions & 8 deletions etc/postgresql-metrics/default/postgresql-metrics.yml
Expand Up @@ -38,8 +38,7 @@ ffwd:
# Each entry must be a tuple with the function name, and a time interval in seconds
# to call that metrics function.
#
# db_functions: Functions taking DB connection and returning a list of metrics,
# called once per each database in cluster.
# db_functions: Functions called once per each database in cluster.
db_functions:
- ["get_stats_disk_usage_for_database", 180]
- ["get_stats_tx_rate_for_database", 60]
Expand All @@ -52,15 +51,14 @@ db_functions:
# replication status relies on `pg_stat_wal_receiver`, which is only available on postgres 9.6+
# - ["get_stats_incoming_replication_status", 30]

# global_db_functions: Functions taking DB connection and returning a list of metrics,
# called once per the whole database cluster.
# global_db_functions: Functions called once per the whole database cluster.
global_db_functions:
- ["get_stats_client_connections", 60]
- ["get_stats_lock_statistics", 60]
- ["get_stats_heap_hit_statistics", 60]
- ["get_stats_replication_delays", 60]

# data_dir_functions: Functions taking a file path to Postgres data dir and returning
# a list of metrics, called once per the whole database cluster.
data_dir_functions:
- ["get_stats_wal_file_amount", 180]
- ["get_multixact_members_per_mxid", 60]
- ["get_multixact_members_remaining_ratio", 60]
- ["get_multixact_remaining_ratio", 60]
- ["get_xid_remaining_ratio", 60]
23 changes: 23 additions & 0 deletions postgresql_metrics/default_metrics.py
Expand Up @@ -116,6 +116,29 @@ def metric_sec_since_oldest_xact_start(database_name, value):
'unit': 's'})


def metric_xid_remaining_ratio(value):
    """Build the default metric reporting the % of transaction ids still available."""
    attributes = {'what': 'xid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)


def metric_multixact_remaining_ratio(value):
    """Build the default metric reporting the % of multixact ids still available."""
    attributes = {'what': 'mxid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)


def metric_multixact_members_per_mxid(value):
    """Build the default metric reporting multixact members per multixact id."""
    attributes = {'what': 'multixact-members-per-mxid', 'unit': 'members/id'}
    return create_default_metric(value, attributes)


def metric_multixact_members_remaining_ratio(value):
    """Build the default metric reporting the % of multixact member space still available."""
    attributes = {'what': 'multixact-members-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)

def metric_wal_file_amount(value):
return create_default_metric(value,
{'what': 'wal-file-amount',
Expand Down
12 changes: 12 additions & 0 deletions postgresql_metrics/localhost_postgres_stats.py
Expand Up @@ -24,6 +24,18 @@
LOG = get_logger()


def get_multixact_member_files(data_dir):
    """Count the files in pg_multixact/members under the given Postgres data dir.

    Returns 0 when the directory is missing or unreadable; the problem is
    logged, so callers always receive an int.
    """
    members_dir = os.path.join(data_dir, "pg_multixact", "members")
    try:
        if os.path.isdir(members_dir):
            # count regular files only; subdirectories are ignored
            return len([f for f in os.listdir(members_dir)
                        if os.path.isfile(os.path.join(members_dir, f))])
        # no exception is in flight here, so log an error rather than a traceback
        LOG.error(f"Missing pg_multixact/members directory in data_dir: {data_dir}")
    except OSError:
        # f-prefix was missing before, so {data_dir} never got interpolated
        LOG.exception(f'Failed accessing multixact member files in: {data_dir}. '
                      'Is data dir readable by user?')
    return 0


def get_amount_of_wal_files(data_dir):
amount_of_wal_files = 0
try:
Expand Down
74 changes: 61 additions & 13 deletions postgresql_metrics/metrics_gatherer.py
Expand Up @@ -43,9 +43,13 @@
metric_replication_delay_bytes,
metric_wal_file_amount,
metric_incoming_replication_running,
metric_multixact_members_per_mxid,
metric_multixact_remaining_ratio,
metric_xid_remaining_ratio,
metric_multixact_members_remaining_ratio,
)

from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files
from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files, get_multixact_member_files

from postgresql_metrics.postgres_queries import (
get_client_connections_amount,
Expand All @@ -60,23 +64,29 @@
get_replication_delays,
get_tables_with_oids_for_current_db,
get_wal_receiver_status,
get_max_mxid_age,
get_max_xid_age,
)

# One multixact "members" segment file holds 52352 member entries
# (1636 members per 8 kB page * 32 pages per segment).
MEMBERS_PER_MEMBER_FILE = 52352
# Multixact member space is addressed by a 32-bit offset.
MAX_MULTIXACT_MEMBERS = 2 ** 32
# xids/mxids wrap around at 2^31 - 1; keep this an int instead of the
# float that true division (2**32/2) used to produce.
WRAPAROUND_LIMIT = 2 ** 31 - 1

# Notice that all functions here are expected to return a list of metrics.
# Notice also that the names of these functions should match the configuration.

def get_stats_client_connections(_data_dir, db_connection):
    """Return one metric: the current number of connected clients.

    Called once per cluster; the data dir argument is ignored.
    """
    return [metric_client_connections(get_client_connections_amount(db_connection))]


def get_stats_disk_usage_for_database(_data_dir, db_connection):
    """Return the on-disk size metric for the connected database."""
    db_name, size_bytes = get_disk_usage_for_database(db_connection)[:2]
    return [metric_database_size(db_name, size_bytes)]


def get_stats_tx_rate_for_database(db_connection):
def get_stats_tx_rate_for_database(_data_dir, db_connection):
db_name, tx_rate, tx_rollbacks = get_transaction_rate_for_database(db_connection)
if tx_rate is not None:
return [metric_transaction_rate(db_name, tx_rate),
Expand All @@ -85,15 +95,15 @@ def get_stats_tx_rate_for_database(db_connection):
return []


def get_stats_seconds_since_last_vacuum_per_table(_data_dir, db_connection):
    """Return one metric per table: seconds elapsed since its last vacuum."""
    return [metric_seconds_since_last_vacuum(db_name, table_name, seconds_since)
            for db_name, table_name, seconds_since
            in get_seconds_since_last_vacuum_per_table(db_connection)]


def get_stats_heap_hit_statistics(db_connection):
def get_stats_heap_hit_statistics(_data_dir, db_connection):
db_name, heap_read, heap_hit, heap_hit_ratio = get_heap_hit_statistics(db_connection)
metrics = []
if heap_hit_ratio is not None:
Expand All @@ -103,7 +113,7 @@ def get_stats_heap_hit_statistics(db_connection):
return metrics


def get_stats_lock_statistics(db_connection):
def get_stats_lock_statistics(_data_dir, db_connection):
locks_by_type, [total_locks_waiting, total_locks_granted] = get_lock_statistics(db_connection)
metrics = []
for lock_type, [locks_waiting, locks_granted] in locks_by_type.items():
Expand All @@ -114,15 +124,15 @@ def get_stats_lock_statistics(db_connection):
return metrics


def get_stats_oldest_transaction_timestamp(_data_dir, db_connection):
    """Report how long the oldest still-open transaction has been running.

    Returns an empty list when no transaction age is available.
    """
    db_name, seconds_open = get_oldest_transaction_timestamp(db_connection)
    if seconds_open is None:
        return []
    return [metric_sec_since_oldest_xact_start(db_name, seconds_open)]


def get_stats_table_bloat(db_connection):
def get_stats_table_bloat(_data_dir, db_connection):
tables_with_oids = get_tables_with_oids_for_current_db(db_connection)
metrics = []
for table_oid, table_name in tables_with_oids:
Expand All @@ -132,7 +142,7 @@ def get_stats_table_bloat(db_connection):
return metrics


def get_stats_index_hit_rates(db_connection):
def get_stats_index_hit_rates(_data_dir, db_connection):
index_hit_rates = get_index_hit_rates(db_connection)
metrics = []
for db_name, table_name, index_hit_ratio in index_hit_rates:
Expand All @@ -141,18 +151,56 @@ def get_stats_index_hit_rates(db_connection):
return metrics


def get_stats_replication_delays(_data_dir, db_connection):
    """Return one metric per replication client: its replay delay in bytes."""
    return [metric_replication_delay_bytes(client_addr, delay_bytes)
            for client_addr, delay_bytes in get_replication_delays(db_connection)]


def _get_multixact_members(data_dir):
    # Each member segment file holds a fixed number of member entries, so the
    # file count gives an estimate of the total members in use.
    file_count = get_multixact_member_files(data_dir)
    return MEMBERS_PER_MEMBER_FILE * file_count


def get_multixact_members_per_mxid(data_dir, db_connection):
    """Average number of multixact members per multixact id.

    Returns no metrics when the mxid age is unknown or zero, which also
    avoids a division by zero.
    """
    member_count = _get_multixact_members(data_dir)
    oldest_mxid_age = get_max_mxid_age(db_connection)
    if oldest_mxid_age:
        return [metric_multixact_members_per_mxid(
            round(member_count / oldest_mxid_age, 2))]
    return []


def get_multixact_members_remaining_ratio(data_dir, _db_connection):
    """Percentage of multixact member space still available before exhaustion."""
    used_fraction = round(_get_multixact_members(data_dir) / MAX_MULTIXACT_MEMBERS, 2)
    remaining_pct = (1.0 - used_fraction) * 100
    return [metric_multixact_members_remaining_ratio(remaining_pct)]


def get_multixact_remaining_ratio(_data_dir, db_connection):
    """Percentage of multixact ids still available before wraparound.

    Returns no metrics when the mxid age cannot be determined.
    """
    oldest_mxid_age = get_max_mxid_age(db_connection)
    if not oldest_mxid_age:
        return []
    used_fraction = round(oldest_mxid_age / WRAPAROUND_LIMIT, 2)
    return [metric_multixact_remaining_ratio((1.0 - used_fraction) * 100)]


def get_xid_remaining_ratio(_data_dir, db_connection):
    """Percentage of transaction ids still available before wraparound.

    Returns no metrics when the xid age cannot be determined.
    """
    oldest_xid_age = get_max_xid_age(db_connection)
    if not oldest_xid_age:
        return []
    used_fraction = round(oldest_xid_age / WRAPAROUND_LIMIT, 2)
    return [metric_xid_remaining_ratio((1.0 - used_fraction) * 100)]


def get_stats_wal_file_amount(data_dir, _db_connection):
    """Return one metric: the number of WAL files in the data directory."""
    wal_file_count = get_amount_of_wal_files(data_dir)
    return [metric_wal_file_amount(wal_file_count)]


def get_stats_incoming_replication_status(_data_dir, db_connection):
    """Return one metric per upstream host: whether incoming replication streams."""
    metrics = []
    for host, is_streaming in get_wal_receiver_status(db_connection):
        metrics.append(metric_incoming_replication_running(host, is_streaming))
    return metrics
24 changes: 13 additions & 11 deletions postgresql_metrics/metrics_logic.py
Expand Up @@ -82,7 +82,7 @@ def _is_time_to_call_stats_func_and_update_ts(database_name, metrics_func, run_i
return False


def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_name=None):
def _call_all_db_functions(db_stats_functions, db_parameters, schedule=False, db_name=None):
"""Iterates through all given statistics functions, calling them with the given parameter.
The db_parameter can be a database connection or a file path to Postgres data directory,
depending on the statistics function to call.
Expand All @@ -100,7 +100,7 @@ def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_
if is_call_required:
try:
LOG.debug('calling stats function {}', db_metrics_func.__name__)
metrics.extend(db_metrics_func(db_parameter))
metrics.extend(db_metrics_func(*db_parameters))
except Exception:
LOG.exception('failed calling stats function: ' + db_metrics_func.__name__)
return metrics
Expand All @@ -123,22 +123,25 @@ def get_stats_functions_from_conf(func_key_name, conf):
def get_all_stats_functions_from_conf(conf):
    """Read configured stats functions: returns (db_functions, global_db_functions).

    The legacy `data_dir_functions` config section is still honored for
    backwards compatibility; its entries are folded into the global functions.
    """
    db_functions = get_stats_functions_from_conf('db_functions', conf)
    global_db_functions = get_stats_functions_from_conf('global_db_functions', conf)
    # `data_dir_functions` is deprecated, but to preserve backwards compatibility still read
    data_dir_functions = get_stats_functions_from_conf('data_dir_functions', conf)
    if data_dir_functions:
        # `warn` is a deprecated alias of `warning` -- TODO confirm LOG exposes `warning`
        LOG.warning("data_dir_functions field in config is deprecated -- consider moving functions to global_db_functions")
    all_global_db_functions = data_dir_functions + global_db_functions
    return db_functions, all_global_db_functions


def get_all_metrics_now(db_connections, conf):
    """Get all the metrics immediately without any scheduling.

    Global stats are gathered once using the first available connection;
    per-database stats are then gathered once per connection.
    """
    per_db_funcs, global_funcs = get_all_stats_functions_from_conf(conf)
    primary_conn = db_connections[0]
    data_dir = figure_out_postgres_data_dir(primary_conn, conf)

    collected = _call_all_db_functions(global_funcs, (data_dir, primary_conn))
    for connection in db_connections:
        collected.extend(_call_all_db_functions(per_db_funcs, (data_dir, connection)))
    return collected


Expand All @@ -147,14 +150,13 @@ def get_all_metrics_scheduled(db_connections, conf):
First gets the global stats with first available database connection,
and then gets the rest per database.
"""
db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf)
db_functions, global_db_functions = get_all_stats_functions_from_conf(conf)
data_dir = figure_out_postgres_data_dir(db_connections[0], conf)

all_metrics = _call_all_db_functions(db_connections[0], global_db_functions, schedule=True)
all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions, schedule=True))
all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]), schedule=True)
for db_connection in db_connections:
db_name = get_db_name_from_connection(db_connection)
all_metrics.extend(_call_all_db_functions(db_connection, db_functions,
all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection),
schedule=True, db_name=db_name))
return all_metrics

Expand Down
22 changes: 22 additions & 0 deletions postgresql_metrics/postgres_queries.py
Expand Up @@ -195,6 +195,28 @@ def get_oldest_transaction_timestamp(conn):
return None, None


def get_max_mxid_age(conn):
    """Return the highest mxid_age() across pg_class relations, or None.

    None is returned when the server predates 9.5 (no mxid_age function),
    when the query yields nothing, or when the maximum is SQL NULL (no
    relation has a non-zero relminmxid).
    """
    # `mxid_age` is only available on postgres 9.5 and newer
    if conn.server_version < 95000:
        LOG.error("Unable to check mxid_age on versions of postgres below 9.5")
        return None
    sql = "SELECT max(mxid_age(relminmxid)) FROM pg_class WHERE relminmxid <> '0'"
    results = query(conn, sql)
    # max() over zero rows is NULL -> None; int(None) would raise TypeError
    if not results or results[0][0] is None:
        return None
    return int(results[0][0])


def get_max_xid_age(conn):
    """Return the age of the oldest datfrozenxid across all databases, or None.

    None is returned when the query yields nothing or the maximum is SQL
    NULL, so callers never see int(None) raise a TypeError.
    """
    sql = "SELECT max(age(datfrozenxid)) FROM pg_database"
    results = query(conn, sql)
    if not results or results[0][0] is None:
        return None
    return int(results[0][0])


def get_replication_delays(conn):
sql = ("SELECT client_addr, "
"pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "
Expand Down

0 comments on commit 3c5397a

Please sign in to comment.