diff --git a/README.md b/README.md
index 0343ebf..6b10e23 100644
--- a/README.md
+++ b/README.md
@@ -217,14 +217,21 @@ Called once per your Postgres cluster.
   per each slave. If the slave and the master are in synchronous state,
   the replication delay is zero.
 
-
-### Database Local Directory Based (Global) Metrics
-
 * **get_stats_wal_file_amount**: This graph shows the amount of files in your database clusters
   WAL log directory (pg_wal or pg_xlog). If the WAL file amount starts to suddenly increase,
   you probably have issues with your WAL archiving process, which might lead to the disk
   filling up, and you database cluster crashing.
+
+* **get_xid_remaining_ratio, get_multixact_remaining_ratio, get_multixact_members_remaining_ratio**:
+  These metrics show the remaining percentage of transaction ids ("xid"), multixact ids ("mxid"),
+  and multixact members that are available for Postgres to use before exhaustion.
+  Useful for ensuring that vacuuming is working as intended for your Postgres instance.
+
+* **get_multixact_members_per_mxid**:
+  This metric emits the number of multixact members per multixact ID. A larger number means
+  that multixact member exhaustion will happen sooner (as can be seen in
+  **get_multixact_members_remaining_ratio**).
 
 
 ## Short Overview of Python Modules
 
diff --git a/etc/postgresql-metrics/default/postgresql-metrics.yml b/etc/postgresql-metrics/default/postgresql-metrics.yml
index 6625590..9a1b55f 100644
--- a/etc/postgresql-metrics/default/postgresql-metrics.yml
+++ b/etc/postgresql-metrics/default/postgresql-metrics.yml
@@ -38,8 +38,7 @@ ffwd:
 # Each entry must be a tuple with the function name, and a time interval in seconds
 # to call that metrics function.
 #
-# db_functions: Functions taking DB connection and returning a list of metrics,
-#               called once per each database in cluster.
+# db_functions: Functions called once per each database in cluster.
 db_functions:
   - ["get_stats_disk_usage_for_database", 180]
   - ["get_stats_tx_rate_for_database", 60]
@@ -52,15 +51,14 @@ db_functions:
 # replication status relies on `pg_stat_wal_receiver`, which is only available on postgres 9.6+
 #  - ["get_stats_incoming_replication_status", 30]
 
-# global_db_functions: Functions taking DB connection and returning a list of metrics,
-#                      called once per the whole database cluster.
+# global_db_functions: Functions called once per the whole database cluster.
 global_db_functions:
   - ["get_stats_client_connections", 60]
   - ["get_stats_lock_statistics", 60]
   - ["get_stats_heap_hit_statistics", 60]
   - ["get_stats_replication_delays", 60]
-
-# data_dir_functions: Functions taking a file path to Postgres data dir and returning
-#                     a list of metrics, called once per the whole database cluster.
-data_dir_functions:
   - ["get_stats_wal_file_amount", 180]
+  - ["get_multixact_members_per_mxid", 60]
+  - ["get_multixact_members_remaining_ratio", 60]
+  - ["get_multixact_remaining_ratio", 60]
+  - ["get_xid_remaining_ratio", 60]
diff --git a/postgresql_metrics/default_metrics.py b/postgresql_metrics/default_metrics.py
index 03125e1..f015726 100644
--- a/postgresql_metrics/default_metrics.py
+++ b/postgresql_metrics/default_metrics.py
@@ -116,6 +116,29 @@ def metric_sec_since_oldest_xact_start(database_name, value):
                                   'unit': 's'})
 
 
+def metric_xid_remaining_ratio(value):
+    return create_default_metric(value,
+                                 {'what': 'xid-remaining',
+                                  'unit': '%'})
+
+
+def metric_multixact_remaining_ratio(value):
+    return create_default_metric(value,
+                                 {'what': 'mxid-remaining',
+                                  'unit': '%'})
+
+
+def metric_multixact_members_per_mxid(value):
+    return create_default_metric(value,
+                                 {'what': 'multixact-members-per-mxid',
+                                  'unit': 'members/id'})
+
+
+def metric_multixact_members_remaining_ratio(value):
+    return create_default_metric(value,
+                                 {'what': 'multixact-members-remaining',
+                                  'unit': '%'})
+
 def metric_wal_file_amount(value):
     return create_default_metric(value,
                                  {'what': 'wal-file-amount',
diff --git a/postgresql_metrics/localhost_postgres_stats.py b/postgresql_metrics/localhost_postgres_stats.py
index 1bbc7ff..999a649 100644
--- a/postgresql_metrics/localhost_postgres_stats.py
+++ b/postgresql_metrics/localhost_postgres_stats.py
@@ -24,6 +24,18 @@
 LOG = get_logger()
 
 
+def get_multixact_member_files(data_dir):
+    try:
+        members_dir = os.path.join(data_dir, "pg_multixact", "members")
+        if os.path.isdir(members_dir):
+            return len([f for f in os.listdir(members_dir) if os.path.isfile(os.path.join(members_dir, f))])
+        else:
+            LOG.error(f"Missing pg_multixact/members directory in data_dir: {data_dir}")
+    except OSError:
+        LOG.exception(f"Failed accessing multixact member files in: {data_dir}. Is data dir readable by user?")
+    return 0
+
+
 def get_amount_of_wal_files(data_dir):
     amount_of_wal_files = 0
     try:
diff --git a/postgresql_metrics/metrics_gatherer.py b/postgresql_metrics/metrics_gatherer.py
index 4364337..23f22e6 100644
--- a/postgresql_metrics/metrics_gatherer.py
+++ b/postgresql_metrics/metrics_gatherer.py
@@ -43,9 +43,13 @@
     metric_replication_delay_bytes,
     metric_wal_file_amount,
     metric_incoming_replication_running,
+    metric_multixact_members_per_mxid,
+    metric_multixact_remaining_ratio,
+    metric_xid_remaining_ratio,
+    metric_multixact_members_remaining_ratio,
 )
 
-from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files
+from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files, get_multixact_member_files
 
 from postgresql_metrics.postgres_queries import (
     get_client_connections_amount,
@@ -60,23 +64,29 @@
     get_replication_delays,
     get_tables_with_oids_for_current_db,
     get_wal_receiver_status,
+    get_max_mxid_age,
+    get_max_xid_age,
 )
 
 
+MEMBERS_PER_MEMBER_FILE = 52352
+MAX_MULTIXACT_MEMBERS = 2**32
+WRAPAROUND_LIMIT = (2**32/2) - 1
+
 # Notice that all functions here are expected to return a list of metrics.
 # Notice also that the names of these functions should match the configuration.
 
-def get_stats_client_connections(db_connection):
+def get_stats_client_connections(_data_dir, db_connection):
     client_amount = get_client_connections_amount(db_connection)
     return [metric_client_connections(client_amount)]
 
 
-def get_stats_disk_usage_for_database(db_connection):
+def get_stats_disk_usage_for_database(_data_dir, db_connection):
     db_size = get_disk_usage_for_database(db_connection)
     return [metric_database_size(db_size[0], db_size[1])]
 
 
-def get_stats_tx_rate_for_database(db_connection):
+def get_stats_tx_rate_for_database(_data_dir, db_connection):
     db_name, tx_rate, tx_rollbacks = get_transaction_rate_for_database(db_connection)
     if tx_rate is not None:
         return [metric_transaction_rate(db_name, tx_rate),
@@ -85,7 +95,7 @@ def get_stats_tx_rate_for_database(db_connection):
     return []
 
 
-def get_stats_seconds_since_last_vacuum_per_table(db_connection):
+def get_stats_seconds_since_last_vacuum_per_table(_data_dir, db_connection):
     last_vacuums_data = get_seconds_since_last_vacuum_per_table(db_connection)
     metrics = []
     for db_name, table_name, seconds_since in last_vacuums_data:
@@ -93,7 +103,7 @@ def get_stats_seconds_since_last_vacuum_per_table(db_connection):
     return metrics
 
 
-def get_stats_heap_hit_statistics(db_connection):
+def get_stats_heap_hit_statistics(_data_dir, db_connection):
     db_name, heap_read, heap_hit, heap_hit_ratio = get_heap_hit_statistics(db_connection)
     metrics = []
     if heap_hit_ratio is not None:
@@ -103,7 +113,7 @@ def get_stats_heap_hit_statistics(db_connection):
     return metrics
 
 
-def get_stats_lock_statistics(db_connection):
+def get_stats_lock_statistics(_data_dir, db_connection):
     locks_by_type, [total_locks_waiting, total_locks_granted] = get_lock_statistics(db_connection)
     metrics = []
     for lock_type, [locks_waiting, locks_granted] in locks_by_type.items():
@@ -114,7 +124,7 @@ def get_stats_lock_statistics(db_connection):
     return metrics
 
 
-def get_stats_oldest_transaction_timestamp(db_connection):
+def get_stats_oldest_transaction_timestamp(_data_dir, db_connection):
     db_name, sec_since_oldest_xact_start = get_oldest_transaction_timestamp(db_connection)
     metrics = []
     if sec_since_oldest_xact_start is not None:
@@ -122,7 +132,7 @@ def get_stats_oldest_transaction_timestamp(db_connection):
     return metrics
 
 
-def get_stats_table_bloat(db_connection):
+def get_stats_table_bloat(_data_dir, db_connection):
     tables_with_oids = get_tables_with_oids_for_current_db(db_connection)
     metrics = []
     for table_oid, table_name in tables_with_oids:
@@ -132,7 +142,7 @@ def get_stats_table_bloat(db_connection):
     return metrics
 
 
-def get_stats_index_hit_rates(db_connection):
+def get_stats_index_hit_rates(_data_dir, db_connection):
     index_hit_rates = get_index_hit_rates(db_connection)
     metrics = []
     for db_name, table_name, index_hit_ratio in index_hit_rates:
@@ -141,7 +151,7 @@ def get_stats_index_hit_rates(db_connection):
     return metrics
 
 
-def get_stats_replication_delays(db_connection):
+def get_stats_replication_delays(_data_dir, db_connection):
     replication_delays = get_replication_delays(db_connection)
     metrics = []
     for client_addr, delay_in_bytes in replication_delays:
@@ -149,10 +159,48 @@ def get_stats_replication_delays(db_connection):
     return metrics
 
 
-def get_stats_wal_file_amount(data_dir):
+def _get_multixact_members(data_dir):
+    return get_multixact_member_files(data_dir) * MEMBERS_PER_MEMBER_FILE
+
+
+def get_multixact_members_per_mxid(data_dir, db_connection):
+    members = _get_multixact_members(data_dir)
+    mxid_age = get_max_mxid_age(db_connection)
+    if not mxid_age:
+        return []
+    members_per_id = round(members / mxid_age, 2)
+    return [metric_multixact_members_per_mxid(members_per_id)]
+
+
+def get_multixact_members_remaining_ratio(data_dir, _db_connection):
+    members = _get_multixact_members(data_dir)
+    ratio = round(members / MAX_MULTIXACT_MEMBERS, 2)
+    percentage_remaining = (1.0 - ratio) * 100
+    return [metric_multixact_members_remaining_ratio(percentage_remaining)]
+
+
+def get_multixact_remaining_ratio(_data_dir, db_connection):
+    mxid_age = get_max_mxid_age(db_connection)
+    if not mxid_age:
+        return []
+    ratio = round(mxid_age / WRAPAROUND_LIMIT, 2)
+    percentage_remaining = (1.0 - ratio) * 100
+    return [metric_multixact_remaining_ratio(percentage_remaining)]
+
+
+def get_xid_remaining_ratio(_data_dir, db_connection):
+    xid_age = get_max_xid_age(db_connection)
+    if not xid_age:
+        return []
+    ratio = round(xid_age / WRAPAROUND_LIMIT, 2)
+    percentage_remaining = (1.0 - ratio) * 100
+    return [metric_xid_remaining_ratio(percentage_remaining)]
+
+
+def get_stats_wal_file_amount(data_dir, _db_connection):
     return [metric_wal_file_amount(get_amount_of_wal_files(data_dir))]
 
 
-def get_stats_incoming_replication_status(db_connection):
+def get_stats_incoming_replication_status(_data_dir, db_connection):
     return [metric_incoming_replication_running(host, is_streaming)
             for host, is_streaming in get_wal_receiver_status(db_connection)]
diff --git a/postgresql_metrics/metrics_logic.py b/postgresql_metrics/metrics_logic.py
index 4550579..3fef4ee 100644
--- a/postgresql_metrics/metrics_logic.py
+++ b/postgresql_metrics/metrics_logic.py
@@ -82,7 +82,7 @@ def _is_time_to_call_stats_func_and_update_ts(database_name, metrics_func, run_i
     return False
 
 
-def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_name=None):
-    """Iterates through all given statistics functions, calling them with the given parameter.
-    The db_parameter can be a database connection or a file path to Postgres data directory,
-    depending on the statistics function to call.
+def _call_all_db_functions(db_stats_functions, db_parameters, schedule=False, db_name=None):
+    """Iterates through all given statistics functions, calling them with the given parameters.
+    The db_parameters tuple contains the path to the Postgres data directory and a database
+    connection, both of which are passed to every statistics function.
@@ -100,7 +100,7 @@
         if is_call_required:
             try:
                 LOG.debug('calling stats function {}', db_metrics_func.__name__)
-                metrics.extend(db_metrics_func(db_parameter))
+                metrics.extend(db_metrics_func(*db_parameters))
             except Exception:
                 LOG.exception('failed calling stats function: ' + db_metrics_func.__name__)
     return metrics
@@ -123,8 +123,12 @@ def get_stats_functions_from_conf(func_key_name, conf):
 def get_all_stats_functions_from_conf(conf):
     db_functions = get_stats_functions_from_conf('db_functions', conf)
     global_db_functions = get_stats_functions_from_conf('global_db_functions', conf)
+    # `data_dir_functions` is deprecated, but it is still read for backwards compatibility
    data_dir_functions = get_stats_functions_from_conf('data_dir_functions', conf)
-    return db_functions, global_db_functions, data_dir_functions
+    if data_dir_functions:
+        LOG.warn("data_dir_functions field in config is deprecated -- consider moving functions to global_db_functions")
+    all_global_db_functions = data_dir_functions + global_db_functions
+    return db_functions, all_global_db_functions
 
 
 def get_all_metrics_now(db_connections, conf):
@@ -132,13 +136,12 @@ def get_all_metrics_now(db_connections, conf):
     First gets the global stats with first available database connection,
     and then gets the rest per database.
""" - db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf) + db_functions, global_db_functions = get_all_stats_functions_from_conf(conf) data_dir = figure_out_postgres_data_dir(db_connections[0], conf) - all_metrics = _call_all_db_functions(db_connections[0], global_db_functions) - all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions)) + all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0])) for db_connection in db_connections: - all_metrics.extend(_call_all_db_functions(db_connection, db_functions)) + all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection))) return all_metrics @@ -147,14 +150,13 @@ def get_all_metrics_scheduled(db_connections, conf): First gets the global stats with first available database connection, and then gets the rest per database. """ - db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf) + db_functions, global_db_functions = get_all_stats_functions_from_conf(conf) data_dir = figure_out_postgres_data_dir(db_connections[0], conf) - all_metrics = _call_all_db_functions(db_connections[0], global_db_functions, schedule=True) - all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions, schedule=True)) + all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]), schedule=True) for db_connection in db_connections: db_name = get_db_name_from_connection(db_connection) - all_metrics.extend(_call_all_db_functions(db_connection, db_functions, + all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection), schedule=True, db_name=db_name)) return all_metrics diff --git a/postgresql_metrics/postgres_queries.py b/postgresql_metrics/postgres_queries.py index 38c1f5d..ea825ed 100644 --- a/postgresql_metrics/postgres_queries.py +++ b/postgresql_metrics/postgres_queries.py @@ -195,6 +195,28 @@ def get_oldest_transaction_timestamp(conn): return None, None +def get_max_mxid_age(conn): + # `mxid_age` is only available on postgres 9.5 and newer + if conn.server_version < 95000: + LOG.error("Unable to check mxid_age on versions of postgres below 9.5") + return None + sql = "SELECT max(mxid_age(relminmxid)) FROM pg_class WHERE relminmxid <> '0'" + results = query(conn, sql) + if not results: + return None + mxid_age, = results[0] + return int(mxid_age) + + +def get_max_xid_age(conn): + sql = "SELECT max(age(datfrozenxid)) FROM pg_database" + results = query(conn, sql) + if not results: + return None + xid_age, = results[0] + return int(xid_age) + + def get_replication_delays(conn): sql = ("SELECT client_addr, " "pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "