diff --git a/check_postgres.pl b/check_postgres.pl
index 5f78bbb8..690cf207 100755
--- a/check_postgres.pl
+++ b/check_postgres.pl
@@ -39,6 +39,9 @@ package check_postgres;
 ## Which user to connect as if --dbuser is not given
 $opt{defaultuser} = 'postgres';
 
+## Number of seconds that slave can go without receiving a write from master before alerting if --maxidlemasterdelay is not given
+$opt{defaultidlemasterdelay} = 3600;
+
 ## Which port to connect to if --dbport is not given
 $opt{defaultport} = 5432;
 
@@ -975,19 +978,20 @@ package check_postgres;
     'tempdir=s',
     'get_method=s',
     'language=s',
-    'mrtg=s',      ## used by MRTG checks only
-    'logfile=s',   ## used by check_logfile only
-    'queryname=s', ## used by query_runtime only
-    'query=s',     ## used by custom_query only
-    'valtype=s',   ## used by custom_query only
-    'reverse',     ## used by custom_query only
-    'repinfo=s',   ## used by replicate_row only
-    'noidle',      ## used by backends only
-    'datadir=s',   ## used by checkpoint only
-    'schema=s@',   ## used by slony_status only
-    'filter=s@',   ## used by same_schema only
-    'suffix=s',    ## used by same_schema only
-    'replace',     ## used by same_schema only
+    'mrtg=s',               ## used by MRTG checks only
+    'logfile=s',            ## used by check_logfile only
+    'queryname=s',          ## used by query_runtime only
+    'query=s',              ## used by custom_query only
+    'valtype=s',            ## used by custom_query only
+    'reverse',              ## used by custom_query only
+    'repinfo=s',            ## used by replicate_row only
+    'noidle',               ## used by backends only
+    'datadir=s',            ## used by checkpoint only
+    'schema=s@',            ## used by slony_status only
+    'filter=s@',            ## used by same_schema only
+    'suffix=s',             ## used by same_schema only
+    'replace',              ## used by same_schema only
+    'maxidlemasterdelay=i', ## used by replay_delay only
 );
 
 die $USAGE if ! keys %opt and ! @ARGV;
@@ -1148,6 +1152,7 @@ package check_postgres;
  hitratio            => [0, 'Report if the hit ratio of a database is too low.'],
  hot_standby_delay   => [1, 'Check the replication delay in hot standby setup'],
  index_size          => [0, 'Checks the size of indexes only.'],
+ replay_delay        => [0, 'Check the log replay delay during recovery'],
  table_size          => [0, 'Checks the size of tables only.'],
  relation_size       => [0, 'Checks the size of tables and indexes.'],
  last_analyze        => [0, 'Check the maximum time in seconds since any one table has been analyzed.'],
@@ -1224,6 +1229,7 @@ package check_postgres;
   --exclude=name(s) items to specifically exclude (e.g. tables), depends on the action
   --includeuser=include objects owned by certain users
   --excludeuser=exclude objects owned by certain users
+  --maxidlemasterdelay number of seconds slave can go without receiving a write from master; defaults to '$opt{defaultidlemasterdelay}'
 
 Other options:
   --assume-standby-mode assume that server in continious WAL recovery mode
@@ -1752,6 +1758,7 @@ sub finishup {
     fsm_pages         => 'VERSION: 8.2 MAX: 8.3',
     fsm_relations     => 'VERSION: 8.2 MAX: 8.3',
     hot_standby_delay => 'VERSION: 9.0',
+    replay_delay      => 'VERSION: 9.0',
     listener          => 'MAX: 8.4',
 );
 if ($opt{test}) {
@@ -1945,6 +1952,9 @@ sub finishup {
 ## Check the replication delay in hot standby setup
 check_hot_standby_delay() if $action eq 'hot_standby_delay';
 
+## Check the log replay delay during recovery
+check_replay_delay() if $action eq 'replay_delay';
+
 ## Check the maximum transaction age of all connections
 check_txn_time() if $action eq 'txn_time';
 
@@ -4743,6 +4753,78 @@ sub check_hitratio {
 
 } ## end of check_hitratio
 
+sub check_replay_delay {
+
+    ## Check the log replay delay during recovery
+    ## Supports: Nagios
+    ## Critical and warning are the thresholds of delay in seconds.
+    ## --maxidlemasterdelay is how long (in seconds) a fully caught-up standby may
+    ## go without replaying a transaction and still be reported as having no delay.
+    ## Example: --critical=5
+
+    my ($warning, $critical) = validate_range({type => 'integer', leastone => 1});
+
+    ## Fall back to the default only when the option was not given at all:
+    ## an explicit 0 must not be silently replaced by the default
+    my $maxidlemasterdelay = defined $opt{maxidlemasterdelay}
+        ? $opt{maxidlemasterdelay}
+        : $opt{defaultidlemasterdelay};
+
+    ## Check if we are in recovery using pg_is_in_recovery()
+    $SQL = q{SELECT pg_is_in_recovery() AS recovery;};
+
+    my $info = run_command($SQL, { regex => qr([tf]) });
+    for $db (@{$info->{db}}) {
+        my $status = $db->{slurp}[0];
+        if ($status->{recovery} eq 'f') {
+            add_critical("not in recovery");
+            return;
+        }
+    }
+
+    ## We can't assume delay is none if last replayed equals last received, because in
+    ## reality it could mean replication has gone out for lunch.
+    ## This can lead to false negatives on an idle master, but is preferable to the
+    ## opposite where replication has gone away and we assume everything is hunky-dory.
+    ##
+    ## So this only tells you the replication delay IF the master is active (i.e. receiving
+    ## updates). On an idle master the check stays quiet until nothing has been replayed
+    ## for maxidlemasterdelay seconds; after that the time since the last replay is reported.
+    $SQL = qq{SELECT CASE
+        WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
+            AND EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) < $maxidlemasterdelay
+        THEN 0
+        ELSE
+            EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;};
+    $info = run_command($SQL);
+
+    for $db (@{$info->{db}}) {
+        my $delay = $db->{slurp}[0]->{log_delay};
+
+        ## pg_last_xact_replay_timestamp() is NULL until a transaction has been replayed,
+        ## which makes log_delay NULL as well: report that rather than a bogus "no delay"
+        if (! defined $delay or $delay !~ /^\s*-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?\s*$/) {
+            add_unknown("delay could not be determined");
+            next;
+        }
+
+        my $msg = qq{delay=${delay}s};
+
+        if (length $critical and $delay > $critical) {
+            add_critical $msg;
+        }
+        elsif (length $warning and $delay > $warning) {
+            add_warning $msg;
+        }
+        else {
+            add_ok $msg;
+        }
+    }
+
+    return;
+
+} ## end of check_replay_delay
+
 sub check_hot_standby_delay {
 
     ## Check on the delay in PITR replication between master and slave
@@ -8874,6 +8956,16 @@ =head2 B<hot_standby_delay>
 
   check_hot_standby_delay --dbhost=master,replica1 --warning='1048576 and 2 min' --critical='16777216 and 10 min'
 
+=head2 B<replay_delay>
+
+(C<symlink: check_postgres_replay_delay>) Returns seconds passed since last transaction replayed
+during recovery. This practically tells you the replication delay of a hot standby (locally) IF
+the master is active (i.e. master is receiving updates).
+
+The I<--warning> and I<--critical> options are delays in seconds. A standby that has replayed
+everything it has received is reported as having no delay until nothing has been replayed for
+I<--maxidlemasterdelay> seconds (default 3600).
+
 =head2 B<index_size>
 
 =head2 B<table_size>