Skip to content

Commit 22f0fb6

Browse files
committed
add new options to keep services running as long as they are up
The issue with the options host_down_disable_service_checks, service_skip_check_dependency_status, service_skip_check_host_down_status and host_skip_check_dependency_status is that reports break because hosts/services suddenly stop executing and keep their OK state, which makes those options pretty unusable. So in order to keep reporting correct, you need to keep services running, even if the host is down. With these new options, hosts/services keep on running as long as they are up. As soon as the service is down, it stops running until the host comes back up. That way naemon has to do fewer checks, especially fewer checks which run into timeouts and such, but reporting is still correct. The option service_skip_check_dependency_status=-2 will also be used for service parents. This commit also adds a new option, service_parents_disable_service_checks, to prevent running service checks if service parents are down. Recommended settings are: host_down_disable_service_checks=1 (disable service checks if host is down); service_parents_disable_service_checks=1 (also disable service checks if parents are down); service_skip_check_host_down_status=-2 (but keep running as long as they are ok); service_skip_check_dependency_status=-2 (same, but for dependency checks); host_skip_check_dependency_status=-2 (and for host checks).
1 parent 7031173 commit 22f0fb6

File tree

7 files changed

+123
-29
lines changed

7 files changed

+123
-29
lines changed

sample-config/naemon.cfg.in

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,18 +1041,27 @@ allow_empty_hostgroup_assignment=0
10411041
# This option will disable all service checks if the host is not in an UP state
10421042
#
10431043
# While desirable in some environments, enabling this value can distort report
1044-
# values as the expected quantity of checks will not have been performed
1045-
1044+
# values as the expected quantity of checks will not have been performed.
1045+
# Set service_skip_check_host_down_status to -2 to mitigate this.
10461046
#host_down_disable_service_checks=0
10471047

1048+
# DISABLE SERVICE CHECKS WHEN SERVICE PARENTS DOWN
1049+
# This option will disable all service checks if none of the service parents is in an OK state
1050+
#
1051+
# While desirable in some environments, enabling this value can distort report
1052+
# values as the expected quantity of checks will not have been performed.
1053+
# Set service_skip_check_dependency_status to -2 to mitigate this.
1054+
#service_parents_disable_service_checks=0
1055+
10481056
# SET SERVICE/HOST STATUS WHEN SERVICE CHECK SKIPPED
10491057
# These options will allow you to set the status of a service when its
1050-
# service check is skipped due to one of two reasons:
1051-
# 1) failed dependency check; 2) host not up
1058+
# service check is skipped due to the following reasons:
1059+
# 1) failed dependency check; 2) host not up; 3) service parents failed
10521060
# Number 2 can only happen if 'host_down_disable_service_checks' above
10531061
# is set to 1.
10541062
# Valid values for the service* options are:
10551063
# -1 Do not change the service status (default)
1064+
# -2 Keep service running as long as it is ok/warning.
10561065
# 0 Set the service status to STATE_OK
10571066
# 1 Set the service status to STATE_WARNING
10581067
# 2 Set the service status to STATE_CRITICAL
@@ -1064,6 +1073,7 @@ allow_empty_hostgroup_assignment=0
10641073
# status of a host when its check is skipped due to a failed dependency check.
10651074
# Valid values for the host_skip_check_dependency_status are:
10661075
# -1 Do not change the host status (default)
1076+
# -2 Keep host running as long as it is up.
10671077
# 0 Set the host status to STATE_UP
10681078
# 1 Set the host status to STATE_DOWN
10691079
# 2 Set the host status to STATE_UNREACHABLE

src/naemon/checks_host.c

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -243,18 +243,29 @@ static int run_async_host_check(host *hst, int check_options, double latency)
243243
/* check host dependencies for execution */
244244
log_debug_info(DEBUGL_CHECKS, 0, "Host '%s' checking dependencies...\n", hst->name);
245245
if (check_host_dependencies(hst, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
246-
if (host_skip_check_dependency_status >= 0) {
247-
hst->current_state = host_skip_check_dependency_status;
248-
if (strstr(hst->plugin_output, "(host dependency check failed)") == NULL) {
249-
char *old_output = nm_strdup(hst->plugin_output);
250-
nm_free(hst->plugin_output);
251-
nm_asprintf(&hst->plugin_output, "(host dependency check failed) was: %s", old_output);
252-
nm_free(old_output);
253-
}
246+
int keep_running = FALSE;
247+
switch(host_skip_check_dependency_status) {
248+
case SKIP_KEEP_RUNNING_WHEN_UP:
249+
if (hst->current_state == STATE_UP) {
250+
keep_running = TRUE;
251+
}
252+
break;
253+
case STATE_UP:
254+
case STATE_DOWN:
255+
case STATE_UNREACHABLE:
256+
hst->current_state = host_skip_check_dependency_status;
257+
if (strstr(hst->plugin_output, "(host dependency check failed)") == NULL) {
258+
char *old_output = nm_strdup(hst->plugin_output);
259+
nm_free(hst->plugin_output);
260+
nm_asprintf(&hst->plugin_output, "(host dependency check failed) was: %s", old_output);
261+
nm_free(old_output);
262+
}
263+
break;
264+
}
265+
if(!keep_running) {
266+
log_debug_info(DEBUGL_CHECKS, 0, "Host '%s' failed dependency check. Aborting check\n", hst->name);
267+
return ERROR;
254268
}
255-
256-
log_debug_info(DEBUGL_CHECKS, 0, "Host '%s' failed dependency check. Aborting check\n", hst->name);
257-
return ERROR;
258269
}
259270
}
260271

src/naemon/checks_service.c

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -200,27 +200,94 @@ static void handle_service_check_event(struct nm_event_execution_properties *evp
200200
/* check service dependencies for execution */
201201
log_debug_info(DEBUGL_CHECKS, 0, "Service '%s' on host '%s' checking dependencies...\n", temp_service->description, temp_service->host_name);
202202
if (check_service_dependencies(temp_service, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
203-
if (service_skip_check_dependency_status >= 0) {
204-
temp_service->current_state = service_skip_check_dependency_status;
205-
if (strstr(temp_service->plugin_output, "(service dependency check failed)") == NULL) {
206-
char *old_output = nm_strdup(temp_service->plugin_output);
207-
nm_free(temp_service->plugin_output);
208-
nm_asprintf(&temp_service->plugin_output, "(service dependency check failed) was: %s", old_output);
209-
nm_free(old_output);
203+
int keep_running = FALSE;
204+
switch(service_skip_check_dependency_status) {
205+
case SKIP_KEEP_RUNNING_WHEN_UP:
206+
if (temp_service->current_state <= STATE_WARNING) {
207+
keep_running = TRUE;
208+
}
209+
break;
210+
case STATE_OK:
211+
case STATE_WARNING:
212+
case STATE_CRITICAL:
213+
case STATE_UNKNOWN:
214+
temp_service->current_state = service_skip_check_dependency_status;
215+
if (strstr(temp_service->plugin_output, "(service dependency check failed)") == NULL) {
216+
char *old_output = nm_strdup(temp_service->plugin_output);
217+
nm_free(temp_service->plugin_output);
218+
nm_asprintf(&temp_service->plugin_output, "(service dependency check failed) was: %s", old_output);
219+
nm_free(old_output);
220+
}
221+
break;
222+
}
223+
if (!keep_running) {
224+
log_debug_info(DEBUGL_CHECKS, 0, "Service '%s' on host '%s' failed dependency check. Aborting check\n", temp_service->description, temp_service->host_name);
225+
return;
226+
}
227+
}
228+
229+
/* check service parents for execution */
230+
if(service_parents_disable_service_checks && temp_service->parents) {
231+
int parents_failed = FALSE;
232+
if (temp_service->current_state != STATE_OK) {
233+
servicesmember *sm = temp_service->parents;
234+
while (sm && sm->service_ptr->current_state != STATE_OK) {
235+
sm = sm->next;
236+
}
237+
if (sm == NULL) {
238+
parents_failed = TRUE;
210239
}
211240
}
212-
log_debug_info(DEBUGL_CHECKS, 0, "Service '%s' on host '%s' failed dependency check. Aborting check\n", temp_service->description, temp_service->host_name);
213-
return;
241+
if(parents_failed) {
242+
switch(service_skip_check_dependency_status) {
243+
case SKIP_KEEP_RUNNING_WHEN_UP:
244+
if (temp_service->current_state <= STATE_WARNING) {
245+
parents_failed = FALSE;
246+
}
247+
break;
248+
case STATE_OK:
249+
case STATE_WARNING:
250+
case STATE_CRITICAL:
251+
case STATE_UNKNOWN:
252+
temp_service->current_state = service_skip_check_dependency_status;
253+
if (strstr(temp_service->plugin_output, "(service parents failed)") == NULL) {
254+
char *old_output = nm_strdup(temp_service->plugin_output);
255+
nm_free(temp_service->plugin_output);
256+
nm_asprintf(&temp_service->plugin_output, "(service parents failed) was: %s", old_output);
257+
nm_free(old_output);
258+
}
259+
break;
260+
}
261+
}
262+
if(parents_failed) {
263+
log_debug_info(DEBUGL_CHECKS, 0, "Service '%s' on host '%s' failed parents check. Aborting check\n", temp_service->description, temp_service->host_name);
264+
return;
265+
}
214266
}
215267

268+
216269
/* check if host is up - if not, do not perform check */
217270
if (host_down_disable_service_checks) {
218271
if ((temp_host = temp_service->host_ptr) == NULL) {
219272
log_debug_info(DEBUGL_CHECKS, 2, "Host pointer NULL in handle_service_check_event().\n");
220273
return;
221-
} else {
222-
if (temp_host->current_state != STATE_UP) {
274+
}
275+
if (temp_host->current_state != STATE_UP) {
276+
int keep_running = TRUE;
277+
switch (service_skip_check_host_down_status) {
278+
/* only keep running if service is up or host_down_disable_service_checks is disabled */
279+
case SKIP_KEEP_RUNNING_WHEN_UP:
280+
if (temp_service->current_state > STATE_WARNING) {
281+
log_debug_info(DEBUGL_CHECKS, 2, "Host and service state not UP, so service check will not be performed - will be rescheduled as normal.\n");
282+
keep_running = FALSE;
283+
}
284+
break;
285+
default:
223286
log_debug_info(DEBUGL_CHECKS, 2, "Host state not UP, so service check will not be performed - will be rescheduled as normal.\n");
287+
keep_running = FALSE;
288+
break;
289+
}
290+
if(!keep_running) {
224291
if (service_skip_check_host_down_status >= 0) {
225292
temp_service->current_state = service_skip_check_host_down_status;
226293
if (strstr(temp_service->plugin_output, "(host is down)") == NULL) {

src/naemon/configuration.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,23 +1067,25 @@ read_config_file(const char *main_config_file, nagios_macros *mac)
10671067
allow_circular_dependencies = atoi(value);
10681068
} else if (!strcmp(variable, "host_down_disable_service_checks")) {
10691069
host_down_disable_service_checks = strtoul(value, NULL, 0);
1070+
} else if (!strcmp(variable, "service_parents_disable_service_checks")) {
1071+
service_parents_disable_service_checks = strtoul(value, NULL, 0);
10701072
} else if (!strcmp(variable, "service_skip_check_dependency_status")) {
10711073
service_skip_check_dependency_status = atoi(value);
1072-
if (service_skip_check_dependency_status < -1 || service_skip_check_dependency_status > 3) {
1074+
if (service_skip_check_dependency_status < -2 || service_skip_check_dependency_status > 3) {
10731075
nm_asprintf(&error_message, "Illegal value for service_skip_check_dependency_status");
10741076
error = TRUE;
10751077
break;
10761078
}
10771079
} else if (!strcmp(variable, "service_skip_check_host_down_status")) {
10781080
service_skip_check_host_down_status = atoi(value);
1079-
if (service_skip_check_host_down_status < -1 || service_skip_check_host_down_status > 3) {
1081+
if (service_skip_check_host_down_status < -2 || service_skip_check_host_down_status > 3) {
10801082
nm_asprintf(&error_message, "Illegal value for service_skip_check_host_down_status");
10811083
error = TRUE;
10821084
break;
10831085
}
10841086
} else if (!strcmp(variable, "host_skip_check_dependency_status")) {
10851087
host_skip_check_dependency_status = atoi(value);
1086-
if (host_skip_check_dependency_status < -1 || host_skip_check_dependency_status > 3) {
1088+
if (host_skip_check_dependency_status < -2 || host_skip_check_dependency_status > 3) {
10871089
nm_asprintf(&error_message, "Illegal value for host_skip_check_dependency_status");
10881090
error = TRUE;
10891091
break;

src/naemon/defaults.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@
8989
#define DEFAULT_ALLOW_CIRCULAR_DEPENDENCIES 0 /* Allow circular dependencies */
9090
#define DEFAULT_HOST_DOWN_DISABLE_SERVICE_CHECKS 0 /* run service checks if the host is down */
9191
#define DEFAULT_SKIP_CHECK_STATUS -1 /* do not change status by default */
92+
#define SKIP_KEEP_RUNNING_WHEN_UP -2 /* run service checks as long as the host and service is up (ok/warning) */
93+
#define DEFAULT_SERVICE_PARENTS_DISABLE_SERVICE_CHECKS 0 /* run service checks if service parents are down */
9294

9395
#define DEFAULT_HOST_PERFDATA_FILE_TEMPLATE "[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$"
9496
#define DEFAULT_SERVICE_PERFDATA_FILE_TEMPLATE "[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"

src/naemon/globals.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ extern unsigned long max_debug_file_size;
146146
extern int allow_empty_hostgroup_assignment;
147147
extern int allow_circular_dependencies;
148148
extern int host_down_disable_service_checks;
149+
extern int service_parents_disable_service_checks;
149150
extern int service_skip_check_dependency_status;
150151
extern int service_skip_check_host_down_status;
151152
extern int host_skip_check_dependency_status;

src/naemon/utils.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ char *use_timezone = NULL;
165165
int allow_empty_hostgroup_assignment = DEFAULT_ALLOW_EMPTY_HOSTGROUP_ASSIGNMENT;
166166
int allow_circular_dependencies = DEFAULT_ALLOW_CIRCULAR_DEPENDENCIES;
167167
int host_down_disable_service_checks = DEFAULT_HOST_DOWN_DISABLE_SERVICE_CHECKS;
168+
int service_parents_disable_service_checks = DEFAULT_SERVICE_PARENTS_DISABLE_SERVICE_CHECKS;
168169
int service_skip_check_dependency_status = DEFAULT_SKIP_CHECK_STATUS;
169170
int service_skip_check_host_down_status = DEFAULT_SKIP_CHECK_STATUS;
170171
int host_skip_check_dependency_status = DEFAULT_SKIP_CHECK_STATUS;

0 commit comments

Comments
 (0)