Skip to content

Commit ddc4d76

Browse files
committed
launch command worker earlier
Since the command worker is forked from the main naemon process, it inherits all of its open files, e.g. the pidfile, logfiles, etc. It keeps those references open even if the main process rotates and reopens those files. This patch closes the query handler and pid file references after starting the command worker, and also moves starting the command worker before initializing the neb modules, so it won't inherit open logfiles from neb modules. references: - ConSol-Monitoring/omd#146 Signed-off-by: Sven Nierlein <[email protected]>
1 parent 22f0fb6 commit ddc4d76

File tree

6 files changed

+59
-36
lines changed

6 files changed

+59
-36
lines changed

src/naemon/commands.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "globals.h"
2020
#include "logging.h"
2121
#include "nm_alloc.h"
22+
#include "query-handler.h"
2223
#include "lib/libnaemon.h"
2324
#include <string.h>
2425
#include <sys/types.h>
@@ -388,6 +389,13 @@ int launch_command_file_worker(void)
388389
/* make our own process-group so we can be traced into and stuff */
389390
setpgid(0, 0);
390391

392+
393+
// close inherited file handles
394+
close_log_file();
395+
close_standard_fds();
396+
qh_close_socket();
397+
close_lockfile_fd();
398+
391399
str = nm_strdup(command_file);
392400
free_memory(get_global_macros());
393401
command_file = str;

src/naemon/naemon.c

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -543,22 +543,6 @@ int main(int argc, char **argv)
543543
nerd_init();
544544
timing_point("Initialized NERD\n");
545545

546-
/* initialize check workers */
547-
timing_point("Spawning %u workers\n", wproc_num_workers_spawned);
548-
if (init_workers(num_check_workers) < 0) {
549-
nm_log(NSLOG_RUNTIME_ERROR, "Failed to spawn workers. Aborting\n");
550-
exit(EXIT_FAILURE);
551-
}
552-
timing_point("Spawned %u workers\n", wproc_num_workers_spawned);
553-
554-
timing_point("Connecting %u workers\n", wproc_num_workers_online);
555-
i = 0;
556-
while (i < 50 && wproc_num_workers_online < wproc_num_workers_spawned) {
557-
iobroker_poll(nagios_iobs, 50);
558-
i++;
559-
}
560-
timing_point("Connected %u workers\n", wproc_num_workers_online);
561-
562546
/* read in all object config data */
563547
if (result == OK) {
564548
timing_point("Reading all object data\n");
@@ -576,6 +560,29 @@ int main(int argc, char **argv)
576560
init_event_queue();
577561
timing_point("Initialized Event queue\n");
578562

563+
registered_commands_init(200);
564+
register_core_commands();
565+
/* fire up command file worker */
566+
timing_point("Launching command file worker\n");
567+
launch_command_file_worker();
568+
timing_point("Launched command file worker\n");
569+
570+
/* initialize check workers */
571+
timing_point("Spawning %u workers\n", wproc_num_workers_spawned);
572+
if (init_workers(num_check_workers) < 0) {
573+
nm_log(NSLOG_RUNTIME_ERROR, "Failed to spawn workers. Aborting\n");
574+
exit(EXIT_FAILURE);
575+
}
576+
timing_point("Spawned %u workers\n", wproc_num_workers_spawned);
577+
578+
timing_point("Connecting %u workers\n", wproc_num_workers_online);
579+
i = 0;
580+
while (i < 50 && wproc_num_workers_online < wproc_num_workers_spawned) {
581+
iobroker_poll(nagios_iobs, 50);
582+
i++;
583+
}
584+
timing_point("Connected %u workers\n", wproc_num_workers_online);
585+
579586
/* load modules */
580587
timing_point("Loading modules\n");
581588
if (neb_load_all_modules() != OK) {
@@ -680,13 +687,6 @@ int main(int argc, char **argv)
680687
log_service_states(INITIAL_STATES, NULL);
681688
timing_point("Logged initial states\n");
682689

683-
registered_commands_init(200);
684-
register_core_commands();
685-
/* fire up command file worker */
686-
timing_point("Launching command file worker\n");
687-
launch_command_file_worker();
688-
timing_point("Launched command file worker\n");
689-
690690
broker_program_state(NEBTYPE_PROCESS_EVENTLOOPSTART, NEBFLAG_NONE, NEBATTR_NONE);
691691

692692
/* get event start time and save as macro */

src/naemon/query-handler.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ int qh_init(const char *path)
394394
result = iobroker_register(nagios_iobs, qh_listen_sock, NULL, qh_registration_input);
395395
if (result < 0) {
396396
g_hash_table_destroy(qh_table);
397-
close(qh_listen_sock);
397+
qh_close_socket();
398398
nm_log(NSLOG_RUNTIME_ERROR, "qh: Failed to register socket with io broker: %s\n", iobroker_strerror(result));
399399
return ERROR;
400400
}
@@ -408,3 +408,9 @@ int qh_init(const char *path)
408408

409409
return 0;
410410
}
411+
412+
/*
 * Close the query handler's listening socket if it is open and mark it as
 * closed, so that calling this more than once is harmless.
 *
 * Used by the qh_init() error path and by the command file worker to drop
 * the descriptor it inherited from the main process.
 */
void qh_close_socket(void)
{
	/*
	 * NOTE(review): descriptor 0 is skipped here. If qh_listen_sock is
	 * initialised to -1 (not visible from this hunk), this test should
	 * be ">= 0" -- confirm before changing.
	 */
	if (qh_listen_sock > 0) {
		close(qh_listen_sock);
	}
	qh_listen_sock = -1;
}

src/naemon/query-handler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ int qh_init(const char *path);
2020
void qh_deinit(const char *path);
2121
int qh_register_handler(const char *name, const char *description, unsigned int options, qh_handler handler);
2222
const char *qh_strerror(int code);
23+
void qh_close_socket(void);
2324

2425
NAGIOS_END_DECL
2526

src/naemon/utils.c

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ int host_skip_check_dependency_status = DEFAULT_SKIP_CHECK_STATUS;
172172

173173
static long long check_file_size(char *, unsigned long, struct rlimit);
174174

175+
static int lock_file_fd = -1; /* the file handle of the lockfile */
176+
175177
time_t max_check_result_file_age = DEFAULT_MAX_CHECK_RESULT_AGE;
176178

177179
check_stats check_statistics[MAX_CHECK_STATS_TYPES];
@@ -498,7 +500,6 @@ int signal_parent(int sig)
498500
int daemon_init(void)
499501
{
500502
int pid = 0;
501-
int lockfile = 0;
502503
int val = 0;
503504
char buf[256];
504505
struct flock lock;
@@ -509,16 +510,16 @@ int daemon_init(void)
509510

510511
umask(S_IWGRP | S_IWOTH);
511512

512-
lockfile = open(lock_file, O_RDWR | O_CREAT, S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH);
513+
lock_file_fd = open(lock_file, O_RDWR | O_CREAT, S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH);
513514

514-
if (lockfile < 0) {
515+
if (lock_file_fd < 0) {
515516
nm_log(NSLOG_RUNTIME_ERROR, "Failed to obtain lock on file %s: %s\n", lock_file, strerror(errno));
516517
nm_log(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR, "Bailing out due to errors encountered while attempting to daemonize... (PID=%d)", (int)getpid());
517518
return (ERROR);
518519
}
519520

520521
/* see if we can read the contents of the lockfile */
521-
if ((val = read(lockfile, buf, (size_t)10)) < 0) {
522+
if ((val = read(lock_file_fd, buf, (size_t)10)) < 0) {
522523
nm_log(NSLOG_RUNTIME_ERROR, "Lockfile exists but cannot be read");
523524
return (ERROR);
524525
}
@@ -540,7 +541,7 @@ int daemon_init(void)
540541
lock.l_start = 0;
541542
lock.l_whence = SEEK_SET;
542543
lock.l_len = 0;
543-
if (fcntl(lockfile, F_GETLK, &lock) == -1) {
544+
if (fcntl(lock_file_fd, F_GETLK, &lock) == -1) {
544545
nm_log(NSLOG_RUNTIME_ERROR, "Failed to access lockfile '%s'. %s. Bailing out...", lock_file, strerror(errno));
545546
return (ERROR);
546547
}
@@ -609,9 +610,9 @@ int daemon_init(void)
609610
lock.l_whence = SEEK_SET;
610611
lock.l_len = 0;
611612
lock.l_pid = getpid();
612-
if (fcntl(lockfile, F_SETLK, &lock) == -1) {
613+
if (fcntl(lock_file_fd, F_SETLK, &lock) == -1) {
613614
if (errno == EACCES || errno == EAGAIN) {
614-
fcntl(lockfile, F_GETLK, &lock);
615+
fcntl(lock_file_fd, F_GETLK, &lock);
615616
nm_log(NSLOG_RUNTIME_ERROR, "Lockfile '%s' looks like its already held by another instance of Naemon (PID %d). Bailing out, post-fork...", lock_file, (int)lock.l_pid);
616617
} else
617618
nm_log(NSLOG_RUNTIME_ERROR, "Cannot lock lockfile '%s': %s. Bailing out...", lock_file, strerror(errno));
@@ -620,28 +621,34 @@ int daemon_init(void)
620621
}
621622

622623
/* write PID to lockfile... */
623-
lseek(lockfile, 0, SEEK_SET);
624-
if (ftruncate(lockfile, 0) != 0) {
624+
lseek(lock_file_fd, 0, SEEK_SET);
625+
if (ftruncate(lock_file_fd, 0) != 0) {
625626
nm_log(NSLOG_RUNTIME_ERROR, "Cannot truncate lockfile '%s': %s. Bailing out...", lock_file, strerror(errno));
626627
return (ERROR);
627628
}
628629
sprintf(buf, "%d\n", (int)getpid());
629630

630-
if (nsock_write_all(lockfile, buf, strlen(buf)) != 0) {
631+
if (nsock_write_all(lock_file_fd, buf, strlen(buf)) != 0) {
631632
nm_log(NSLOG_RUNTIME_ERROR, "Cannot write PID to lockfile '%s': %s. Bailing out...", lock_file, strerror(errno));
632633
return (ERROR);
633634
}
634635

635636
/* make sure lock file stays open while program is executing... */
636-
val = fcntl(lockfile, F_GETFD, 0);
637+
val = fcntl(lock_file_fd, F_GETFD, 0);
637638
val |= FD_CLOEXEC;
638-
fcntl(lockfile, F_SETFD, val);
639+
fcntl(lock_file_fd, F_SETFD, val);
639640

640641
broker_program_state(NEBTYPE_PROCESS_DAEMONIZE, NEBFLAG_NONE, NEBATTR_NONE);
641642

642643
return OK;
643644
}
644645

646+
/*
 * Close the lock file descriptor opened by daemon_init(), if any, and mark
 * it as closed so that calling this more than once is harmless.
 *
 * lock_file_fd is -1 until daemon_init() assigns it from open(), so any
 * non-negative value is an open descriptor.
 */
void close_lockfile_fd(void)
{
	/* 0 is a valid descriptor; only a negative value means "not open" */
	if (lock_file_fd >= 0) {
		close(lock_file_fd);
	}
	lock_file_fd = -1;
}
651+
645652
/******************************************************************/
646653
/************************* FILE FUNCTIONS *************************/
647654
/******************************************************************/

src/naemon/utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void signal_react(void); /* General signal reaction routines */
3737
void handle_sigxfsz(void); /* handle SIGXFSZ */
3838
int signal_parent(int); /* signal parent when daemonizing */
3939
int daemon_init(void); /* switches to daemon mode */
40+
void close_lockfile_fd(void); /* close lock_file file handle */
4041

4142
int init_check_stats(void);
4243
int update_check_stats(int, time_t);

0 commit comments

Comments
 (0)