2 * This program is free software; you can redistribute it and/or modify it under
3 * the terms of the GNU General Public License as published by the Free Software
4 * Foundation; either version 2 of the License, or (at your option) any later
7 * This program is distributed in the hope that it will be useful, but WITHOUT
8 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
9 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12 * You should have received a copy of the GNU General Public License along with
13 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
14 * Place - Suite 330, Boston, MA 02111-1307, USA.
16 ******************************************************************************
19 * 2) Add the "cl_respawn recover" function, for use together with the recovery
20 * manager. But what should its strategy be?
21 * The pid will be passed via the environment.
22 * 3) Add the function for "-l" option ?
23 ******************************************************************************
27 * A small respawn tool which will start a program as a child process, and
28 * unless it exits with the "magic" exit code, will restart the program again
29 * if it exits (dies). It is intended that this respawn program should be usable
30 * in resource agent scripts and other places. The respawn tool should properly
31 * log all restarts, and all exits which it doesn't respawn, and run itself as a
32 * client of the apphb application heartbeating program, so that it can be
33 * restarted if the program it is monitoring dies.
35 * Author: Sun Jiang Dong <sunjd@cn.ibm.com>
36 * Copyright (c) 2004 International Business Machines
39 #include <lha_internal.h>
51 #include <sys/types.h>
55 #include <clplumbing/cl_log.h>
56 #include <clplumbing/cl_signal.h>
57 #include <clplumbing/uids.h>
58 #include <clplumbing/lsb_exitcodes.h>
59 #include <clplumbing/GSource.h>
60 #include <clplumbing/proctrack.h>
61 #include <clplumbing/Gmain_timeout.h>
62 #include <clplumbing/cl_pidfile.h>
65 static const char * Simple_helpscreen =
66 "Usage cl_respawn [<options>] <monitored_program> [<arg1>] [<arg2>] ...\n"
67 "Options are as below:\n"
68 "-m magic_exit_code\n"
69 " When monitored_program exit as this magic_exit_code, then cl_respawn\n"
70 " will not try to respawn it.\n"
72 " Set the interval(ms) of application hearbeat or plumbing its client.\n"
74 " Set the warning time (ms) of application heartbeat.\n"
76 " Set the name of a pid file to use.\n"
77 "-r Recover itself from crash. Only called by other monitor programs like"
78 " recovery manager.\n"
79 "-l List the program monitored by cl_respawn.\n"
80 " Notice: donnot support yet.\n"
81 "-h Display this simple help.\n";
84 static void become_daemon(void);
85 static int run_client_as_child(char * client_argv[]);
86 static gboolean plumb_client_and_emit_apphb(gpointer data);
87 static gboolean cl_respawn_quit(int signo, gpointer user_data);
88 static void separate_argv(int * argc_p, char *** argv_p, char *** client_argv);
89 static int cmd_str_to_argv(char * cmd_str, char *** argv);
90 static void free_argv(char ** argv);
92 /* Functions for handling the child quit/abort event
94 static void monitoredProcessDied(ProcTrack* p, int status, int signo
95 , int exitcode, int waslogged);
96 static void monitoredProcessRegistered(ProcTrack* p);
97 static const char * monitoredProcessName(ProcTrack* p);
99 static ProcTrack_ops MonitoredProcessTrackOps = {
100 monitoredProcessDied,
101 monitoredProcessRegistered,
106 INSTANCE_NAME_LEN = 20,
107 APPHB_INTVL_DETLA = 30; /* Avoid the incorrect warning message */
109 static const unsigned long
110 DEFAULT_APPHB_INTERVAL = 2000, /* MS */
111 DEFAULT_APPHB_WARNTIME = 6000; /* MS */
113 static int MAGIC_EXIT_CODE = 100;
115 static const char * app_name = "cl_respawn";
116 static gboolean REGTO_APPHBD = FALSE;
117 static char * pidfile = NULL;
120 * This pid will be equal to the PID of the process that was once the child of
121 * the dead cl_respawn.
123 static pid_t monitored_PID = 0;
125 static const char * optstr = "rm:i:w:p:lh";
126 static GMainLoop * mainloop = NULL;
127 static gboolean IS_RECOVERY = FALSE;
129 static gboolean shutting_down = FALSE;
131 int main(int argc, char * argv[])
133 char app_instance[INSTANCE_NAME_LEN];
135 int interval = DEFAULT_APPHB_INTERVAL;
136 int apphb_warntime = DEFAULT_APPHB_WARNTIME;
137 char ** client_argv = NULL;
140 cl_log_set_entity(app_name);
141 cl_log_enable_stderr(TRUE);
142 cl_log_set_facility(HA_LOG_FACILITY);
144 if (argc == 1) { /* no arguments */
145 printf("%s\n", Simple_helpscreen);
146 exit(LSB_EXIT_EINVAL);
150 * Try to separate cl_respawn's own option parameters from the client's.
151 * This may rewrite argc and argv.
153 separate_argv(&argc, &argv, &client_argv);
159 cl_log(LOG_INFO, "client_argv: 0x%08lx", (unsigned long) client_argv);
160 cl_log(LOG_INFO, "Called arg");
162 for (j=0; argv[j] != NULL; ++j) {
163 cl_log(LOG_INFO, "argv[%d]: %s", j, argv[j]);
166 for (j=0; client_argv && client_argv[j] != NULL; ++j) {
168 cl_log(LOG_INFO, "client_argv[%d]: %s", j, client_argv[j]);
175 option_char = getopt(argc, argv, optstr);
177 if (option_char == -1) {
181 switch (option_char) {
188 MAGIC_EXIT_CODE = atoi(optarg);
194 interval = atoi(optarg);
207 apphb_warntime = atoi(optarg);
217 printf("%s\n",Simple_helpscreen);
221 cl_log(LOG_ERR, "getopt returned"
222 "character code %c.", option_char);
223 printf("%s\n",Simple_helpscreen);
224 return LSB_EXIT_EINVAL;
230 * For now, assume that the recovery program passes the client name only via
231 * environment variables.
233 if ( (IS_RECOVERY == FALSE) && (client_argv == NULL) ) {
234 cl_log(LOG_ERR, "Please give the program name which will be "
235 "run as a child process of cl_respawn.");
236 printf("%s\n", Simple_helpscreen);
237 exit(LSB_EXIT_EINVAL);
240 if ((IS_RECOVERY == TRUE ) && ( client_argv == NULL)) {
242 * Here the client_argv must be NULL. At least now just
246 * Acquire the necessary information from the environment variables
247 * set by other daemons such as the recovery manager.
248 * RSP_PID: the PID of the process which needs to be monitored.
249 * RSP_CMD: the command line used to restart the program, in the same
250 * form as it would be given on the command line as above.
252 if ( getenv("RSP_PID") == NULL ) {
253 cl_log(LOG_ERR, "cannot get monitored PID from the "
254 "environment variable which should be set by "
255 "the recovery program.");
256 exit(LSB_EXIT_EINVAL);
258 monitored_PID = atoi(getenv("RSP_PID"));
262 * "client_argv == NULL" indicates that no client program was passed as
263 * a parameter by others such as a recovery manager, so expect
264 * it to be passed via the environment variable RSP_CMD, see
265 * below. If it cannot be obtained, quit.
267 if (client_argv == NULL) {
268 if (getenv("RSP_CMD") == NULL) {
269 cl_log(LOG_ERR, "cannot get the argument of the "
270 "monitored program from the environment "
271 "variable, which should be set by the "
272 "recovery program.");
275 if (0!=cmd_str_to_argv(getenv("RSP_CMD"), &client_argv)) {
276 cl_log(LOG_ERR, "Failed to transfer the CLI "
277 "string to the argv[] style.");
278 exit(LSB_EXIT_EINVAL);
283 /* Do not use the 'daemon' API since it is not part of POSIX */
289 cl_log(LOG_INFO,"%s", execv_argv[k]);
290 } while (execv_argv[++k] != NULL);
293 set_sigchld_proctrack(G_PRIORITY_HIGH,DEFAULT_MAXDISPATCHTIME);
295 if (( IS_RECOVERY == FALSE )) {
296 child_tmp = run_client_as_child(client_argv);
297 if (child_tmp > 0 ) {
298 cl_log(LOG_NOTICE, "started the monitored program %s, "
299 "whose PID is %d", client_argv[0], child_tmp);
301 exit(LSB_EXIT_GENERIC);
305 snprintf(app_instance, INSTANCE_NAME_LEN, "%s_%ldd"
306 , app_name, (long)getpid());
308 if (apphb_register(app_name, app_instance) != 0) {
309 cl_log(LOG_WARNING, "Failed to register with apphbd.");
310 cl_log(LOG_WARNING, "Maybe apphd isn't running.");
311 REGTO_APPHBD = FALSE;
314 cl_log(LOG_INFO, "Registered with apphbd.");
315 apphb_setinterval(interval);
316 apphb_setwarn(apphb_warntime);
317 /* To avoid the warning when app_interval is very small. */
320 Gmain_timeout_add(interval - APPHB_INTVL_DETLA
321 , plumb_client_and_emit_apphb, client_argv);
323 mainloop = g_main_new(FALSE);
324 g_main_run(mainloop);
326 if ( REGTO_APPHBD == TRUE ) {
335 run_client_as_child(char * execv_argv[])
340 if (execv_argv[0] == NULL) {
341 cl_log(LOG_ERR, "Null pointer to program name which need to"
343 return LSB_EXIT_EINVAL;
349 cl_log(LOG_ERR, "cannot start monitor program %s.",
352 } else if (pid > 0) { /* in the parent process */
353 NewTrackedProc( pid, 1, PT_LOGVERBOSE
354 , execv_argv, &MonitoredProcessTrackOps);
359 /* Now in child process */
360 execvp(execv_argv[0], execv_argv);
361 /* If we get here, execvp() must have failed */
362 cl_log(LOG_ERR, "%s",strerror(errno));
363 cl_log(LOG_ERR, "execving monitored program %s failed.", execv_argv[0]);
368 } while (execv_argv[++i] != NULL);
370 /* This is a parameter error, so the program need not be respawned */
371 exit(MAGIC_EXIT_CODE);
375 * Note: Since the working directory is changed to "/", the client name must
376 * include its pathname or be located in the system PATH.
386 if ((runningpid=cl_read_pidfile(pidfile)) > 0) {
387 cl_log(LOG_WARNING, "pidfile [%s] says we're already running as pid [%d]"
388 , pidfile, runningpid);
391 if (cl_lock_pidfile(pidfile) != 0) {
392 cl_log(LOG_ERR, "Cannot create pidfile [%s]"
394 exit(LSB_EXIT_GENERIC);
403 cl_log(LOG_ERR, "cannot start daemon.");
404 exit(LSB_EXIT_GENERIC);
405 } else if (pid > 0) {
410 if (chdir("/") < 0) {
411 cl_log(LOG_ERR, "cannot chroot to /.");
412 exit(LSB_EXIT_GENERIC);
418 for (j=0; j < 3; ++j) {
420 (void)open("/dev/null", j == 0 ? O_RDONLY : O_RDWR);
423 CL_IGNORE_SIG(SIGINT);
424 CL_IGNORE_SIG(SIGHUP);
426 G_main_add_SignalHandler(G_PRIORITY_DEFAULT, SIGTERM, cl_respawn_quit, NULL, NULL);
430 plumb_client_and_emit_apphb(gpointer data)
433 char ** client_argv = (char **) data;
435 if ( REGTO_APPHBD == TRUE ) {
441 /* cl_log(LOG_NOTICE,"donnot emit hb for test."); */
442 if ( IS_RECOVERY == TRUE && !(CL_PID_EXISTS(monitored_PID)) ) {
443 cl_log(LOG_INFO, "process %d exited.", monitored_PID);
445 new_pid = run_client_as_child(client_argv);
447 cl_log(LOG_NOTICE, "restart the monitored program %s,"
448 " whose PID is %d", client_argv[0], new_pid);
451 * donnot let recovery manager restart me again, avoid
454 cl_log(LOG_ERR, "Failed to restart the monitored "
455 "program %s, will exit.", client_argv[0]);
456 cl_respawn_quit(SIGTERM, NULL);
464 cl_respawn_quit(int signo, gpointer user_data)
466 shutting_down = TRUE;
467 if (monitored_PID != 0) {
468 cl_log(LOG_INFO, "Killing pid [%d] with SIGTERM"
470 /* DisableProcLogging(); */
471 if (kill(monitored_PID, SIGTERM) < 0) {
478 if (mainloop != NULL && g_main_is_running(mainloop)) {
479 DisableProcLogging();
480 g_main_quit(mainloop);
483 DisableProcLogging();
490 separate_argv(int * argc_p, char *** argv_p, char *** client_argv_p)
492 /* Search for the first non-option parameter */
495 *client_argv_p = NULL;
497 for (i=1; i < *argc_p; i++) {
498 if ( ((*argv_p)[i][0] != '-')
499 && (0 == stat((*argv_p)[i], &buf)) ) {
500 if ( S_ISREG(buf.st_mode)
501 && ((S_IXUSR| S_IXGRP | S_IXOTH) & buf.st_mode) ) {
508 * Could not find a valid program name to run as a child process of
509 * cl_respawn; this may be a recovery run.
515 *client_argv_p = calloc(*argc_p - i + 1, sizeof(char*));
516 if (*client_argv_p == NULL) {
517 cl_perror("separate_argv:calloc: ");
521 for (j=i; j < *argc_p; j++) {
522 (*client_argv_p)[j-i] = (*argv_p)[j];
532 cmd_str_to_argv(char * cmd_str, char *** client_argv_p)
534 const int MAX_NUM_OF_PARAMETER = 80;
539 if (cmd_str == NULL) {
540 return LSB_EXIT_EINVAL;
543 *client_argv_p = calloc(MAX_NUM_OF_PARAMETER, sizeof(char *));
544 if (*client_argv_p == NULL) {
545 cl_perror("cmd_str_to_argv:calloc: ");
546 return LSB_EXIT_GENERIC;
551 next = strchr(pre,' ');
554 len_tmp = strnlen(pre, 80);
555 (*client_argv_p)[index] = calloc(len_tmp+1, sizeof(char));
556 if (((*client_argv_p)[index]) == NULL ) {
557 cl_perror("cmd_str_to_argv:calloc: ");
558 return LSB_EXIT_GENERIC;
560 strncpy((*client_argv_p)[index], pre, len_tmp);
564 (*client_argv_p)[index] = calloc(next-pre+1, sizeof(char));
565 if (((*client_argv_p)[index]) == NULL ) {
566 cl_perror("cmd_str_to_argv:calloc: ");
567 return LSB_EXIT_GENERIC;
569 strncpy((*client_argv_p)[index], pre, next-pre);
571 /* Remove redundant spaces between parameters */
572 while ((char)(*next)==' ') {
577 if (++index >= MAX_NUM_OF_PARAMETER - 1) {
582 if (index >= MAX_NUM_OF_PARAMETER - 1) {
583 for (i = 0; i < MAX_NUM_OF_PARAMETER; i++) {
584 free((*client_argv_p)[i]);
586 free(*client_argv_p);
587 return LSB_EXIT_EINVAL;
590 (*client_argv_p)[index+1] = NULL;
596 monitoredProcessDied(ProcTrack* p, int status, int signo
597 , int exitcode, int waslogged)
600 char ** client_argv = (char **) p->privatedata;
601 const char * pname = p->ops->proctype(p);
604 cl_respawn_quit(SIGTERM, NULL);
605 p->privatedata = NULL;
609 if ( exitcode == MAGIC_EXIT_CODE) {
610 cl_log(LOG_INFO, "Don't restart the monitored program"
611 " %s [%d], since we got the magic exit code."
613 free_argv(client_argv);
614 cl_respawn_quit(SIGTERM, NULL); /* Does NOT always exit */
618 cl_log(LOG_INFO, "process %s[%d] exited, and its exit code is %d"
619 , pname, p->pid, exitcode);
620 if ( 0 < (new_pid = run_client_as_child(client_argv)) ) {
621 cl_log(LOG_NOTICE, "restarted the monitored program, whose PID "
624 cl_log(LOG_ERR, "Failed to restart the monitored program %s ,"
625 "will exit.", pname );
626 free_argv(client_argv);
627 cl_respawn_quit(SIGTERM, NULL); /* Does NOT always exit */
631 p->privatedata = NULL;
635 monitoredProcessRegistered(ProcTrack* p)
637 cl_log(LOG_INFO, "Child process [%s] started [ pid: %d ]."
638 , p->ops->proctype(p), p->pid);
642 monitoredProcessName(ProcTrack* p)
644 char ** argv = p->privatedata;
649 free_argv(char ** argv)
653 if ( argv == NULL ) {
658 if (argv[i] != NULL) {