df78dc
commit e79e5040a0e7efd622ecdd572bee40c90e59c3bd
df78dc
Author: Miroslav Lichvar <mlichvar@redhat.com>
df78dc
Date:   Fri Apr 13 17:11:58 2018 +0200
df78dc
df78dc
    timemaster: restart terminated processes.
df78dc
    
df78dc
    If a ptp4l or phc2sys process is terminated (e.g. due to a crash) and
df78dc
    timemaster was running for at least one second (i.e. it's not an error
df78dc
    in ptp4l/phc2sys configuration), start the process again. Restart all
df78dc
    processes corresponding to the same time source at the same time to
df78dc
    ensure phc2sys is always connected to the currently running ptp4l.
df78dc
    
df78dc
    Add a new option to disable the restarting.
df78dc
    
df78dc
    Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
df78dc
df78dc
diff --git a/timemaster.8 b/timemaster.8
df78dc
index e0e22eb..7288972 100644
df78dc
--- a/timemaster.8
df78dc
+++ b/timemaster.8
df78dc
@@ -87,6 +87,16 @@ Specify the first number in a sequence of SHM segments that will be used by
df78dc
 can be useful to avoid conflicts with time sources that are not started by
df78dc
 \fBtimemaster\fR, e.g. \fBgpsd\fR using segments number 0 and 1.
df78dc
 
df78dc
+.TP
df78dc
+.B restart_processes
df78dc
+Enable or disable restarting of processes started by \fBtimemaster\fR. If the
df78dc
+option is set to a non-zero value, all processes except \fBchronyd\fR and
df78dc
+\fBntpd\fR will be automatically restarted when terminated and \fBtimemaster\fR
df78dc
+has been running for at least one second (i.e. the process did not terminate due to a
df78dc
+configuration error). If a process was terminated and is not started again,
df78dc
+\fBtimemaster\fR will kill the other processes and exit with a non-zero status.
df78dc
+The default value is 1 (enabled).
df78dc
+
df78dc
 .SS [ntp_server address]
df78dc
 
df78dc
 The \fBntp_server\fR section specifies an NTP server that should be used as a
df78dc
@@ -318,6 +328,7 @@ ptp4l_option delay_mechanism P2P
df78dc
 ntp_program chronyd
df78dc
 rundir /var/run/timemaster
df78dc
 first_shm_segment 1
df78dc
+restart_processes 0
df78dc
 
df78dc
 [chronyd]
df78dc
 path /usr/sbin/chronyd
df78dc
diff --git a/timemaster.c b/timemaster.c
df78dc
index fc3ba31..4ba921e 100644
df78dc
--- a/timemaster.c
df78dc
+++ b/timemaster.c
df78dc
@@ -44,6 +44,7 @@
df78dc
 #define DEFAULT_RUNDIR "/var/run/timemaster"
df78dc
 
df78dc
 #define DEFAULT_FIRST_SHM_SEGMENT 0
df78dc
+#define DEFAULT_RESTART_PROCESSES 1
df78dc
 
df78dc
 #define DEFAULT_NTP_PROGRAM CHRONYD
df78dc
 #define DEFAULT_NTP_MINPOLL 6
df78dc
@@ -108,6 +109,7 @@ struct timemaster_config {
df78dc
 	enum ntp_program ntp_program;
df78dc
 	char *rundir;
df78dc
 	int first_shm_segment;
df78dc
+	int restart_processes;
df78dc
 	struct program_config chronyd;
df78dc
 	struct program_config ntpd;
df78dc
 	struct program_config phc2sys;
df78dc
@@ -122,6 +124,9 @@ struct config_file {
df78dc
 struct script {
df78dc
 	struct config_file **configs;
df78dc
 	char ***commands;
df78dc
+	int **command_groups;
df78dc
+	int restart_groups;
df78dc
+	int no_restart_group;
df78dc
 };
df78dc
 
df78dc
 static void free_parray(void **a)
df78dc
@@ -385,6 +390,8 @@ static int parse_timemaster_settings(char **settings,
df78dc
 			replace_string(value, &config->rundir);
df78dc
 		} else if (!strcasecmp(name, "first_shm_segment")) {
df78dc
 			r = parse_int(value, &config->first_shm_segment);
df78dc
+		} else if (!strcasecmp(name, "restart_processes")) {
df78dc
+			r = parse_int(value, &config->restart_processes);
df78dc
 		} else {
df78dc
 			pr_err("unknown timemaster setting %s", name);
df78dc
 			return 1;
df78dc
@@ -508,6 +515,7 @@ static struct timemaster_config *config_parse(char *path)
df78dc
 	config->ntp_program = DEFAULT_NTP_PROGRAM;
df78dc
 	config->rundir = xstrdup(DEFAULT_RUNDIR);
df78dc
 	config->first_shm_segment = DEFAULT_FIRST_SHM_SEGMENT;
df78dc
+	config->restart_processes = DEFAULT_RESTART_PROCESSES;
df78dc
 
df78dc
 	init_program_config(&config->chronyd, "chronyd",
df78dc
 			    NULL, DEFAULT_CHRONYD_SETTINGS, NULL);
df78dc
@@ -632,6 +640,18 @@ static char *get_refid(char *prefix, unsigned int number)
df78dc
 	return NULL;
df78dc
 };
df78dc
 
df78dc
+static void add_command(char **command, int command_group,
df78dc
+			struct script *script)
df78dc
+{
df78dc
+	int *group;
df78dc
+
df78dc
+	parray_append((void ***)&script->commands, command);
df78dc
+
df78dc
+	group = xmalloc(sizeof(int));
df78dc
+	*group = command_group;
df78dc
+	parray_append((void ***)&script->command_groups, group);
df78dc
+}
df78dc
+
df78dc
 static void add_shm_source(int shm_segment, int poll, int dpoll, double delay,
df78dc
 			   char *ntp_options, char *prefix,
df78dc
 			   struct timemaster_config *config, char **ntp_config)
df78dc
@@ -671,8 +691,8 @@ static int add_ntp_source(struct ntp_server *source, char **ntp_config)
df78dc
 
df78dc
 static int add_ptp_source(struct ptp_domain *source,
df78dc
 			  struct timemaster_config *config, int *shm_segment,
df78dc
-			  int ***allocated_phcs, char **ntp_config,
df78dc
-			  struct script *script)
df78dc
+			  int *command_group, int ***allocated_phcs,
df78dc
+			  char **ntp_config, struct script *script)
df78dc
 {
df78dc
 	struct config_file *config_file;
df78dc
 	char **command, *uds_path, **interfaces, *message_tag;
df78dc
@@ -798,19 +818,19 @@ static int add_ptp_source(struct ptp_domain *source,
df78dc
 			/* HW time stamping */
df78dc
 			command = get_ptp4l_command(&config->ptp4l, config_file,
df78dc
 						    interfaces, 1);
df78dc
-			parray_append((void ***)&script->commands, command);
df78dc
+			add_command(command, *command_group, script);
df78dc
 
df78dc
 			command = get_phc2sys_command(&config->phc2sys,
df78dc
 						      source->domain,
df78dc
 						      source->phc2sys_poll,
df78dc
 						      *shm_segment, uds_path,
df78dc
 						      message_tag);
df78dc
-			parray_append((void ***)&script->commands, command);
df78dc
+			add_command(command, (*command_group)++, script);
df78dc
 		} else {
df78dc
 			/* SW time stamping */
df78dc
 			command = get_ptp4l_command(&config->ptp4l, config_file,
df78dc
 						    interfaces, 0);
df78dc
-			parray_append((void ***)&script->commands, command);
df78dc
+			add_command(command, (*command_group)++, script);
df78dc
 
df78dc
 			string_appendf(&config_file->content,
df78dc
 				       "clock_servo ntpshm\n"
df78dc
@@ -862,7 +882,8 @@ static char **get_ntpd_command(struct program_config *config,
df78dc
 }
df78dc
 
df78dc
 static struct config_file *add_ntp_program(struct timemaster_config *config,
df78dc
-					   struct script *script)
df78dc
+					   struct script *script,
df78dc
+					   int command_group)
df78dc
 {
df78dc
 	struct config_file *ntp_config = xmalloc(sizeof(*ntp_config));
df78dc
 	char **command = NULL;
df78dc
@@ -886,7 +907,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
df78dc
 	}
df78dc
 
df78dc
 	parray_append((void ***)&script->configs, ntp_config);
df78dc
-	parray_append((void ***)&script->commands, command);
df78dc
+	add_command(command, command_group, script);
df78dc
 
df78dc
 	return ntp_config;
df78dc
 }
df78dc
@@ -894,6 +915,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
df78dc
 static void script_destroy(struct script *script)
df78dc
 {
df78dc
 	char ***commands, **command;
df78dc
+	int **groups;
df78dc
 	struct config_file *config, **configs;
df78dc
 
df78dc
 	for (configs = script->configs; *configs; configs++) {
df78dc
@@ -911,6 +933,10 @@ static void script_destroy(struct script *script)
df78dc
 	}
df78dc
 	free(script->commands);
df78dc
 
df78dc
+	for (groups = script->command_groups; *groups; groups++)
df78dc
+		free(*groups);
df78dc
+	free(script->command_groups);
df78dc
+
df78dc
 	free(script);
df78dc
 }
df78dc
 
df78dc
@@ -920,12 +946,15 @@ static struct script *script_create(struct timemaster_config *config)
df78dc
 	struct source *source, **sources;
df78dc
 	struct config_file *ntp_config = NULL;
df78dc
 	int **allocated_phcs = (int **)parray_new();
df78dc
-	int ret = 0, shm_segment;
df78dc
+	int ret = 0, shm_segment, command_group = 0;
df78dc
 
df78dc
 	script->configs = (struct config_file **)parray_new();
df78dc
 	script->commands = (char ***)parray_new();
df78dc
+	script->command_groups = (int **)parray_new();
df78dc
+	script->no_restart_group = command_group;
df78dc
+	script->restart_groups = config->restart_processes;
df78dc
 
df78dc
-	ntp_config = add_ntp_program(config, script);
df78dc
+	ntp_config = add_ntp_program(config, script, command_group++);
df78dc
 	shm_segment = config->first_shm_segment;
df78dc
 
df78dc
 	for (sources = config->sources; (source = *sources); sources++) {
df78dc
@@ -936,7 +965,7 @@ static struct script *script_create(struct timemaster_config *config)
df78dc
 			break;
df78dc
 		case PTP_DOMAIN:
df78dc
 			if (add_ptp_source(&source->ptp, config, &shm_segment,
df78dc
-					   &allocated_phcs,
df78dc
+					   &command_group, &allocated_phcs,
df78dc
 					   &ntp_config->content, script))
df78dc
 				ret = 1;
df78dc
 			break;
df78dc
@@ -1063,10 +1092,11 @@ static int remove_config_files(struct config_file **configs)
df78dc
 
df78dc
 static int script_run(struct script *script)
df78dc
 {
df78dc
+	struct timespec ts_start, ts_now;
df78dc
 	sigset_t mask, old_mask;
df78dc
 	siginfo_t info;
df78dc
 	pid_t pid, *pids;
df78dc
-	int i, num_commands, status, ret = 0;
df78dc
+	int i, group, num_commands, status, quit = 0, ret = 0;
df78dc
 
df78dc
 	for (num_commands = 0; script->commands[num_commands]; num_commands++)
df78dc
 		;
df78dc
@@ -1101,7 +1131,9 @@ static int script_run(struct script *script)
df78dc
 		}
df78dc
 	}
df78dc
 
df78dc
-	/* wait for one of the blocked signals */
df78dc
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
df78dc
+
df78dc
+	/* process the blocked signals */
df78dc
 	while (1) {
df78dc
 		if (sigwaitinfo(&mask, &info) < 0) {
df78dc
 			if (errno == EINTR)
df78dc
@@ -1110,36 +1142,111 @@ static int script_run(struct script *script)
df78dc
 			break;
df78dc
 		}
df78dc
 
df78dc
-		/*
df78dc
-		 * assume only the first process (i.e. chronyd or ntpd) is
df78dc
-		 * essential and continue if other processes terminate
df78dc
-		 */
df78dc
-		if (info.si_signo == SIGCHLD && info.si_pid != pids[0]) {
df78dc
-			pr_info("process %d terminated (ignored)", info.si_pid);
df78dc
+		clock_gettime(CLOCK_MONOTONIC, &ts_now);
df78dc
+
df78dc
+		if (info.si_signo != SIGCHLD) {
df78dc
+			if (quit)
df78dc
+				continue;
df78dc
+
df78dc
+			quit = 1;
df78dc
+			pr_debug("exiting on signal %d", info.si_signo);
df78dc
+
df78dc
+			/* terminate remaining processes */
df78dc
+			for (i = 0; i < num_commands; i++) {
df78dc
+				if (pids[i] > 0) {
df78dc
+					pr_debug("killing process %d", pids[i]);
df78dc
+					kill(pids[i], SIGTERM);
df78dc
+				}
df78dc
+			}
df78dc
+
df78dc
 			continue;
df78dc
 		}
df78dc
 
df78dc
-		pr_info("received signal %d", info.si_signo);
df78dc
-		break;
df78dc
-	}
df78dc
+		/* wait for all terminated processes */
df78dc
+		while (1) {
df78dc
+			pid = waitpid(-1, &status, WNOHANG);
df78dc
+			if (pid <= 0)
df78dc
+				break;
df78dc
 
df78dc
-	/* kill all started processes */
df78dc
-	for (i = 0; i < num_commands; i++) {
df78dc
-		if (pids[i] > 0) {
df78dc
-			pr_debug("killing process %d", pids[i]);
df78dc
-			kill(pids[i], SIGTERM);
df78dc
+			if (!WIFEXITED(status)) {
df78dc
+				pr_info("process %d terminated abnormally",
df78dc
+					pid);
df78dc
+			} else {
df78dc
+				pr_info("process %d terminated with status %d",
df78dc
+					pid, WEXITSTATUS(status));
df78dc
+			}
df78dc
+
df78dc
+			for (i = 0; i < num_commands; i++) {
df78dc
+				if (pids[i] == pid)
df78dc
+					pids[i] = 0;
df78dc
+			}
df78dc
 		}
df78dc
-	}
df78dc
 
df78dc
-	while ((pid = wait(&status)) >= 0) {
df78dc
-		if (!WIFEXITED(status)) {
df78dc
-			pr_info("process %d terminated abnormally", pid);
df78dc
-			ret = 1;
df78dc
-		} else {
df78dc
-			if (WEXITSTATUS(status))
df78dc
+		/* wait for all processes to terminate when exiting */
df78dc
+		if (quit) {
df78dc
+			for (i = 0; i < num_commands; i++) {
df78dc
+				if (pids[i])
df78dc
+					break;
df78dc
+			}
df78dc
+			if (i == num_commands)
df78dc
+				break;
df78dc
+
df78dc
+			pr_debug("waiting for other processes to terminate");
df78dc
+			continue;
df78dc
+		}
df78dc
+
df78dc
+		/*
df78dc
+		 * terminate (and then restart if allowed) all processes in
df78dc
+		 * groups that have a terminated process
df78dc
+		 */
df78dc
+		for (group = 0; group < num_commands; group++) {
df78dc
+			int terminated = 0, running = 0;
df78dc
+
df78dc
+			for (i = 0; i < num_commands; i++) {
df78dc
+				if (*(script->command_groups[i]) != group)
df78dc
+					continue;
df78dc
+				if (pids[i])
df78dc
+					running++;
df78dc
+				else
df78dc
+					terminated++;
df78dc
+			}
df78dc
+
df78dc
+			if (!terminated)
df78dc
+				continue;
df78dc
+
df78dc
+			/*
df78dc
+			 * exit with a non-zero status if the group should not
df78dc
+			 * be restarted (i.e. chronyd/ntpd), timemaster is
df78dc
+			 * running only for a short time (and it is likely a
df78dc
+			 * configuration error), or restarting is disabled
df78dc
+			 * completely
df78dc
+			 */
df78dc
+			if (group == script->no_restart_group ||
df78dc
+			    ts_now.tv_sec - ts_start.tv_sec <= 1 ||
df78dc
+			    !script->restart_groups) {
df78dc
+				kill(getpid(), SIGTERM);
df78dc
 				ret = 1;
df78dc
-			pr_info("process %d terminated with status %d", pid,
df78dc
-				WEXITSTATUS(status));
df78dc
+				break;
df78dc
+			}
df78dc
+
df78dc
+			for (i = 0; i < num_commands; i++) {
df78dc
+				if (*(script->command_groups[i]) != group)
df78dc
+					continue;
df78dc
+
df78dc
+				/* terminate all processes in the group first */
df78dc
+				if (running && pids[i]) {
df78dc
+					pr_debug("killing process %d", pids[i]);
df78dc
+					kill(pids[i], SIGTERM);
df78dc
+				} else if (!running && !pids[i]) {
df78dc
+					pids[i] = start_program(script->commands[i],
df78dc
+								&old_mask);
df78dc
+					if (!pids[i])
df78dc
+						kill(getpid(), SIGTERM);
df78dc
+
df78dc
+					/* limit restarting rate */
df78dc
+					sleep(1);
df78dc
+				}
df78dc
+			}
df78dc
 		}
df78dc
 	}
df78dc
 
df78dc
@@ -1154,6 +1261,7 @@ static int script_run(struct script *script)
df78dc
 static void script_print(struct script *script)
df78dc
 {
df78dc
 	char ***commands, **command;
df78dc
+	int **groups;
df78dc
 	struct config_file *config, **configs;
df78dc
 
df78dc
 	for (configs = script->configs; *configs; configs++) {
df78dc
@@ -1162,7 +1270,9 @@ static void script_print(struct script *script)
df78dc
 	}
df78dc
 
df78dc
 	fprintf(stderr, "commands:\n\n");
df78dc
-	for (commands = script->commands; *commands; commands++) {
df78dc
+	for (commands = script->commands, groups = script->command_groups;
df78dc
+	     *commands; commands++, groups++) {
df78dc
+		fprintf(stderr, "[%d] ", **groups);
df78dc
 		for (command = *commands; *command; command++)
df78dc
 			fprintf(stderr, "%s ", *command);
df78dc
 		fprintf(stderr, "\n");