diff options
Diffstat (limited to 'usr.sbin/watchdogd/watchdogd.c')
| -rw-r--r-- | usr.sbin/watchdogd/watchdogd.c | 747 | 
1 files changed, 747 insertions, 0 deletions
| diff --git a/usr.sbin/watchdogd/watchdogd.c b/usr.sbin/watchdogd/watchdogd.c new file mode 100644 index 000000000000..27123f2143d0 --- /dev/null +++ b/usr.sbin/watchdogd/watchdogd.c @@ -0,0 +1,747 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2003-2004  Sean M. Kelly <smkelly@FreeBSD.org> + * Copyright (c) 2013 iXsystems.com, + *                    author: Alfred Perlstein <alfred@freebsd.org> + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Software watchdog daemon. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/rtprio.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/sysctl.h> +#include <sys/watchdog.h> + +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libutil.h> +#include <math.h> +#include <paths.h> +#include <signal.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <sysexits.h> +#include <syslog.h> +#include <unistd.h> + +#include <getopt.h> + +static long	fetchtimeout(int opt, +    const char *longopt, const char *myoptarg, int zero_ok); +static void	parseargs(int, char *[]); +static void	sighandler(int); +static void	watchdog_loop(void); +static int	watchdog_init(void); +static int	watchdog_onoff(int onoff); +static int	watchdog_patpat(sbintime_t); +static void	usage(void); +static int	tvtohz(struct timeval *tv); + +static int debugging = 0; +static int end_program = 0; +static const char *pidfile = _PATH_VARRUN "watchdogd.pid"; +static sbintime_t timeout = 128 * SBT_1S; +static u_int exit_timeout = WD_TO_NEVER; +static u_int pretimeout = 0; +static u_int timeout_sec; +static u_int nap = 10; +#ifdef notyet +static int passive = 0; +#endif +static int is_daemon = 0; +static int is_dry_run = 0;  /* do not arm the watchdog, only +			       report on timing of the watch +			       program */ +static int do_timedog = 0; +static int do_syslog = 1; +static int fd = -1; +static int carp_thresh_seconds = -1; +static char *test_cmd = NULL; + +static const char *getopt_shortopts; + +static int pretimeout_set; +static int pretimeout_act; +static int pretimeout_act_set; + +static int softtimeout_set; +static int softtimeout_act; +static int softtimeout_act_set; + +static struct option longopts[] = { +	{ "debug", no_argument, &debugging, 1 }, +	{ "pretimeout", required_argument, &pretimeout_set, 1 }, +	{ "pretimeout-action", required_argument, &pretimeout_act_set, 1 }, +	{ "softtimeout", no_argument, &softtimeout_set, 1 }, +	{ "softtimeout-action", required_argument, &softtimeout_act_set, 1 }, +	{ NULL, 0, NULL, 0} +}; + +/* + * Periodically pat the watchdog, preventing it from firing. + */ +int +main(int argc, char *argv[]) +{ +	struct rtprio rtp; +	struct pidfh *pfh; +	pid_t otherpid; + +	if (getuid() != 0) +		errx(EX_SOFTWARE, "not super user"); +		 +	parseargs(argc, argv); + +	if (do_syslog) +		openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR, +		    LOG_DAEMON); + +	rtp.type = RTP_PRIO_REALTIME; +	rtp.prio = 0; +	if (rtprio(RTP_SET, 0, &rtp) == -1) +		err(EX_OSERR, "rtprio"); + +	if (!is_dry_run && watchdog_init() == -1) +		errx(EX_SOFTWARE, "unable to initialize watchdog"); + +	if (is_daemon) { +		if (watchdog_onoff(1) == -1) +			err(EX_OSERR, "patting the dog"); + +		pfh = pidfile_open(pidfile, 0600, &otherpid); +		if (pfh == NULL) { +			if (errno == EEXIST) { +				watchdog_onoff(0); +				errx(EX_SOFTWARE, "%s already running, pid: %d", +				    getprogname(), otherpid); +			} +			warn("Cannot open or create pidfile"); +		} + +		if (debugging == 0 && daemon(0, 0) == -1) { +			watchdog_onoff(0); +			pidfile_remove(pfh); +			err(EX_OSERR, "daemon"); +		} + +		signal(SIGHUP, SIG_IGN); +		signal(SIGINT, sighandler); +		signal(SIGTERM, sighandler); + +		pidfile_write(pfh); +		if (madvise(0, 0, MADV_PROTECT) != 0) +			warn("madvise failed"); +		if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) +			warn("mlockall failed"); + +		watchdog_loop(); + +		/* exiting */ +		pidfile_remove(pfh); +		return (EX_OK); +	} else { +		if (watchdog_patpat(timeout) < 0) +			err(EX_OSERR, "patting the dog"); +		return (EX_OK); +	} +} + +/* + * Convert a timeout in seconds to N where 2^N nanoseconds is close to + * "seconds". + * + * The kernel expects the timeouts for watchdogs in "2^N nanosecond format". + */ +static sbintime_t +parse_timeout_to_sbt(char opt, const char *longopt, const char *myoptarg) +{ +	long a; +	sbintime_t rv; +	struct timeval tv; +	int ticks; +	char shortopt[] = "- "; + +	if (!longopt) +		shortopt[1] = opt; + +	a = fetchtimeout(opt, longopt, myoptarg, 1); + +	if (a == 0) +		rv = 0; +	else +		rv = a * SBT_1S; +	tv = sbttotv(rv); +	ticks = tvtohz(&tv); +	if (debugging) { +		printf("Timeout for %s%s " +		    "is " +		    "(in: %s sec -> out: %jd sec %ld us -> %d ticks)\n", +		    longopt ? "-" : "", longopt ? longopt : shortopt, +		    myoptarg, (intmax_t)tv.tv_sec, tv.tv_usec, ticks); +	} +	if (ticks <= 0) { +		errx(1, "Timeout for %s%s is too small, please choose a higher timeout.", longopt ? "-" : "", longopt ? longopt : shortopt); +	} + +	return (rv); +} + +/* + * Catch signals and begin shutdown process. + */ +static void +sighandler(int signum) +{ + +	if (signum == SIGINT || signum == SIGTERM) +		end_program = 1; +} + +/* + * Open the watchdog device. + */ +static int +watchdog_init(void) +{ + +	if (is_dry_run) +		return 0; + +	fd = open("/dev/" _PATH_WATCHDOG, O_RDWR); +	if (fd >= 0) +		return (0); +	warn("Could not open watchdog device"); +	return (-1); +} + +/* + * If we are doing timing, then get the time. + */ +static int +watchdog_getuptime(struct timespec *tp) +{ +	int error; + +	if (!do_timedog) +		return 0; + +	error = clock_gettime(CLOCK_UPTIME_FAST, tp); +	if (error) +		warn("clock_gettime"); +	return (error); +} + +static long +watchdog_check_dogfunction_time(struct timespec *tp_start, +    struct timespec *tp_end) +{ +	struct timeval tv_start, tv_end, tv_now, tv; +	const char *cmd_prefix, *cmd; +	struct timespec tp_now; +	int sec; + +	if (!do_timedog) +		return (0); + +	TIMESPEC_TO_TIMEVAL(&tv_start, tp_start); +	TIMESPEC_TO_TIMEVAL(&tv_end, tp_end); +	timersub(&tv_end, &tv_start, &tv); +	sec = tv.tv_sec; +	if (sec < carp_thresh_seconds) +		return (sec); + +	if (test_cmd) { +		cmd_prefix = "Watchdog program"; +		cmd = test_cmd; +	} else { +		cmd_prefix = "Watchdog operation"; +		cmd = "stat(\"/etc\", &sb)"; +	} +	if (do_syslog) +		syslog(LOG_CRIT, "%s: '%s' took too long: " +		    "%d.%06ld seconds >= %d seconds threshold", +		    cmd_prefix, cmd, sec, (long)tv.tv_usec, +		    carp_thresh_seconds); +	else +		warnx("%s: '%s' took too long: " +		    "%d.%06ld seconds >= %d seconds threshold", +		    cmd_prefix, cmd, sec, (long)tv.tv_usec, +		    carp_thresh_seconds); + +	/* +	 * Adjust the sleep interval again in case syslog(3) took a non-trivial +	 * amount of time to run. +	 */ +	if (watchdog_getuptime(&tp_now)) +		return (sec); +	TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now); +	timersub(&tv_now, &tv_start, &tv); +	sec = tv.tv_sec; + +	return (sec); +} + +/* + * Main program loop which is iterated every second. + */ +static void +watchdog_loop(void) +{ +	struct timespec ts_start, ts_end; +	struct stat sb; +	long waited; +	int error, failed; + +	while (end_program != 2) { +		failed = 0; + +		error = watchdog_getuptime(&ts_start); +		if (error) { +			end_program = 1; +			goto try_end; +		} + +		if (test_cmd != NULL) +			failed = system(test_cmd); +		else +			failed = stat("/etc", &sb); + +		error = watchdog_getuptime(&ts_end); +		if (error) { +			end_program = 1; +			goto try_end; +		} + +		if (failed == 0) +			watchdog_patpat(timeout); + +		waited = watchdog_check_dogfunction_time(&ts_start, &ts_end); +		if (nap - waited > 0) +			sleep(nap - waited); + +try_end: +		if (end_program != 0) { +			if (watchdog_onoff(0) == 0) { +				end_program = 2; +			} else { +				warnx("Could not stop the watchdog, not exiting"); +				end_program = 0; +			} +		} +	} +} + +/* + * Reset the watchdog timer. This function must be called periodically + * to keep the watchdog from firing. + */ +static int +watchdog_patpat(sbintime_t sbt) +{ + +	if (is_dry_run) +		return 0; + +	return ioctl(fd, WDIOC_SETTIMEOUT, &sbt); +} + +static int +watchdog_control(u_int control) +{ +	if (is_dry_run) +		return (0); + +	return ioctl(fd, WDIOC_CONTROL, &control); +} + +/* + * Toggle the kernel's watchdog. This routine is used to enable and + * disable the watchdog. + */ +static int +watchdog_onoff(int onoff) +{ +	int error; + +	/* fake successful watchdog op if a dry run */ +	if (is_dry_run) +		return 0; + +	if (onoff) { +		/* +		 * Call the WDIOC_SETSOFT regardless of softtimeout_set +		 * because we'll need to turn it off if someone had turned +		 * it on. +		 */ +		error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set); +		if (error) { +			warn("setting WDIOC_SETSOFT %d", softtimeout_set); +			return (error); +		} +		error = watchdog_patpat(timeout); +		if (error) { +			warn("watchdog_patpat failed"); +			goto failsafe; +		} +		if (softtimeout_act_set) { +			error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT, +			    &softtimeout_act); +			if (error) { +				warn("setting WDIOC_SETSOFTTIMEOUTACT %d", +				    softtimeout_act); +				goto failsafe; +			} +		} +		if (pretimeout_set) { +			error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout); +			if (error) { +				warn("setting WDIOC_SETPRETIMEOUT %d", +				    pretimeout); +				goto failsafe; +			} +		} +		if (pretimeout_act_set) { +			error = ioctl(fd, WDIOC_SETPRETIMEOUTACT, +			    &pretimeout_act); +			if (error) { +				warn("setting WDIOC_SETPRETIMEOUTACT %d", +				    pretimeout_act); +				goto failsafe; +			} +		} +		/* pat one more time for good measure */ +		return watchdog_patpat(timeout); +	 } else { +		return watchdog_control(WD_CTRL_DISABLE); +	 } +failsafe: +	watchdog_control(WD_CTRL_DISABLE); +	return (error); +} + +/* + * Tell user how to use the program. + */ +static void +usage(void) +{ +	if (is_daemon) +		fprintf(stderr, "usage:\n" +"  watchdogd [-dnSw] [-e cmd] [-I pidfile] [-s sleep] [-t timeout]\n" +"            [-T script_timeout] [-x exit_timeout]\n" +"            [--debug]\n" +"            [--pretimeout seconds] [-pretimeout-action action]\n" +"            [--softtimeout] [-softtimeout-action action]\n" +); +	else +		fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n"); +	exit(EX_USAGE); +} + +static long +fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok) +{ +	const char *errstr; +	char *p; +	long rv; + +	errstr = NULL; +	p = NULL; +	errno = 0; +	rv = strtol(myoptarg, &p, 0); +	if ((p != NULL && *p != '\0') || errno != 0) +		errstr = "is not a number"; +	if (rv < 0 || (!zero_ok && rv == 0)) +		errstr = "must be greater than zero"; +	if (errstr) { +		if (longopt)  +			errx(EX_USAGE, "--%s argument %s", longopt, errstr); +		else  +			errx(EX_USAGE, "-%c argument %s", opt, errstr); +	} +	return (rv); +} + +struct act_tbl { +	const char *at_act; +	int at_value; +}; + +static const struct act_tbl act_tbl[] = { +	{ "panic", WD_SOFT_PANIC }, +	{ "ddb", WD_SOFT_DDB }, +	{ "log", WD_SOFT_LOG }, +	{ "printf", WD_SOFT_PRINTF }, +	{ NULL, 0 } +}; + +static void +timeout_act_error(const char *lopt, const char *badact) +{ +	char *opts, *oldopts; +	int i; + +	opts = NULL; +	for (i = 0; act_tbl[i].at_act != NULL; i++) { +		oldopts = opts; +		if (asprintf(&opts, "%s%s%s", +		    oldopts == NULL ? "" : oldopts, +		    oldopts == NULL ? "" : ", ", +		    act_tbl[i].at_act) == -1) +			err(EX_OSERR, "malloc"); +		free(oldopts); +	} +	warnx("bad --%s argument '%s' must be one of (%s).", +	    lopt, badact, opts); +	usage(); +} + +/* + * Take a comma separated list of actions and or the flags + * together for the ioctl. + */ +static int +timeout_act_str2int(const char *lopt, const char *acts) +{ +	int i; +	char *dupacts, *tofree; +	char *o; +	int rv = 0; + +	tofree = dupacts = strdup(acts); +	if (!tofree) +		err(EX_OSERR, "malloc"); +	while ((o = strsep(&dupacts, ",")) != NULL) { +		for (i = 0; act_tbl[i].at_act != NULL; i++) { +			if (!strcmp(o, act_tbl[i].at_act)) { +				rv |= act_tbl[i].at_value; +				break; +			} +		} +		if (act_tbl[i].at_act == NULL) +			timeout_act_error(lopt, o); +	} +	free(tofree); +	return rv; +} + +/* + * Convert a timeval to a number of ticks. + * Mostly copied from the kernel. + */ +int +tvtohz(struct timeval *tv) +{ +	register unsigned long ticks; +	register long sec, usec; +	int hz; +	size_t hzsize; +	int error; +	int tick; + +	hzsize = sizeof(hz); + +	error = sysctlbyname("kern.hz", &hz, &hzsize, NULL, 0); +	if (error) +		err(1, "sysctlbyname kern.hz"); + +	tick = 1000000 / hz; + +	/* +	 * If the number of usecs in the whole seconds part of the time +	 * difference fits in a long, then the total number of usecs will +	 * fit in an unsigned long.  Compute the total and convert it to +	 * ticks, rounding up and adding 1 to allow for the current tick +	 * to expire.  Rounding also depends on unsigned long arithmetic +	 * to avoid overflow. +	 * +	 * Otherwise, if the number of ticks in the whole seconds part of +	 * the time difference fits in a long, then convert the parts to +	 * ticks separately and add, using similar rounding methods and +	 * overflow avoidance.  This method would work in the previous +	 * case but it is slightly slower and assumes that hz is integral. +	 * +	 * Otherwise, round the time difference down to the maximum +	 * representable value. +	 * +	 * If ints have 32 bits, then the maximum value for any timeout in +	 * 10ms ticks is 248 days. +	 */ +	sec = tv->tv_sec; +	usec = tv->tv_usec; +	if (usec < 0) { +		sec--; +		usec += 1000000; +	} +	if (sec < 0) { +#ifdef DIAGNOSTIC +		if (usec > 0) { +			sec++; +			usec -= 1000000; +		} +		printf("tvotohz: negative time difference %ld sec %ld usec\n", +		    sec, usec); +#endif +		ticks = 1; +	} else if (sec <= LONG_MAX / 1000000) +		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) +		    / tick + 1; +	else if (sec <= LONG_MAX / hz) +		ticks = sec * hz +		    + ((unsigned long)usec + (tick - 1)) / tick + 1; +	else +		ticks = LONG_MAX; +	if (ticks > INT_MAX) +		ticks = INT_MAX; +	return ((int)ticks); +} + +/* + * Handle the few command line arguments supported. + */ +static void +parseargs(int argc, char *argv[]) +{ +	struct timespec ts; +	int longindex; +	int c; +	const char *lopt; + +	/* Get the default value of timeout_sec from the default timeout. */ +	timeout_sec = sbintime_getsec(timeout); + +	/* +	 * if we end with a 'd' aka 'watchdogd' then we are the daemon program, +	 * otherwise run as a command line utility. +	 */ +	c = strlen(argv[0]); +	if (argv[0][c - 1] == 'd') +		is_daemon = 1; + +	if (is_daemon) +		getopt_shortopts = "I:de:ns:t:ST:wx:?"; +	else +		getopt_shortopts = "dt:?"; + +	while ((c = getopt_long(argc, argv, getopt_shortopts, longopts, +		    &longindex)) != -1) { +		switch (c) { +		case 'I': +			pidfile = optarg; +			break; +		case 'd': +			debugging = 1; +			break; +		case 'e': +			test_cmd = strdup(optarg); +			break; +		case 'n': +			is_dry_run = 1; +			break; +#ifdef notyet +		case 'p': +			passive = 1; +			break; +#endif +		case 's': +			nap = fetchtimeout(c, NULL, optarg, 0); +			break; +		case 'S': +			do_syslog = 0; +			break; +		case 't': +			timeout_sec = atoi(optarg); +			timeout = parse_timeout_to_sbt(c, NULL, optarg); +			if (debugging) +				printf("Timeout is %d\n", +				    (int)(timeout / SBT_1S)); +			break; +		case 'T': +			carp_thresh_seconds = +			    fetchtimeout(c, "NULL", optarg, 0); +			break; +		case 'w': +			do_timedog = 1; +			break; +		case 'x': +			exit_timeout = parse_timeout_to_sbt(c, NULL, optarg); +			if (exit_timeout != 0) +				exit_timeout |= WD_ACTIVE; +			break; +		case 0: +			lopt = longopts[longindex].name; +			if (!strcmp(lopt, "pretimeout")) { +				pretimeout = fetchtimeout(0, lopt, optarg, 0); +			} else if (!strcmp(lopt, "pretimeout-action")) { +				pretimeout_act = timeout_act_str2int(lopt, +				    optarg); +			} else if (!strcmp(lopt, "softtimeout-action")) { +				softtimeout_act = timeout_act_str2int(lopt, +				    optarg); +			} else { +		/*		warnx("bad option at index %d: %s", optind, +				    argv[optind]); +				usage(); +				*/ +			} +			break; +		case '?': +		default: +			usage(); +			/* NOTREACHED */ +		} +	} + +	if (nap > timeout_sec / 2) +		nap = timeout_sec / 2; + +	if (carp_thresh_seconds == -1) +		carp_thresh_seconds = nap; + +	if (argc != optind) +		errx(EX_USAGE, "extra arguments."); +	if (is_daemon && timeout < WD_TO_1SEC) +		errx(EX_USAGE, "-t argument is less than one second."); +	if (pretimeout_set) { +		if (pretimeout >= timeout_sec) { +			errx(EX_USAGE, +			    "pretimeout (%d) >= timeout (%d -> %ld)\n" +			    "see manual section TIMEOUT RESOLUTION", +			    pretimeout, timeout_sec, (long)ts.tv_sec); +		} +	} +} | 
