aboutsummaryrefslogtreecommitdiff
path: root/usr.sbin/watchdogd
diff options
context:
space:
mode:
authorSimon J. Gerraty <sjg@FreeBSD.org>2013-09-05 20:18:59 +0000
committerSimon J. Gerraty <sjg@FreeBSD.org>2013-09-05 20:18:59 +0000
commitd1d015864103b253b3fcb2f72a0da5b0cfeb31b6 (patch)
tree22b131dceb13c3df96da594fbaadb693504797c7 /usr.sbin/watchdogd
parent12d4083451fc39b3e831d4ea0bfa67d3b32cfb54 (diff)
parentb6f49c23a36f329cbf1e7f28078e17fd87f0e245 (diff)
downloadsrc-d1d015864103b253b3fcb2f72a0da5b0cfeb31b6.tar.gz
src-d1d015864103b253b3fcb2f72a0da5b0cfeb31b6.zip
Merge from head
Notes
Notes: svn path=/projects/bmake/; revision=255263
Diffstat (limited to 'usr.sbin/watchdogd')
-rw-r--r--usr.sbin/watchdogd/watchdogd.894
-rw-r--r--usr.sbin/watchdogd/watchdogd.c209
2 files changed, 279 insertions, 24 deletions
diff --git a/usr.sbin/watchdogd/watchdogd.8 b/usr.sbin/watchdogd/watchdogd.8
index b8a550516437..6176a2066741 100644
--- a/usr.sbin/watchdogd/watchdogd.8
+++ b/usr.sbin/watchdogd/watchdogd.8
@@ -27,7 +27,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd March 5, 2013
+.Dd July 27, 2013
.Dt WATCHDOGD 8
.Os
.Sh NAME
@@ -115,7 +115,7 @@ will terminate.
The
.Nm
utility recognizes the following runtime options:
-.Bl -tag -width ".Fl -softtimeout-action Ar action "
+.Bl -tag -width 30m
.It Fl I Ar file
Write the process ID of the
.Nm
@@ -208,6 +208,96 @@ device for
.Bl -tag -width ".Pa /var/run/watchdogd.pid" -compact
.It Pa /var/run/watchdogd.pid
.El
+.Sh EXAMPLES
+.Ss Debugging watchdogd and/or your watchdog script.
+This is a useful recipe for debugging
+.Nm
+and your watchdog script.
+.Pp
+(Note that ^C works oddly because
+.Nm
+calls
+.Xr system 3
+so the
+first ^C will terminate the "sleep" command.)
+.Pp
+Explanation of options used:
+.Bl -enum -offset indent -compact
+.It
+Set Debug on (--debug)
+.It
+Set the watchdog to trip at 30 seconds. (-t 30)
+.It
+Use of a softtimeout:
+.Bl -enum -offset indent -compact -nested
+.It
+Use a softtimeout (do not arm the hardware watchdog).
+(--softtimeout)
+.It
+Set the softtimeout action to do both kernel
+.Xr printf 9
+and
+.Xr log 9
+when it trips.
+(--softtimeout-action log,printf)
+.El
+.It
+Use of a pre-timeout:
+.Bl -enum -offset indent -compact -nested
+.It
+Set a pre-timeout of 15 seconds (this will later trigger a panic/dump).
+(--pretimeout 15)
+.It
+Set the pre-timeout action to also do kernel
+.Xr printf 9
+and
+.Xr log 9
+when it trips.
+(--pretimeout-action log,printf)
+.El
+.It
+Use of a script:
+.Bl -enum -offset indent -compact -nested
+.It
+Run "sleep 60" as a shell command that acts as the watchdog (-e 'sleep 60')
+.It
+Warn us when the script takes longer than 1 second to run (-w)
+.El
+.El
+.Bd -literal
+watchdogd --debug -t 30 \\
+ --softtimeout --softtimeout-action log,printf \\
+ --pretimeout 15 --pretimeout-action log,printf \\
+ -e 'sleep 60' -w
+.Ed
+.Ss Example of production use
+.Bl -enum -offset indent -compact
+.It
+Set hard timeout to 120 seconds (-t 120)
+.It
+Set a panic to happen at 60 seconds (to trigger a
+.Xr crash 8
+for dump analysis):
+.Bl -enum -offset indent -compact -nested
+.It
+Use of pre-timeout (--pretimeout 60)
+.It
+Specify pre-timeout action (--pretimeout-action log,printf,panic)
+.El
+.It
+Use of a script:
+.Bl -enum -offset indent -compact -nested
+.It
+Run your script (-e '/path/to/your/script 60')
+.It
+Log if your script takes longer than 15 seconds to run. (-w -T 15)
+.El
+.El
+.Bd -literal
+watchdogd -t 120 \\
+ --pretimeout 60 --pretimeout-action log,printf,panic \\
+ -e '/path/to/your/script 60' -w -T 15
+.Ed
.Sh SEE ALSO
.Xr watchdog 4 ,
.Xr watchdog 8 ,
diff --git a/usr.sbin/watchdogd/watchdogd.c b/usr.sbin/watchdogd/watchdogd.c
index 5416751d4cc3..5fd16f56964b 100644
--- a/usr.sbin/watchdogd/watchdogd.c
+++ b/usr.sbin/watchdogd/watchdogd.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rtprio.h>
#include <sys/stat.h>
#include <sys/time.h>
+#include <sys/sysctl.h>
#include <sys/watchdog.h>
#include <err.h>
@@ -49,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <paths.h>
#include <signal.h>
#include <stdio.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
@@ -58,19 +60,25 @@ __FBSDID("$FreeBSD$");
#include <getopt.h>
+static long fetchtimeout(int opt,
+ const char *longopt, const char *myoptarg, int zero_ok);
static void parseargs(int, char *[]);
+static int seconds_to_pow2ns(int);
static void sighandler(int);
static void watchdog_loop(void);
static int watchdog_init(void);
static int watchdog_onoff(int onoff);
static int watchdog_patpat(u_int timeout);
static void usage(void);
+static int tstotv(struct timeval *tv, struct timespec *ts);
+static int tvtohz(struct timeval *tv);
static int debugging = 0;
static int end_program = 0;
static const char *pidfile = _PATH_VARRUN "watchdogd.pid";
static u_int timeout = WD_TO_128SEC;
static u_int pretimeout = 0;
+static u_int timeout_sec;
static u_int passive = 0;
static int is_daemon = 0;
static int is_dry_run = 0; /* do not arm the watchdog, only
@@ -183,6 +191,59 @@ main(int argc, char *argv[])
}
}
+/*
+ * Expand a power-of-two exponent into a timespec: *ts receives
+ * 2^pow2ns nanoseconds, split into whole seconds and the nanosecond
+ * remainder.  pow2ns must lie in 0..63; the shift is undefined outside
+ * that range.
+ */
+static void
+pow2ns_to_ts(int pow2ns, struct timespec *ts)
+{
+	const uint64_t nsec_per_sec = 1000000000ULL;
+	uint64_t total_ns = 1ULL << pow2ns;
+
+	ts->tv_sec = total_ns / nsec_per_sec;
+	ts->tv_nsec = total_ns % nsec_per_sec;
+}
+
+/*
+ * Parse the argument of a timeout option and convert it to the form the
+ * kernel expects for watchdog timeouts: N, where the timeout is
+ * 2^N nanoseconds.
+ *
+ * opt       short option character; used in messages when longopt is NULL
+ * longopt   long option name without its leading dashes, or NULL
+ * myoptarg  option argument, a whole number of seconds; 0 selects
+ *           WD_TO_NEVER
+ *
+ * Returns the power-of-two exponent.  Exits via errx() when the value
+ * does not fit in an int or when tvtohz() reports no ticks for the
+ * resulting timeout.
+ */
+static u_int
+parse_timeout_to_pow2ns(char opt, const char *longopt, const char *myoptarg)
+{
+	long secs;
+	u_int rv;
+	struct timespec ts;
+	struct timeval tv;
+	int ticks;
+	char shortopt[] = "- ";
+	const char *dashes, *optname;
+
+	/* Name the option the way the user typed it: "--name" or "-c". */
+	if (longopt != NULL) {
+		dashes = "--";
+		optname = longopt;
+	} else {
+		shortopt[1] = opt;
+		dashes = "";
+		optname = shortopt;
+	}
+
+	secs = fetchtimeout(opt, longopt, myoptarg, 1);
+
+	/* seconds_to_pow2ns() takes an int; refuse values it cannot hold. */
+	if (secs > INT_MAX) {
+		errx(1, "Timeout for %s%s is too large, "
+		    "please choose a lower timeout.", dashes, optname);
+	}
+
+	if (secs == 0)
+		rv = WD_TO_NEVER;
+	else
+		rv = seconds_to_pow2ns((int)secs);
+
+	/* Round-trip through a timeval to see how many ticks this is. */
+	pow2ns_to_ts(rv, &ts);
+	tstotv(&tv, &ts);
+	ticks = tvtohz(&tv);
+	if (debugging) {
+		printf("Timeout for %s%s "
+		    "is 2^%u nanoseconds "
+		    "(in: %s sec -> out: %jd sec %ld ns -> %d ticks)\n",
+		    dashes, optname,
+		    rv,
+		    myoptarg, (intmax_t)ts.tv_sec, ts.tv_nsec, ticks);
+	}
+	if (ticks <= 0) {
+		errx(1, "Timeout for %s%s is too small, "
+		    "please choose a higher timeout.", dashes, optname);
+	}
+
+	return (rv);
+}
+
/*
* Catch signals and begin shutdown process.
*/
@@ -427,7 +488,7 @@ usage(void)
}
static long
-fetchtimeout(int opt, const char *longopt, const char *myoptarg)
+fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok)
{
const char *errstr;
char *p;
@@ -439,7 +500,7 @@ fetchtimeout(int opt, const char *longopt, const char *myoptarg)
rv = strtol(myoptarg, &p, 0);
if ((p != NULL && *p != '\0') || errno != 0)
errstr = "is not a number";
- if (rv <= 0)
+ if (rv < 0 || (!zero_ok && rv == 0))
errstr = "must be greater than zero";
if (errstr) {
if (longopt)
@@ -513,6 +574,110 @@ timeout_act_str2int(const char *lopt, const char *acts)
return rv;
}
+/*
+ * Convert a timespec to a timeval.  Seconds are copied unchanged and
+ * nanoseconds are truncated to whole microseconds.  Always returns 0.
+ */
+int
+tstotv(struct timeval *tv, struct timespec *ts)
+{
+	const long nsec_per_usec = 1000;
+
+	tv->tv_usec = ts->tv_nsec / nsec_per_usec;
+	tv->tv_sec = ts->tv_sec;
+	return (0);
+}
+
+/*
+ * Convert a timeval to a number of clock ticks, using the tick rate
+ * reported by the kern.hz sysctl.
+ * Mostly copied from the kernel.
+ *
+ * The result is rounded up, has one tick added, and is clamped to
+ * INT_MAX.  A negative interval yields 1.  Exits via err()/errx() when
+ * kern.hz cannot be read or is not positive.
+ */
+int
+tvtohz(struct timeval *tv)
+{
+	unsigned long ticks;
+	long sec, usec;
+	int hz;
+	size_t hzsize;
+	int error;
+	int tick;
+
+	hzsize = sizeof(hz);
+
+	error = sysctlbyname("kern.hz", &hz, &hzsize, NULL, 0);
+	if (error)
+		err(1, "sysctlbyname kern.hz");
+	if (hz <= 0)
+		errx(1, "invalid kern.hz value %d", hz);
+
+	/*
+	 * Microseconds per tick.  The integer division yields 0 once hz
+	 * exceeds 1000000, which would make the divisions below trap, so
+	 * never let it drop below 1.
+	 */
+	tick = 1000000 / hz;
+	if (tick == 0)
+		tick = 1;
+
+	/*
+	 * If the number of usecs in the whole seconds part of the time
+	 * difference fits in a long, then the total number of usecs will
+	 * fit in an unsigned long.  Compute the total and convert it to
+	 * ticks, rounding up and adding 1 to allow for the current tick
+	 * to expire.  Rounding also depends on unsigned long arithmetic
+	 * to avoid overflow.
+	 *
+	 * Otherwise, if the number of ticks in the whole seconds part of
+	 * the time difference fits in a long, then convert the parts to
+	 * ticks separately and add, using similar rounding methods and
+	 * overflow avoidance.  This method would work in the previous
+	 * case but it is slightly slower and assumes that hz is integral.
+	 *
+	 * Otherwise, round the time difference down to the maximum
+	 * representable value.
+	 *
+	 * If ints have 32 bits, then the maximum value for any timeout in
+	 * 10ms ticks is 248 days.
+	 */
+	sec = tv->tv_sec;
+	usec = tv->tv_usec;
+	if (usec < 0) {
+		/* Normalize so that usec is non-negative. */
+		sec--;
+		usec += 1000000;
+	}
+	if (sec < 0) {
+#ifdef DIAGNOSTIC
+		if (usec > 0) {
+			sec++;
+			usec -= 1000000;
+		}
+		printf("tvtohz: negative time difference %ld sec %ld usec\n",
+		    sec, usec);
+#endif
+		ticks = 1;
+	} else if (sec <= LONG_MAX / 1000000)
+		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+		    / tick + 1;
+	else if (sec <= LONG_MAX / hz)
+		ticks = sec * hz
+		    + ((unsigned long)usec + (tick - 1)) / tick + 1;
+	else
+		ticks = LONG_MAX;
+	if (ticks > INT_MAX)
+		ticks = INT_MAX;
+	return ((int)ticks);
+}
+
+/*
+ * Convert a positive number of seconds to the exponent N of the
+ * smallest power of two with 2^N nanoseconds >= "seconds".
+ *
+ * Exits via errx() when seconds is zero or negative.
+ */
+static int
+seconds_to_pow2ns(int seconds)
+{
+	uint64_t ns;
+	uint64_t shifted;
+	int power;
+
+	if (seconds <= 0)
+		errx(1, "seconds %d <= 0", seconds);
+	ns = ((uint64_t)seconds) * 1000000000ULL;
+
+	/*
+	 * flsll() returns the 1-based index of the most significant set
+	 * bit, so 1 << flsll(ns) is always strictly greater than ns and no
+	 * further adjustment is needed.  ns is a multiple of 10^9 and so
+	 * never an exact power of two, which makes this the ceiling of
+	 * log2(ns).
+	 */
+	power = flsll(ns);
+	shifted = 1ULL << power;
+	if (debugging) {
+		printf("shifted %lld\n", (long long)shifted);
+		printf("seconds_to_pow2ns: seconds: %d, ns %lld, power %d\n",
+		    seconds, (long long)ns, power);
+	}
+	return (power);
+}
+
+
/*
* Handle the few command line arguments supported.
*/
@@ -521,9 +686,7 @@ parseargs(int argc, char *argv[])
{
int longindex;
int c;
- char *p;
const char *lopt;
- double a;
/*
* if we end with a 'd' aka 'watchdogd' then we are the daemon program,
@@ -559,30 +722,21 @@ parseargs(int argc, char *argv[])
break;
#endif
case 's':
- nap = fetchtimeout(c, NULL, optarg);
+ nap = fetchtimeout(c, NULL, optarg, 0);
break;
case 'S':
do_syslog = 0;
break;
case 't':
- p = NULL;
- errno = 0;
- a = strtod(optarg, &p);
- if ((p != NULL && *p != '\0') || errno != 0)
- errx(EX_USAGE, "-t argument is not a number");
- if (a < 0)
- errx(EX_USAGE, "-t argument must be positive");
-
- if (a == 0)
- timeout = WD_TO_NEVER;
- else
- timeout = flsll(a * 1e9);
- if (debugging)
- printf("Timeout is 2^%d nanoseconds\n",
- timeout);
+ timeout_sec = atoi(optarg);
+ timeout = parse_timeout_to_pow2ns(c, NULL, optarg);
+ if (debugging)
+ printf("Timeout is 2^%d nanoseconds\n",
+ timeout);
break;
case 'T':
- carp_thresh_seconds = fetchtimeout(c, "NULL", optarg);
+ carp_thresh_seconds =
+ fetchtimeout(c, "NULL", optarg, 0);
break;
case 'w':
do_timedog = 1;
@@ -590,7 +744,7 @@ parseargs(int argc, char *argv[])
case 0:
lopt = longopts[longindex].name;
if (!strcmp(lopt, "pretimeout")) {
- pretimeout = fetchtimeout(0, lopt, optarg);
+ pretimeout = fetchtimeout(0, lopt, optarg, 0);
} else if (!strcmp(lopt, "pretimeout-action")) {
pretimeout_act = timeout_act_str2int(lopt,
optarg);
@@ -618,4 +772,15 @@ parseargs(int argc, char *argv[])
errx(EX_USAGE, "extra arguments.");
if (is_daemon && timeout < WD_TO_1SEC)
errx(EX_USAGE, "-t argument is less than one second.");
+ if (pretimeout_set) {
+ struct timespec ts;
+
+ pow2ns_to_ts(timeout, &ts);
+ if (pretimeout >= (uintmax_t)ts.tv_sec) {
+ errx(EX_USAGE,
+ "pretimeout (%d) >= timeout (%d -> %ld)\n"
+ "see manual section TIMEOUT RESOLUTION",
+ pretimeout, timeout_sec, (long)ts.tv_sec);
+ }
+ }
}