From 8df78c41d60219744f69ac9ec4e2842407fdca40 Mon Sep 17 00:00:00 2001 From: Jeff Roberson Date: Thu, 17 Apr 2008 04:20:10 +0000 Subject: - Make SCHED_STATS more generic by adding a wrapper to create the variables and sysctl nodes. - In reset walk the children of kern_sched_stats and reset the counters via the oid_arg1 pointer. This allows us to add arbitrary counters to the tree and still reset them properly. - Define a set of switch types to be passed with flags to mi_switch(). These types are named SWT_*. These types correspond to SCHED_STATS counters and are automatically handled in this way. - Make the new SWT_ types more specific than the older switch stats. There are now stats for idle switches, remote idle wakeups, remote preemption ithreads idling, etc. - Add switch statistics for ULE's pickcpu algorithm. These stats include how much migration there is, how often affinity was successful, how often threads were migrated to the local cpu on wakeup, etc. Sponsored by: Nokia --- sys/kern/kern_intr.c | 4 +-- sys/kern/kern_subr.c | 2 +- sys/kern/kern_switch.c | 66 ++++++++++++++++++++++++++++------------------ sys/kern/kern_synch.c | 6 ++++- sys/kern/kern_thread.c | 4 +-- sys/kern/sched_4bsd.c | 10 +++---- sys/kern/sched_ule.c | 38 ++++++++++++++++++++------ sys/kern/subr_sleepqueue.c | 6 ++--- sys/kern/subr_trap.c | 3 +-- sys/kern/subr_turnstile.c | 3 +-- sys/sys/proc.h | 24 ++++++++++++++--- sys/sys/sched.h | 18 +++++++------ sys/sys/sysctl.h | 1 + sys/vm/vm_glue.c | 2 +- sys/vm/vm_zeroidle.c | 2 +- 15 files changed, 122 insertions(+), 67 deletions(-) diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index e626988f5bea..5e464f9f3a0d 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -1231,7 +1231,7 @@ ithread_loop(void *arg) if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IWAIT, NULL); } thread_unlock(td); } @@ -1389,7 +1389,7 @@ ithread_loop(void *arg) if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IWAIT, NULL); } thread_unlock(td); } diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 2101026ce5e4..c93f262626ee 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -456,7 +456,7 @@ uio_yield(void) DROP_GIANT(); thread_lock(td); sched_prio(td, td->td_user_pri); - mi_switch(SW_INVOL, NULL); + mi_switch(SW_INVOL | SWT_RELINQUISH, NULL); thread_unlock(td); PICKUP_GIANT(); } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 10bfb73dd3dc..b7cb0b6b82db 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -73,25 +73,35 @@ static int kern_sched_preemption = 0; SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); +/* + * Support for scheduler stats exported via kern.sched.stats. All stats may + * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere + * with SCHED_STAT_DEFINE(). 
+ */ #ifdef SCHED_STATS -long switch_preempt; -long switch_owepreempt; -long switch_turnstile; -long switch_sleepq; -long switch_sleepqtimo; -long switch_relinquish; -long switch_needresched; -static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, ""); -SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, ""); +long sched_switch_stats[SWT_COUNT]; /* Switch reasons from mi_switch(). */ + +SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); +SCHED_STAT_DEFINE_VAR(uncategorized, &sched_switch_stats[SWT_NONE], ""); +SCHED_STAT_DEFINE_VAR(preempt, &sched_switch_stats[SWT_PREEMPT], ""); +SCHED_STAT_DEFINE_VAR(owepreempt, &sched_switch_stats[SWT_OWEPREEMPT], ""); +SCHED_STAT_DEFINE_VAR(turnstile, &sched_switch_stats[SWT_TURNSTILE], ""); +SCHED_STAT_DEFINE_VAR(sleepq, &sched_switch_stats[SWT_SLEEPQ], ""); +SCHED_STAT_DEFINE_VAR(sleepqtimo, &sched_switch_stats[SWT_SLEEPQTIMO], ""); +SCHED_STAT_DEFINE_VAR(relinquish, &sched_switch_stats[SWT_RELINQUISH], ""); +SCHED_STAT_DEFINE_VAR(needresched, &sched_switch_stats[SWT_NEEDRESCHED], ""); +SCHED_STAT_DEFINE_VAR(idle, &sched_switch_stats[SWT_IDLE], ""); +SCHED_STAT_DEFINE_VAR(iwait, &sched_switch_stats[SWT_IWAIT], ""); +SCHED_STAT_DEFINE_VAR(suspend, &sched_switch_stats[SWT_SUSPEND], ""); +SCHED_STAT_DEFINE_VAR(remotepreempt, &sched_switch_stats[SWT_REMOTEPREEMPT], + ""); +SCHED_STAT_DEFINE_VAR(remotewakeidle, &sched_switch_stats[SWT_REMOTEWAKEIDLE], + ""); + static int sysctl_stats_reset(SYSCTL_HANDLER_ARGS) { + struct sysctl_oid *p; int error; int val; @@ -101,14 +111,15 @@ sysctl_stats_reset(SYSCTL_HANDLER_ARGS) return (error); if (val == 0) return (0); - switch_preempt = 0; - switch_owepreempt = 0; - switch_turnstile = 0; - switch_sleepq = 0; - switch_sleepqtimo = 0; - switch_relinquish = 0; - switch_needresched = 0; - + /* + * Traverse the list of children of _kern_sched_stats and reset each + * to 0. Skip the reset entry. 
+ */ + SLIST_FOREACH(p, oidp->oid_parent, oid_link) { + if (p == oidp || p->oid_arg1 == NULL) + continue; + *(long *)p->oid_arg1 = 0; + } return (0); } @@ -164,6 +175,7 @@ void critical_exit(void) { struct thread *td; + int flags; td = curthread; KASSERT(td->td_critnest != 0, @@ -175,8 +187,12 @@ critical_exit(void) td->td_critnest = 1; thread_lock(td); td->td_critnest--; - SCHED_STAT_INC(switch_owepreempt); - mi_switch(SW_INVOL|SW_PREEMPT, NULL); + flags = SW_INVOL | SW_PREEMPT; + if (TD_IS_IDLETHREAD(td)) + flags |= SWT_IDLE; + else + flags |= SWT_OWEPREEMPT; + mi_switch(flags, NULL); thread_unlock(td); } } else diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index b4defe929a4c..c322ace3970f 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" +#include "opt_sched.h" #include #include @@ -390,6 +391,9 @@ mi_switch(int flags, struct thread *newtd) td->td_ru.ru_nvcsw++; else td->td_ru.ru_nivcsw++; +#ifdef SCHED_STATS + SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); +#endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. @@ -533,7 +537,7 @@ yield(struct thread *td, struct yield_args *uap) thread_lock(td); sched_prio(td, PRI_MAX_TIMESHARE); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); td->td_retval[0] = 0; return (0); diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 8745e5e9b990..a3d5da72f53e 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -723,7 +723,7 @@ thread_suspend_check(int return_instead) td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); - mi_switch(SW_INVOL, NULL); + mi_switch(SW_INVOL | SWT_SUSPEND, NULL); if (return_instead == 0) td->td_flags &= ~TDF_BOUNDARY; thread_unlock(td); @@ -756,7 +756,7 @@ thread_suspend_switch(struct thread *td) sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 68798019bd64..ed5cf628bbd0 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -316,8 +316,7 @@ maybe_preempt(struct thread *td) TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_name); - SCHED_STAT_INC(switch_preempt); - mi_switch(SW_INVOL|SW_PREEMPT, td); + mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td); /* * td's lock pointer may have changed. We have to return with it * locked. 
@@ -1332,7 +1331,7 @@ sched_preempt(struct thread *td) if (td->td_critnest > 1) td->td_owepreempt = 1; else - mi_switch(SW_INVOL | SW_PREEMPT, NULL); + mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL); thread_unlock(td); } @@ -1397,8 +1396,7 @@ void sched_relinquish(struct thread *td) { thread_lock(td); - SCHED_STAT_INC(switch_relinquish); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } @@ -1448,7 +1446,7 @@ sched_idletd(void *dummy) cpu_idle(); mtx_lock_spin(&sched_lock); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IDLE, NULL); mtx_unlock_spin(&sched_lock); } } diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 911b169ce85d..c03f7c86f786 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -909,7 +909,7 @@ tdq_idled(struct tdq *tdq) } spinlock_exit(); TDQ_UNLOCK(steal); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); return (0); @@ -1073,6 +1073,13 @@ sched_setcpu(struct thread *td, int cpu, int flags) return (tdq); } +SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); +SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); +SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); +SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); +SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); +SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); + static int sched_pickcpu(struct thread *td, int flags) { @@ -1098,8 +1105,10 @@ sched_pickcpu(struct thread *td, int flags) * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && - curthread->td_intr_nesting_level) + curthread->td_intr_nesting_level && ts->ts_cpu != self) { + SCHED_STAT_INC(pickcpu_intrbind); ts->ts_cpu = self; + } /* * If the thread can run on the last cpu and the affinity has not * expired or it is idle run it there. @@ -1107,10 +1116,14 @@ sched_pickcpu(struct thread *td, int flags) pri = td->td_priority; tdq = TDQ_CPU(ts->ts_cpu); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) { - if (tdq->tdq_lowpri > PRI_MIN_IDLE) + if (tdq->tdq_lowpri > PRI_MIN_IDLE) { + SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); - if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) + } + if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) { + SCHED_STAT_INC(pickcpu_affinity); return (ts->ts_cpu); + } } /* * Search for the highest level in the tree that still has affinity. @@ -1129,8 +1142,13 @@ sched_pickcpu(struct thread *td, int flags) * Compare the lowest loaded cpu to current cpu. 
*/ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri && - TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) + TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) { + SCHED_STAT_INC(pickcpu_local); cpu = self; + } else + SCHED_STAT_INC(pickcpu_lowest); + if (cpu != ts->ts_cpu) + SCHED_STAT_INC(pickcpu_migration); KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu.")); return (cpu); } @@ -1989,10 +2007,15 @@ sched_preempt(struct thread *td) TDQ_LOCK_ASSERT(tdq, MA_OWNED); tdq->tdq_ipipending = 0; if (td->td_priority > tdq->tdq_lowpri) { + int flags; + + flags = SW_INVOL | SW_PREEMPT; if (td->td_critnest > 1) td->td_owepreempt = 1; + else if (TD_IS_IDLETHREAD(td)) + mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL); else - mi_switch(SW_INVOL | SW_PREEMPT, NULL); + mi_switch(flags | SWT_REMOTEPREEMPT, NULL); } thread_unlock(td); } @@ -2378,8 +2401,7 @@ void sched_relinquish(struct thread *td) { thread_lock(td); - SCHED_STAT_INC(switch_relinquish); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index 9edd56e61ac7..1dbd1db3616d 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -486,8 +486,7 @@ sleepq_switch(void *wchan, int pri) sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); TD_SET_SLEEPING(td); - SCHED_STAT_INC(switch_sleepq); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_SLEEPQ, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); @@ -527,8 +526,7 @@ sleepq_check_timeout(void) else if (callout_stop(&td->td_slpcallout) == 0) { td->td_flags |= TDF_TIMEOUT; TD_SET_SLEEPING(td); - SCHED_STAT_INC(switch_sleepqtimo); - mi_switch(SW_INVOL, NULL); + mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL); } return (0); } diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index a92abd28c6db..3d1948dce4e1 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -211,8 +211,7 @@ ast(struct trapframe *framep) #endif thread_lock(td); sched_prio(td, td->td_user_pri); - SCHED_STAT_INC(switch_needresched); - mi_switch(SW_INVOL, NULL); + mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL); thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c index fd6cdbdf70cf..7b8270ac75ce 100644 --- a/sys/kern/subr_turnstile.c +++ b/sys/kern/subr_turnstile.c @@ -741,8 +741,7 @@ turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) td->td_tid, lock, lock->lo_name); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); - SCHED_STAT_INC(switch_turnstile); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_TURNSTILE, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", diff --git a/sys/sys/proc.h b/sys/sys/proc.h index a3e055f48543..86adbb1c4c1c 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -588,10 +588,26 @@ struct proc { #ifdef _KERNEL -/* Flags for mi_switch(). */ -#define SW_VOL 0x0001 /* Voluntary switch. */ -#define SW_INVOL 0x0002 /* Involuntary switch. */ -#define SW_PREEMPT 0x0004 /* The invol switch is a preemption */ +/* Types and flags for mi_switch(). */ +#define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ +#define SWT_NONE 0 /* Unspecified switch. */ +#define SWT_PREEMPT 1 /* Switching due to preemption. */ +#define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. 
*/ +#define SWT_TURNSTILE 3 /* Turnstile contention. */ +#define SWT_SLEEPQ 4 /* Sleepq wait. */ +#define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ +#define SWT_RELINQUISH 6 /* yield call. */ +#define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ +#define SWT_IDLE 8 /* Switching from the idle thread. */ +#define SWT_IWAIT 9 /* Waiting for interrupts. */ +#define SWT_SUSPEND 10 /* Thread suspended. */ +#define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ +#define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ +#define SWT_COUNT 13 /* Number of switch types. */ +/* Flags */ +#define SW_VOL 0x0100 /* Voluntary switch. */ +#define SW_INVOL 0x0200 /* Involuntary switch. */ +#define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 diff --git a/sys/sys/sched.h b/sys/sys/sched.h index fa57055936be..bbd2199c7e58 100644 --- a/sys/sys/sched.h +++ b/sys/sys/sched.h @@ -154,17 +154,19 @@ sched_unpin(void) #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ -/* Switch stats. */ +/* Scheduler stats. */ #ifdef SCHED_STATS -extern long switch_preempt; -extern long switch_owepreempt; -extern long switch_turnstile; -extern long switch_sleepq; -extern long switch_sleepqtimo; -extern long switch_relinquish; -extern long switch_needresched; +extern long sched_switch_stats[SWT_COUNT]; + +#define SCHED_STAT_DEFINE_VAR(name, ptr, descr) \ + SYSCTL_LONG(_kern_sched_stats, OID_AUTO, name, CTLFLAG_RD, ptr, 0, descr) +#define SCHED_STAT_DEFINE(name, descr) \ + unsigned long name; \ + SCHED_STAT_DEFINE_VAR(name, &name, descr) #define SCHED_STAT_INC(var) atomic_add_long(&(var), 1) #else +#define SCHED_STAT_DEFINE_VAR(name, descr, ptr) +#define SCHED_STAT_DEFINE(name, descr) #define SCHED_STAT_INC(var) #endif diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h index 59543ba16a7c..ebd83e7ba300 100644 --- a/sys/sys/sysctl.h +++ b/sys/sys/sysctl.h @@ -632,6 +632,7 @@ SYSCTL_DECL(_kern_features); SYSCTL_DECL(_kern_ipc); SYSCTL_DECL(_kern_proc); SYSCTL_DECL(_kern_sched); +SYSCTL_DECL(_kern_sched_stats); SYSCTL_DECL(_sysctl); SYSCTL_DECL(_vm); SYSCTL_DECL(_vm_stats); diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 9f9dc5a74305..462c4603efa1 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -735,7 +735,7 @@ loop: thread_lock(&thread0); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IWAIT, NULL); } proc0_rescan = 0; thread_unlock(&thread0); diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c index c82de5a45bb6..9e1970ab9450 100644 --- a/sys/vm/vm_zeroidle.c +++ b/sys/vm/vm_zeroidle.c @@ -127,7 +127,7 @@ vm_pagezero(void __unused *arg) #ifndef PREEMPTION if (sched_runnable()) { thread_lock(curthread); - mi_switch(SW_VOL, NULL); + mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); } #endif -- cgit v1.2.3
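
For readers tracing how the pieces above fit together, here is a minimal kernel-side sketch, assuming a kernel built with "options SCHED_STATS". The counter example_migrated and the function example_yield() are hypothetical, shown only to illustrate the pattern; SCHED_STAT_DEFINE(), SCHED_STAT_INC(), sched_switch_stats[], SW_TYPE_MASK and the SW_*/SWT_* flags are the interfaces added or reworked by this commit.

/*
 * Hypothetical scheduler code using the SCHED_STATS wrappers added above.
 */
#include "opt_sched.h"                  /* defines SCHED_STATS when configured */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>                   /* SW_* flags and SWT_* switch types */
#include <sys/sched.h>                  /* SCHED_STAT_DEFINE(), SCHED_STAT_INC() */
#include <sys/sysctl.h>

/*
 * Declares "unsigned long example_migrated" and attaches a read-only
 * kern.sched.stats.example_migrated sysctl node to it; without SCHED_STATS
 * both macros expand to nothing.
 */
SCHED_STAT_DEFINE(example_migrated, "Threads migrated by example policy");

static void
example_yield(struct thread *td)
{
	SCHED_STAT_INC(example_migrated);       /* atomic_add_long(&ctr, 1) */

	/*
	 * The low byte of the mi_switch() flags argument carries the SWT_*
	 * switch type; mi_switch() itself does
	 *      SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
	 * so this voluntary switch is counted as kern.sched.stats.relinquish.
	 */
	thread_lock(td);
	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
}

Because every counter registered this way is reachable through its node's
oid_arg1 pointer, the reset handler reworked in kern_switch.c can zero all of
them, including counters defined in other files, simply by walking the
children of kern.sched.stats.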
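
From userland the new counters read like any other sysctl node. A small sketch, assuming a SCHED_STATS kernel and sufficient privilege for the reset; the node names kern.sched.stats.preempt and kern.sched.stats.reset come from the patch, the rest is ordinary sysctl(3) usage:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	long preempts;
	size_t len;
	int one;

	/* Each statistic is exported as a read-only long. */
	len = sizeof(preempts);
	if (sysctlbyname("kern.sched.stats.preempt", &preempts, &len,
	    NULL, 0) == -1)
		err(1, "kern.sched.stats.preempt");
	printf("preemptive switches: %ld\n", preempts);

	/* Writing 1 to the reset node zeroes every counter in the tree. */
	one = 1;
	if (sysctlbyname("kern.sched.stats.reset", NULL, NULL,
	    &one, sizeof(one)) == -1)
		err(1, "kern.sched.stats.reset");
	return (0);
}

The same can be done from the shell with "sysctl kern.sched.stats" to dump
the counters and "sysctl kern.sched.stats.reset=1" to clear them.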