Diffstat (limited to 'sys/kern/subr_epoch.c')
-rw-r--r--	sys/kern/subr_epoch.c	| 202
1 file changed, 159 insertions, 43 deletions
diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
index a63f669fea75..9104f1e0880a 100644
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -55,6 +55,27 @@ __FBSDID("$FreeBSD$");
 
 static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");
 
+#ifdef __amd64__
+#define EPOCH_ALIGN CACHE_LINE_SIZE*2
+#else
+#define EPOCH_ALIGN CACHE_LINE_SIZE
+#endif
+
+TAILQ_HEAD (epoch_tdlist, epoch_tracker);
+typedef struct epoch_record {
+	ck_epoch_record_t er_record;
+	volatile struct epoch_tdlist er_tdlist;
+	volatile uint32_t er_gen;
+	uint32_t er_cpuid;
+} __aligned(EPOCH_ALIGN) *epoch_record_t;
+
+struct epoch {
+	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
+	epoch_record_t e_pcpu_record;
+	int e_idx;
+	int e_flags;
+};
+
 /* arbitrary --- needs benchmarking */
 #define MAX_ADAPTIVE_SPIN 100
 #define MAX_EPOCHS 64
@@ -119,11 +140,15 @@ epoch_init(void *arg __unused)
 	epoch_call_count = counter_u64_alloc(M_WAITOK);
 	epoch_call_task_count = counter_u64_alloc(M_WAITOK);
 
-	pcpu_zone_record = uma_zcreate("epoch_record pcpu", sizeof(struct epoch_record),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
+	pcpu_zone_record = uma_zcreate("epoch_record pcpu",
+	    sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 	CPU_FOREACH(cpu) {
-		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0, epoch_call_task, NULL);
-		taskqgroup_attach_cpu(qgroup_softirq, DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, -1, "epoch call task");
+		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
+		    epoch_call_task, NULL);
+		taskqgroup_attach_cpu(qgroup_softirq,
+		    DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, -1,
+		    "epoch call task");
 	}
 	inited = 1;
 	global_epoch = epoch_alloc(0);
@@ -150,13 +175,21 @@ epoch_ctor(epoch_t epoch)
 	CPU_FOREACH(cpu) {
 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 		bzero(er, sizeof(*er));
-		ck_epoch_register(&epoch->e_epoch, &er->er_read_record, NULL);
-		ck_epoch_register(&epoch->e_epoch, &er->er_write_record, NULL);
+		ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
 		TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
 		er->er_cpuid = cpu;
 	}
 }
 
+static void
+epoch_adjust_prio(struct thread *td, u_char prio)
+{
+
+	thread_lock(td);
+	sched_prio(td, prio);
+	thread_unlock(td);
+}
+
 epoch_t
 epoch_alloc(int flags)
 {
@@ -192,51 +225,126 @@ epoch_free(epoch_t epoch)
 	free(epoch, M_EPOCH);
 }
 
+static epoch_record_t
+epoch_currecord(epoch_t epoch)
+{
+
+	return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu));
+}
+
+#define INIT_CHECK(epoch)					\
+	do {							\
+		if (__predict_false((epoch) == NULL))		\
+			return;					\
+	} while (0)
+
 void
-epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et)
+epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et)
 {
+	struct epoch_record *er;
+	struct thread *td;
+
+	MPASS(cold || epoch != NULL);
+	INIT_CHECK(epoch);
+	MPASS(epoch->e_flags & EPOCH_PREEMPT);
+#ifdef EPOCH_TRACKER_DEBUG
+	et->et_magic_pre = EPOCH_MAGIC0;
+	et->et_magic_post = EPOCH_MAGIC1;
+#endif
+	td = curthread;
+	et->et_td = td;
+	td->td_epochnest++;
+	critical_enter();
+	sched_pin();
-	epoch_enter_preempt(epoch, et);
+	td->td_pre_epoch_prio = td->td_priority;
+	er = epoch_currecord(epoch);
+	TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
+	ck_epoch_begin(&er->er_record, &et->et_section);
+	critical_exit();
 }
 
 void
-epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et)
+epoch_enter(epoch_t epoch)
 {
+	struct thread *td;
+	epoch_record_t er;
+
+	MPASS(cold || epoch != NULL);
+	INIT_CHECK(epoch);
+	td = curthread;
-	epoch_exit_preempt(epoch, et);
+	td->td_epochnest++;
+	critical_enter();
+	er = epoch_currecord(epoch);
+	ck_epoch_begin(&er->er_record, NULL);
 }
 
 void
-epoch_enter_KBI(epoch_t epoch)
+epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et)
 {
+	struct epoch_record *er;
+	struct thread *td;
-	epoch_enter(epoch);
+	INIT_CHECK(epoch);
+	td = curthread;
+	critical_enter();
+	sched_unpin();
+	MPASS(td->td_epochnest);
+	td->td_epochnest--;
+	er = epoch_currecord(epoch);
+	MPASS(epoch->e_flags & EPOCH_PREEMPT);
+	MPASS(et != NULL);
+	MPASS(et->et_td == td);
+#ifdef EPOCH_TRACKER_DEBUG
+	MPASS(et->et_magic_pre == EPOCH_MAGIC0);
+	MPASS(et->et_magic_post == EPOCH_MAGIC1);
+	et->et_magic_pre = 0;
+	et->et_magic_post = 0;
+#endif
+#ifdef INVARIANTS
+	et->et_td = (void*)0xDEADBEEF;
+#endif
+	ck_epoch_end(&er->er_record, &et->et_section);
+	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
+	er->er_gen++;
+	if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
+		epoch_adjust_prio(td, td->td_pre_epoch_prio);
+	critical_exit();
 }
 
 void
-epoch_exit_KBI(epoch_t epoch)
+epoch_exit(epoch_t epoch)
 {
+	struct thread *td;
+	epoch_record_t er;
-	epoch_exit(epoch);
+	INIT_CHECK(epoch);
+	td = curthread;
+	MPASS(td->td_epochnest);
+	td->td_epochnest--;
+	er = epoch_currecord(epoch);
+	ck_epoch_end(&er->er_record, NULL);
+	critical_exit();
 }
 
 /*
- * epoch_block_handler_preempt is a callback from the ck code when another thread is
- * currently in an epoch section.
+ * epoch_block_handler_preempt() is a callback from the CK code when another
+ * thread is currently in an epoch section.
  */
 static void
-epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
-    void *arg __unused)
+epoch_block_handler_preempt(struct ck_epoch *global __unused,
+    ck_epoch_record_t *cr, void *arg __unused)
 {
 	epoch_record_t record;
 	struct thread *td, *owner, *curwaittd;
-	struct epoch_thread *tdwait;
+	struct epoch_tracker *tdwait;
 	struct turnstile *ts;
 	struct lock_object *lock;
 	int spincount, gen;
 	int locksheld __unused;
 
-	record = __containerof(cr, struct epoch_record, er_read_record);
+	record = __containerof(cr, struct epoch_record, er_record);
 	td = curthread;
 	locksheld = td->td_locks;
 	spincount = 0;
@@ -318,25 +426,27 @@ epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t
 		if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
 		    ((ts = curwaittd->td_blocked) != NULL)) {
 			/*
-			 * We unlock td to allow turnstile_wait to reacquire the
-			 * the thread lock. Before unlocking it we enter a critical
-			 * section to prevent preemption after we reenable interrupts
-			 * by dropping the thread lock in order to prevent curwaittd
-			 * from getting to run.
+			 * We unlock td to allow turnstile_wait to reacquire
+			 * the thread lock. Before unlocking it we enter a
+			 * critical section to prevent preemption after we
+			 * reenable interrupts by dropping the thread lock in
+			 * order to prevent curwaittd from getting to run.
 			 */
 			critical_enter();
 			thread_unlock(td);
 			owner = turnstile_lock(ts, &lock);
 			/*
-			 * The owner pointer indicates that the lock succeeded. Only
-			 * in case we hold the lock and the turnstile we locked is still
-			 * the one that curwaittd is blocked on can we continue. Otherwise
-			 * The turnstile pointer has been changed out from underneath
-			 * us, as in the case where the lock holder has signalled curwaittd,
+			 * The owner pointer indicates that the lock succeeded.
+			 * Only in case we hold the lock and the turnstile we
+			 * locked is still the one that curwaittd is blocked on
+			 * can we continue. Otherwise the turnstile pointer has
+			 * been changed out from underneath us, as in the case
+			 * where the lock holder has signalled curwaittd,
 			 * and we need to continue.
 			 */
 			if (owner != NULL && ts == curwaittd->td_blocked) {
-				MPASS(TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd));
+				MPASS(TD_IS_INHIBITED(curwaittd) &&
+				    TD_ON_LOCK(curwaittd));
 				critical_exit();
 				turnstile_wait(ts, owner, curwaittd->td_tsqueue);
 				counter_u64_add(turnstile_count, 1);
@@ -386,9 +496,8 @@ epoch_wait_preempt(epoch_t epoch)
 	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "epoch_wait() can be long running");
-	KASSERT(!in_epoch(epoch),
-	    ("epoch_wait_preempt() called in the middle "
-	    "of an epoch section of the same epoch"));
+	KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
+	    "of an epoch section of the same epoch"));
 #endif
 	thread_lock(td);
 	DROP_GIANT();
@@ -401,7 +510,8 @@ epoch_wait_preempt(epoch_t epoch)
 	td->td_pinned = 0;
 	sched_bind(td, old_cpu);
 
-	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, NULL);
+	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
+	    NULL);
 
 	/* restore CPU binding, if any */
 	if (was_bound != 0) {
@@ -462,7 +572,7 @@ epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t
 	critical_enter();
 	*DPCPU_PTR(epoch_cb_count) += 1;
 	er = epoch_currecord(epoch);
-	ck_epoch_call(&er->er_write_record, cb, (ck_epoch_cb_t *)callback);
+	ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
 	critical_exit();
 	return;
 boottime:
@@ -486,7 +596,7 @@ epoch_call_task(void *arg __unused)
 		if (__predict_false((epoch = allepochs[i]) == NULL))
 			continue;
 		er = epoch_currecord(epoch);
-		record = &er->er_write_record;
+		record = &er->er_record;
 		if ((npending = record->n_pending) == 0)
 			continue;
 		ck_epoch_poll_deferred(record, &cb_stack);
@@ -502,7 +612,7 @@ epoch_call_task(void *arg __unused)
 	head = ck_stack_batch_pop_npsc(&cb_stack);
 	for (cursor = head; cursor != NULL; cursor = next) {
 		struct ck_epoch_entry *entry =
-			ck_epoch_entry_container(cursor);
+		    ck_epoch_entry_container(cursor);
 
 		next = CK_STACK_NEXT(cursor);
 		entry->function(entry);
@@ -512,7 +622,7 @@ epoch_call_task(void *arg __unused)
 int
 in_epoch_verbose(epoch_t epoch, int dump_onfail)
 {
-	struct epoch_thread *tdwait;
+	struct epoch_tracker *tdwait;
 	struct thread *td;
 	epoch_record_t er;
 
@@ -548,9 +658,15 @@ in_epoch(epoch_t epoch)
 }
 
 void
-epoch_adjust_prio(struct thread *td, u_char prio)
+epoch_thread_init(struct thread *td)
 {
-	thread_lock(td);
-	sched_prio(td, prio);
-	thread_unlock(td);
+
+	td->td_et = malloc(sizeof(struct epoch_tracker), M_EPOCH, M_WAITOK);
+}
+
+void
+epoch_thread_fini(struct thread *td)
+{
+
+	free(td->td_et, M_EPOCH);
 }
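
For readers unfamiliar with the interfaces being reshuffled above, here is a minimal sketch of an epoch(9) consumer as it looks after this change: a preemptible epoch is allocated with epoch_alloc(EPOCH_PREEMPT), readers bracket lookups with epoch_enter_preempt()/epoch_exit_preempt() and an on-stack struct epoch_tracker (which is what ends up on the per-CPU er_tdlist shown above), and writers defer frees through epoch_call(), which now queues onto the single er_record. The foo structure, list, and function names are hypothetical, and a real consumer would normally traverse the list with the ck_queue(3) macros; plain SLIST is used only to keep the sketch short.

/*
 * Illustrative epoch(9) consumer (hypothetical names throughout); only the
 * epoch_* calls correspond to interfaces in the diff above.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/queue.h>

static MALLOC_DEFINE(M_FOO, "foo", "epoch example");

struct foo {
	SLIST_ENTRY(foo) f_link;
	struct epoch_context f_ctx;	/* storage used by epoch_call() */
	int f_val;
};

static SLIST_HEAD(, foo) foo_head = SLIST_HEAD_INITIALIZER(foo_head);
static epoch_t foo_epoch;

static void
foo_init(void)
{

	/* EPOCH_PREEMPT sections require the on-stack tracker used below. */
	foo_epoch = epoch_alloc(EPOCH_PREEMPT);
}

static int
foo_lookup(int val)
{
	struct epoch_tracker et;
	struct foo *f;
	int found;

	found = 0;
	/* Reader: the tracker is linked onto this CPU's er_tdlist. */
	epoch_enter_preempt(foo_epoch, &et);
	SLIST_FOREACH(f, &foo_head, f_link) {
		if (f->f_val == val) {
			found = 1;
			break;
		}
	}
	epoch_exit_preempt(foo_epoch, &et);
	return (found);
}

static void
foo_free_cb(epoch_context_t ctx)
{
	struct foo *f;

	f = __containerof(ctx, struct foo, f_ctx);
	free(f, M_FOO);
}

static void
foo_remove(struct foo *f)
{

	/*
	 * Writer (list lock held by the caller): unlink, then defer the
	 * free until every reader currently in a section has left it.
	 */
	SLIST_REMOVE(&foo_head, f, foo, f_link);
	epoch_call(foo_epoch, &f->f_ctx, foo_free_cb);
}

Where the caller can sleep, epoch_wait_preempt(foo_epoch) could be used instead of epoch_call() to block until every in-flight section has drained before freeing in place.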
