From 90836c3270786135dcae04c9eff80cdef9fe602c Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 5 Feb 2017 03:26:34 +0000
Subject: mtx: switch to fcmpset

The found value is passed to locking routines in order to reduce
cacheline accesses.

mtx_unlock grows an explicit check for regular unlock. On ll/sc
architectures the routine can fail even if the lock could have been
handled by the inline primitive.

Discussed with:	jhb
Tested by:	pho (previous version)
---
 sys/kern/kern_mutex.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'sys/kern/kern_mutex.c')

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 35bbd5f78c3f..8e6133d6bfde 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -455,12 +455,11 @@ _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
  * sleep waiting for it), or if we need to recurse on it.
  */
 void
-__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
+__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v, uintptr_t tid, int opts,
     const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
-	uintptr_t v;
 #ifdef ADAPTIVE_MUTEXES
 	volatile struct thread *owner;
 #endif
@@ -489,7 +488,6 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 	lock_delay_arg_init(&lda, NULL);
 #endif
 	m = mtxlock2mtx(c);
-	v = MTX_READ_VALUE(m);
 
 	if (__predict_false(lv_mtx_owner(v) == (struct thread *)tid)) {
 		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
@@ -520,9 +518,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 
 	for (;;) {
 		if (v == MTX_UNOWNED) {
-			if (_mtx_obtain_lock(m, tid))
+			if (_mtx_obtain_lock_fetch(m, &v, tid))
 				break;
-			v = MTX_READ_VALUE(m);
 			continue;
 		}
 #ifdef KDTRACE_HOOKS
@@ -674,12 +671,11 @@ _mtx_lock_spin_failed(struct mtx *m)
  * is handled inline.
  */
 void
-_mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
-    const char *file, int line)
+_mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v, uintptr_t tid,
+    int opts, const char *file, int line)
 {
 	struct mtx *m;
 	struct lock_delay_arg lda;
-	uintptr_t v;
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
@@ -706,12 +702,10 @@ _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
 #ifdef KDTRACE_HOOKS
 	spin_time -= lockstat_nsecs(&m->lock_object);
 #endif
-	v = MTX_READ_VALUE(m);
 	for (;;) {
 		if (v == MTX_UNOWNED) {
-			if (_mtx_obtain_lock(m, tid))
+			if (_mtx_obtain_lock_fetch(m, &v, tid))
 				break;
-			v = MTX_READ_VALUE(m);
 			continue;
 		}
 		/* Give interrupts a chance while we spin. */
@@ -796,14 +790,11 @@ retry:
 		    m->lock_object.lo_name, file, line));
 		WITNESS_CHECKORDER(&m->lock_object,
 		    opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
-		v = MTX_READ_VALUE(m);
 		for (;;) {
-			if (v == MTX_UNOWNED) {
-				if (_mtx_obtain_lock(m, tid))
-					break;
-				v = MTX_READ_VALUE(m);
+			if (_mtx_obtain_lock_fetch(m, &v, tid))
+				break;
+			if (v == MTX_UNOWNED)
 				continue;
-			}
 			if (v == tid) {
 				m->mtx_recurse++;
 				break;
@@ -896,11 +887,18 @@ __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
+	uintptr_t v;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
+	v = MTX_READ_VALUE(m);
+
+	if (v == (uintptr_t)curthread) {
+		if (_mtx_release_lock(m, (uintptr_t)curthread))
+			return;
+	}
 
 	if (mtx_recursed(m)) {
 		if (--(m->mtx_recurse) == 0)
--
cgit v1.2.3
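The point of the switch above: a failed fcmpset hands the caller the value it
found in the lock word, so the retry loops no longer need a separate
MTX_READ_VALUE() to learn who owns the lock. Below is a minimal userland model
of that loop in C11; every toy_* name and TOY_MTX_UNOWNED is invented for the
sketch, and the kernel's real helper is _mtx_obtain_lock_fetch(), built on the
atomic_fcmpset family and defined outside this file.

/*
 * Minimal model of the fcmpset retry loop, in portable C11.  Sketch only;
 * not the kernel's code.
 */
#include <stdatomic.h>
#include <stdint.h>

#define	TOY_MTX_UNOWNED	((uintptr_t)0)	/* toy "unowned" marker */

struct toy_mtx {
	_Atomic uintptr_t	mtx_lock;
};

/*
 * Try to swing the lock word from *vp to tid.  On failure the observed
 * value is written back into *vp, so the caller learns the current owner
 * without issuing a second load of the contended cacheline.
 */
static int
toy_obtain_lock_fetch(struct toy_mtx *m, uintptr_t *vp, uintptr_t tid)
{

	return (atomic_compare_exchange_strong_explicit(&m->mtx_lock, vp,
	    tid, memory_order_acquire, memory_order_relaxed));
}

static void
toy_lock(struct toy_mtx *m, uintptr_t tid)
{
	uintptr_t v;

	v = TOY_MTX_UNOWNED;
	for (;;) {
		if (v == TOY_MTX_UNOWNED) {
			if (toy_obtain_lock_fetch(m, &v, tid))
				break;
			/* The failed CAS already refreshed v. */
			continue;
		}
		/* Owned by someone else: re-read and spin until it looks free. */
		v = atomic_load_explicit(&m->mtx_lock, memory_order_relaxed);
	}
}

static void
toy_unlock(struct toy_mtx *m)
{

	atomic_store_explicit(&m->mtx_lock, TOY_MTX_UNOWNED,
	    memory_order_release);
}

__mtx_lock_sleep() in the diff above has the same shape, except the waiting
step is the adaptive spin/turnstile machinery rather than a plain re-read.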
From 08da2677755f0874ea99c5a42181aacc3eb17690 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 5 Feb 2017 08:04:11 +0000
Subject: mtx: move lockstat handling out of inline primitives

Lockstat requires checking if it is enabled and if so, calling a 6 argument
function. Further, determining whether to call it on unlock requires
pre-reading the lock value.

This is problematic in at least 3 ways:
- more branches in the hot path than necessary
- additional cacheline ping pong under contention
- bigger code

Instead, check first if lockstat handling is necessary and if so, just fall
back to regular locking routines. For this purpose a new macro is introduced
(LOCKSTAT_PROFILE_ENABLED).

LOCK_PROFILING uninlines all primitives. Fold in the current inline lock
variant into the _mtx_lock_flags to retain the support. With this change
the inline variants are not used when LOCK_PROFILING is defined and thus
can ignore its existence.

This results in:
    text    data     bss      dec     hex filename
22259667 1303208 4994976 28557851 1b3c21b kernel.orig
21797315 1303208 4994976 28095499 1acb40b kernel.patched

i.e. about 3% reduction in text size.

A remaining action is to remove spurious arguments for internal kernel
consumers.
---
 sys/kern/kern_mutex.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'sys/kern/kern_mutex.c')

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 8e6133d6bfde..f49d87c81999 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -265,6 +265,7 @@ void
 __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
+	uintptr_t tid, v;
 
 	if (SCHEDULER_STOPPED())
 		return;
@@ -282,7 +283,13 @@ __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 	WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
 	    LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
 
-	__mtx_lock(m, curthread, opts, file, line);
+	tid = (uintptr_t)curthread;
+	v = MTX_UNOWNED;
+	if (!_mtx_obtain_lock_fetch(m, &v, tid))
+		_mtx_lock_sleep(m, v, tid, opts, file, line);
+	else
+		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire,
+		    m, 0, 0, file, line);
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
@@ -310,7 +317,7 @@ __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 	    line);
 	mtx_assert(m, MA_OWNED);
 
-	__mtx_unlock(m, curthread, opts, file, line);
+	__mtx_unlock_sleep(c, opts, file, line);
 	TD_LOCKS_DEC(curthread);
 }
 
@@ -887,20 +894,17 @@ __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
-	uintptr_t v;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
-	v = MTX_READ_VALUE(m);
 
-	if (v == (uintptr_t)curthread) {
+	if (!mtx_recursed(m)) {
+		LOCKSTAT_PROFILE_RELEASE_LOCK(adaptive__release, m);
 		if (_mtx_release_lock(m, (uintptr_t)curthread))
 			return;
-	}
-
-	if (mtx_recursed(m)) {
+	} else {
 		if (--(m->mtx_recurse) == 0)
 			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
 		if (LOCK_LOG_TEST(&m->lock_object, opts))
--
cgit v1.2.3
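The shape this gives the inline fast path can be sketched as follows. This is
only an illustration of the design described in the message above, not the
actual sys/sys/mutex.h text (which is outside this diff); the exact spelling of
the macro and of LOCKSTAT_PROFILE_ENABLED() is assumed. The idea is that the
inline body is reduced to one predicted-false test covering both "probe armed"
and "fcmpset failed", with everything else pushed into the outlined slow path.

/*
 * Hypothetical sketch of the inline acquire after this change; treat the
 * spelling as an assumption, the structure as the point.
 */
#define	__mtx_lock_sketch(m, tid, opts, file, line) do {		\
	uintptr_t _tid = (uintptr_t)(tid);				\
	uintptr_t _v = MTX_UNOWNED;					\
									\
	/* One predicted-false branch: if the lockstat probe is armed,	\
	 * or the uncontended fcmpset fails, fall back to the outlined	\
	 * slow path, which handles lockstat itself. */			\
	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(adaptive__acquire) || \
	    !_mtx_obtain_lock_fetch((m), &_v, _tid)))			\
		_mtx_lock_sleep((m), _v, _tid, (opts), (file), (line));	\
} while (0)

The unlock half shown in the diff mirrors this: __mtx_unlock_flags() now goes
straight to __mtx_unlock_sleep(), which fires the adaptive__release probe and
drops the lock itself.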
From cae4ab7f3718736b51c048440a5eadd7d7251623 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 5 Feb 2017 09:35:17 +0000
Subject: mtx: fix up _mtx_obtain_lock_fetch usage in thread lock

Since _mtx_obtain_lock_fetch no longer sets the argument to MTX_UNOWNED,
callers have to do it on their own.
---
 sys/kern/kern_mutex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'sys/kern/kern_mutex.c')

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index f49d87c81999..76a009d0cfca 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -782,6 +782,7 @@ thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
 #ifdef KDTRACE_HOOKS
 	spin_time -= lockstat_nsecs(&td->td_lock->lock_object);
 #endif
+	v = MTX_UNOWNED;
 	for (;;) {
 retry:
 		spinlock_enter();
--
cgit v1.2.3

From dc0896512cd91cb390cc35ba96e9285cd04383da Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 5 Feb 2017 09:53:13 +0000
Subject: mtx: fixup r313278, the assignemnt was supposed to go inside the loop
---
 sys/kern/kern_mutex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'sys/kern/kern_mutex.c')

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 76a009d0cfca..83977bb6425e 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -782,9 +782,9 @@ thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
 #ifdef KDTRACE_HOOKS
 	spin_time -= lockstat_nsecs(&td->td_lock->lock_object);
 #endif
-	v = MTX_UNOWNED;
 	for (;;) {
 retry:
+		v = MTX_UNOWNED;
 		spinlock_enter();
 		m = td->td_lock;
 		KASSERT(m->mtx_lock != MTX_DESTROYED,
--
cgit v1.2.3
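Taken together, the last two patches settle the caller-side contract for the
fetch-style helper: it no longer seeds the expected value, so the caller sets
it to MTX_UNOWNED itself, and in thread_lock_flags_() that has to happen on
every pass of the retry loop because the lock being taken (td->td_lock) is
re-read on each pass. Reusing the invented toy_* names from the sketch after
the first patch, the contract looks roughly like this:

/*
 * Caller-side contract, modelled with the toy_* helpers from the earlier
 * sketch (still a sketch, not kernel code): the caller seeds the expected
 * value and re-seeds it on every fresh attempt.
 */
static void
toy_lock_retry(struct toy_mtx *m, uintptr_t tid)
{
	uintptr_t v;

	for (;;) {
		/* Seed inside the loop so each pass starts from the
		 * "unowned" guess; toy_obtain_lock_fetch() will not
		 * reset it for us. */
		v = TOY_MTX_UNOWNED;
		if (toy_obtain_lock_fetch(m, &v, tid))
			break;
		/* v now holds whatever owner was observed. */
	}
}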