Diffstat (limited to 'contrib/libcxxrt')
-rw-r--r--	contrib/libcxxrt/atomic.h	| 110
-rw-r--r--	contrib/libcxxrt/exception.cc	| 21
-rw-r--r--	contrib/libcxxrt/guard.cc	| 368
-rw-r--r--	contrib/libcxxrt/memory.cc	| 7
4 files changed, 132 insertions, 374 deletions
diff --git a/contrib/libcxxrt/atomic.h b/contrib/libcxxrt/atomic.h
index 701d05337cf1..131ca9f57798 100644
--- a/contrib/libcxxrt/atomic.h
+++ b/contrib/libcxxrt/atomic.h
@@ -1,102 +1,30 @@
 #ifndef __has_builtin
-#	define __has_builtin(x) 0
+#define __has_builtin(x) 0
 #endif
 #ifndef __has_feature
-#	define __has_feature(x) 0
-#endif
-#ifndef __has_extension
-#	define __has_extension(x) 0
-#endif
-
-#if !__has_extension(c_atomic)
-#	define _Atomic(T) T
+#define __has_feature(x) 0
 #endif
+/**
+ * Swap macro that enforces a happens-before relationship with a corresponding
+ * ATOMIC_LOAD.
+ */
 #if __has_builtin(__c11_atomic_exchange)
-#	define ATOMIC_BUILTIN(name) __c11_atomic_##name
+#define ATOMIC_SWAP(addr, val)\
+	__c11_atomic_exchange(reinterpret_cast<_Atomic(__typeof__(val))*>(addr), val, __ATOMIC_ACQ_REL)
+#elif __has_builtin(__sync_swap)
+#define ATOMIC_SWAP(addr, val)\
+	__sync_swap(addr, val)
 #else
-#	define ATOMIC_BUILTIN(name) __atomic_##name##_n
+#define ATOMIC_SWAP(addr, val)\
+	__sync_lock_test_and_set(addr, val)
 #endif
-namespace
-{
-	/**
-	 * C++11 memory orders. We only need a subset of them.
-	 */
-	enum memory_order
-	{
-		/**
-		 * Acquire order.
-		 */
-		acquire = __ATOMIC_ACQUIRE,
-
-		/**
-		 * Release order.
-		 */
-		release = __ATOMIC_RELEASE,
-
-		/**
-		 * Sequentially consistent memory ordering.
-		 */
-		seqcst = __ATOMIC_SEQ_CST
-	};
-
-	/**
-	 * Atomic, implements a subset of `std::atomic`.
-	 */
-	template<typename T>
-	class atomic
-	{
-		/**
-		 * The underlying value. Use C11 atomic qualification if available.
-		 */
-		_Atomic(T) val;
-
-	public:
-		/**
-		 * Constructor, takes a value.
-		 */
-		atomic(T init) : val(init) {}
-
-		/**
-		 * Atomically load with the specified memory order.
-		 */
-		T load(memory_order order = memory_order::seqcst)
-		{
-			return ATOMIC_BUILTIN(load)(&val, order);
-		}
-
-		/**
-		 * Atomically store with the specified memory order.
-		 */
-		void store(T v, memory_order order = memory_order::seqcst)
-		{
-			return ATOMIC_BUILTIN(store)(&val, v, order);
-		}
-
-		/**
-		 * Atomically exchange with the specified memory order.
-		 */
-		T exchange(T v, memory_order order = memory_order::seqcst)
-		{
-			return ATOMIC_BUILTIN(exchange)(&val, v, order);
-		}
-
-		/**
-		 * Atomically exchange with the specified memory order.
-		 */
-		bool compare_exchange(T & expected,
-		                      T desired,
-		                      memory_order order = memory_order::seqcst)
-		{
-#if __has_builtin(__c11_atomic_compare_exchange_strong)
-			return __c11_atomic_compare_exchange_strong(
-			    &val, &expected, desired, order, order);
+#if __has_builtin(__c11_atomic_load)
+#define ATOMIC_LOAD(addr)\
+	__c11_atomic_load(reinterpret_cast<_Atomic(__typeof__(*addr))*>(addr), __ATOMIC_ACQUIRE)
 #else
-			return __atomic_compare_exchange_n(
-			    &val, &expected, desired, true, order, order);
+#define ATOMIC_LOAD(addr)\
+	(__sync_synchronize(), *addr)
 #endif
-		}
-	};
-} // namespace
-#undef ATOMIC_BUILTIN
+
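The restored header replaces the C++ atomic<T> wrapper with two macros: ATOMIC_SWAP, an acquire-release exchange, and ATOMIC_LOAD, an acquire load (or a full barrier plus a plain load on toolchains without __c11_atomic_load). As a rough sketch of the publish/observe pairing these macros provide, here is a self-contained equivalent built on the portable __atomic builtins; the names swap_handler/load_handler and the demo main are illustrative assumptions, not part of the patch.

// atomic_macros_sketch.cc: the pairing ATOMIC_SWAP/ATOMIC_LOAD is built
// for, rebuilt on GCC/Clang __atomic builtins for illustration only.
#include <cstdio>

typedef void (*handler_t)();

static handler_t g_handler = nullptr;

// Install a new handler and return the old one.  The acquire-release
// exchange mirrors ATOMIC_SWAP's __ATOMIC_ACQ_REL ordering.
static handler_t swap_handler(handler_t h)
{
    return __atomic_exchange_n(&g_handler, h, __ATOMIC_ACQ_REL);
}

// Read the current handler.  The acquire load mirrors ATOMIC_LOAD: a
// thread that sees the new pointer also sees writes made before the swap.
static handler_t load_handler()
{
    return __atomic_load_n(&g_handler, __ATOMIC_ACQUIRE);
}

static void greet() { std::puts("hello from the handler"); }

int main()
{
    swap_handler(greet);
    if (handler_t h = load_handler())
        h();
}

The acquire/release pairing is the happens-before relationship the header's comment refers to: whatever the publishing thread wrote before the swap is visible to any thread whose load observes the swapped-in pointer.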
diff --git a/contrib/libcxxrt/exception.cc b/contrib/libcxxrt/exception.cc
index 2f1dc4030ba4..0fb26ddb4ed2 100644
--- a/contrib/libcxxrt/exception.cc
+++ b/contrib/libcxxrt/exception.cc
@@ -1,6 +1,5 @@
 /*
  * Copyright 2010-2011 PathScale, Inc. All rights reserved.
- * Copyright 2021 David Chisnall. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -290,9 +289,9 @@ using namespace ABI_NAMESPACE;
 /** The global termination handler. */
-static atomic<terminate_handler> terminateHandler = abort;
+static terminate_handler terminateHandler = abort;
 /** The global unexpected exception handler. */
-static atomic<unexpected_handler> unexpectedHandler = std::terminate;
+static unexpected_handler unexpectedHandler = std::terminate;
 /** Key used for thread-local data. */
 static pthread_key_t eh_key;
@@ -745,12 +744,12 @@ static void throw_exception(__cxa_exception *ex)
 	ex->unexpectedHandler = info->unexpectedHandler;
 	if (0 == ex->unexpectedHandler)
 	{
-		ex->unexpectedHandler = unexpectedHandler.load();
+		ex->unexpectedHandler = unexpectedHandler;
 	}
 	ex->terminateHandler = info->terminateHandler;
 	if (0 == ex->terminateHandler)
 	{
-		ex->terminateHandler = terminateHandler.load();
+		ex->terminateHandler = terminateHandler;
 	}
 	info->globals.uncaughtExceptions++;
@@ -1450,7 +1449,7 @@ namespace std
 	{
 		if (thread_local_handlers) { return pathscale::set_unexpected(f); }
-		return unexpectedHandler.exchange(f);
+		return ATOMIC_SWAP(&unexpectedHandler, f);
 	}
 	/**
 	 * Sets the function that is called to terminate the program.
	 */
@@ -1459,7 +1458,7 @@ namespace std
 	{
 		if (thread_local_handlers) { return pathscale::set_terminate(f); }
-		return terminateHandler.exchange(f);
+		return ATOMIC_SWAP(&terminateHandler, f);
 	}
 	/**
 	 * Terminates the program, calling a custom terminate implementation if
	 * required.
	 */
@@ -1475,7 +1474,7 @@ namespace std
 			// return.
 			abort();
 		}
-		terminateHandler.load()();
+		terminateHandler();
 	}
 	/**
 	 * Called when an unexpected exception is encountered (i.e. an exception
	 * violates an exception specification).
	 */
@@ -1492,7 +1491,7 @@ namespace std
 			// return.
 			abort();
 		}
-		unexpectedHandler.load()();
+		unexpectedHandler();
 	}
 	/**
 	 * Returns whether there are any exceptions currently being thrown that
	 * have not been caught.
	 */
@@ -1522,7 +1521,7 @@ namespace std
 		{
 			return info->unexpectedHandler;
 		}
-		return unexpectedHandler.load();
+		return ATOMIC_LOAD(&unexpectedHandler);
 	}
 	/**
 	 * Returns the current terminate handler.
	 */
@@ -1534,7 +1533,7 @@ namespace std
 		{
 			return info->terminateHandler;
 		}
-		return terminateHandler.load();
+		return ATOMIC_LOAD(&terminateHandler);
 	}
 }
 #if defined(__arm__) && !defined(__ARM_DWARF_EH__)
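With the atomic<> wrapper gone, the handler globals return to plain function pointers swapped and read through the macros above. The observable contract is unchanged and is the one the standard documents: std::set_terminate installs a handler and returns the previous one, and std::terminate invokes whatever is currently installed. A small hypothetical demo of that contract (not part of the patch):

// set_terminate_demo.cc: the contract exception.cc must keep regardless
// of how the handler global is stored internally.
#include <cstdio>
#include <cstdlib>
#include <exception>

static void custom_terminate()
{
    std::puts("custom terminate handler ran");
    std::abort();  // a terminate handler must not return
}

int main()
{
    // set_terminate swaps in the new handler and hands back the old one.
    std::terminate_handler prev = std::set_terminate(custom_terminate);
    std::printf("previous handler: %p\n", reinterpret_cast<void *>(prev));
    throw 42;  // no matching catch: std::terminate -> custom_terminate
}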
diff --git a/contrib/libcxxrt/guard.cc b/contrib/libcxxrt/guard.cc
index 515992563a10..34d294cf7432 100644
--- a/contrib/libcxxrt/guard.cc
+++ b/contrib/libcxxrt/guard.cc
@@ -1,6 +1,5 @@
-/*
+/* 
  * Copyright 2010-2012 PathScale, Inc. All rights reserved.
- * Copyright 2021 David Chisnall. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -11,7 +10,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  *    this list of conditions and the following disclaimer in the documentation
  *    and/or other materials provided with the distribution.
- *
+ * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
@@ -39,305 +38,126 @@
  * value as a low-overhead lock. Because statics (in most sane code) are
  * accessed far more times than they are initialised, this lock implementation
  * is heavily optimised towards the case where the static has already been
- * initialised.
+ * initialised. 
  */
-#include "atomic.h"
-#include <assert.h>
-#include <pthread.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <assert.h>
+#include "atomic.h"
 
 // Older GCC doesn't define __LITTLE_ENDIAN__
 #ifndef __LITTLE_ENDIAN__
-// If __BYTE_ORDER__ is defined, use that instead
+	// If __BYTE_ORDER__ is defined, use that instead
 #	ifdef __BYTE_ORDER__
 #		if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 #			define __LITTLE_ENDIAN__
 #		endif
-// x86 and ARM are the most common little-endian CPUs, so let's have a
-// special case for them (ARM is already special cased). Assume everything
-// else is big endian.
+	// x86 and ARM are the most common little-endian CPUs, so let's have a
+	// special case for them (ARM is already special cased). Assume everything
+	// else is big endian.
 #	elif defined(__x86_64) || defined(__i386)
 #		define __LITTLE_ENDIAN__
 #	endif
 #endif
+
 /*
- * The Itanium C++ ABI defines guard words that are 64-bit (32-bit on AArch32)
- * values with one bit defined to indicate that the guarded variable is and
- * another bit to indicate that it's currently locked (initialisation in
- * progress). The bit to use depends on the byte order of the target.
- *
- * On many 32-bit platforms, 64-bit atomics are unavailable (or slow) and so we
- * treat the two halves of the 64-bit word as independent values and
+ * The least significant bit of the guard variable indicates that the object
+ * has been initialised, the most significant bit is used for a spinlock.
  */
-namespace
-{
-	/**
-	 * The state of the guard variable when an attempt is made to lock it.
-	 */
-	enum class GuardState
-	{
-		/**
-		 * The lock is not held but is not needed because initialisation is
-		 * one.
-		 */
-		InitDone,
-
-		/**
-		 * Initialisation is not done but the lock is held by the caller.
-		 */
-		InitLockSucceeded,
-
-		/**
-		 * Attempting to acquire the lock failed.
-		 */
-		InitLockFailed
-	};
-
-	/**
-	 * Class encapsulating a single atomic word being used to represent the
-	 * guard. The word size is defined by the type of `GuardWord`. The bit
-	 * used to indicate the locked state is `1<<LockedBit`, the bit used to
-	 * indicate the initialised state is `1<<InitBit`.
-	 */
-	template<typename GuardWord, int LockedBit, int InitBit>
-	struct SingleWordGuard
-	{
-		/**
-		 * The value indicating that the lock bit is set (and no other bits).
-		 */
-		static constexpr GuardWord locked = static_cast<GuardWord>(1)
-		                                    << LockedBit;
-
-		/**
-		 * The value indicating that the initialised bit is set (and all other
-		 * bits are zero).
-		 */
-		static constexpr GuardWord initialised = static_cast<GuardWord>(1)
-		                                         << InitBit;
-
-		/**
-		 * The guard variable.
-		 */
-		atomic<GuardWord> val;
-
-	public:
-		/**
-		 * Release the lock and set the initialised state. In the single-word
-		 * implementation here, these are both done by a single store.
-		 */
-		void unlock(bool isInitialised)
-		{
-			val.store(isInitialised ? initialised : 0, memory_order::release);
-#ifndef NDEBUG
-			GuardWord init_state = initialised;
-			assert(*reinterpret_cast<uint8_t*>(&init_state) != 0);
-#endif
-		}
-
-		/**
-		 * Try to acquire the lock. This has a tri-state return, indicating
-		 * either that the lock was acquired, it wasn't acquired because it was
-		 * contended, or it wasn't acquired because the guarded variable is
-		 * already initialised.
-		 */
-		GuardState try_lock()
-		{
-			GuardWord old = 0;
-			// Try to acquire the lock, assuming that we are in the state where
-			// the lock is not held and the variable is not initialised (so the
-			// expected value is 0).
-			if (val.compare_exchange(old, locked))
-			{
-				return GuardState::InitLockSucceeded;
-			}
-			// If the CAS failed and the old value indicates that this is
-			// initialised, return that initialisation is done and skip further
-			// retries.
-			if (old == initialised)
-			{
-				return GuardState::InitDone;
-			}
-			// Otherwise, report failure.
-			return GuardState::InitLockFailed;
-		}
-
-		/**
-		 * Check whether the guard indicates that the variable is initialised.
-		 */
-		bool is_initialised()
-		{
-			return (val.load(memory_order::acquire) & initialised) ==
-			       initialised;
-		}
-	};
-
-	/**
-	 * Class encapsulating using two 32-bit atomic values to represent a 64-bit
-	 * guard variable.
-	 */
-	template<int LockedBit, int InitBit>
-	class DoubleWordGuard
-	{
-		/**
-		 * The value of `lock_word` when the lock is held.
-		 */
-		static constexpr uint32_t locked = static_cast<uint32_t>(1)
-		                                   << LockedBit;
-
-		/**
-		 * The value of `init_word` when the guarded variable is initialised.
-		 */
-		static constexpr uint32_t initialised = static_cast<uint32_t>(1)
-		                                        << InitBit;
-
-		/**
-		 * The word used for the initialised flag. This is always the first
-		 * word irrespective of endian because the generated code compares the
-		 * first byte in memory against 0.
-		 */
-		atomic<uint32_t> init_word;
-
-		/**
-		 * The word used for the lock.
-		 */
-		atomic<uint32_t> lock_word;
-
-	public:
-		/**
-		 * Try to acquire the lock. This has a tri-state return, indicating
-		 * either that the lock was acquired, it wasn't acquired because it was
-		 * contended, or it wasn't acquired because the guarded variable is
-		 * already initialised.
-		 */
-		GuardState try_lock()
-		{
-			uint32_t old = 0;
-			// Try to acquire the lock
-			if (lock_word.compare_exchange(old, locked))
-			{
-				// If we succeeded, check if initialisation has happened. In
-				// this version, we don't have atomic manipulation of both the
-				// lock and initialised bits together. Instead, we have an
-				// ordering rule that the initialised bit is only ever updated
-				// with the lock held.
-				if (is_initialised())
-				{
-					// If another thread did manage to initialise this, release
-					// the lock and notify the caller that initialisation is
-					// done.
-					lock_word.store(initialised, memory_order::release);
-					return GuardState::InitDone;
-				}
-				return GuardState::InitLockSucceeded;
-			}
-			return GuardState::InitLockFailed;
-		}
-
-		/**
-		 * Set the initialised state and release the lock. In this
-		 * implementation, this is ordered, not atomic: the initialise bit is
-		 * set while the lock is held.
-		 */
-		void unlock(bool isInitialised)
-		{
-			init_word.store(isInitialised ? initialised : 0,
-			                memory_order::release);
-			lock_word.store(0, memory_order::release);
-			assert((*reinterpret_cast<uint8_t*>(this) != 0) == isInitialised);
-		}
-
-		/**
-		 * Return whether the guarded variable is initialised.
-		 */
-		bool is_initialised()
-		{
-			return (init_word.load(memory_order::acquire) & initialised) ==
-			       initialised;
-		}
-	};
-
-	// Check that the two implementations are the correct size.
-	static_assert(sizeof(SingleWordGuard<uint32_t, 31, 0>) == sizeof(uint32_t),
-	              "Single-word 32-bit guard must be 32 bits");
-	static_assert(sizeof(SingleWordGuard<uint64_t, 63, 0>) == sizeof(uint64_t),
-	              "Single-word 64-bit guard must be 64 bits");
-	static_assert(sizeof(DoubleWordGuard<31, 0>) == sizeof(uint64_t),
-	              "Double-word guard must be 64 bits");
-
 #ifdef __arm__
-	/**
-	 * The Arm PCS defines a variant of the Itanium ABI with 32-bit lock words.
-	 */
-	using Guard = SingleWordGuard<uint32_t, 31, 0>;
+// ARM ABI - 32-bit guards.
+typedef uint32_t guard_t;
+typedef uint32_t guard_lock_t;
+static const uint32_t LOCKED = static_cast<guard_t>(1) << 31;
+static const uint32_t INITIALISED = 1;
+#define LOCK_PART(guard) (guard)
+#define INIT_PART(guard) (guard)
 #elif defined(_LP64)
+typedef uint64_t guard_t;
+typedef uint64_t guard_lock_t;
 #	if defined(__LITTLE_ENDIAN__)
-	/**
-	 * On little-endian 64-bit platforms the guard word is a single 64-bit
-	 * atomic with the lock in the high bit and the initialised flag in the low
-	 * bit.
-	 */
-	using Guard = SingleWordGuard<uint64_t, 63, 0>;
+static const guard_t LOCKED = static_cast<guard_t>(1) << 63;
+static const guard_t INITIALISED = 1;
 #	else
-	/**
-	 * On bit-endian 64-bit platforms, the guard word is a single 64-bit atomic
-	 * with the lock in the low bit and the initialised bit in the highest
-	 * byte.
-	 */
-	using Guard = SingleWordGuard<uint64_t, 0, 56>;
+static const guard_t LOCKED = 1;
+static const guard_t INITIALISED = static_cast<guard_t>(1) << 56;
 #	endif
+#define LOCK_PART(guard) (guard)
+#define INIT_PART(guard) (guard)
 #else
+typedef uint32_t guard_lock_t;
 #	if defined(__LITTLE_ENDIAN__)
-	/**
-	 * 32-bit platforms use the same layout as 64-bit.
-	 */
-	using Guard = DoubleWordGuard<31, 0>;
+typedef struct {
+	uint32_t init_half;
+	uint32_t lock_half;
+} guard_t;
+static const uint32_t LOCKED = static_cast<guard_lock_t>(1) << 31;
+static const uint32_t INITIALISED = 1;
 #	else
-	/**
-	 * 32-bit platforms use the same layout as 64-bit.
-	 */
-	using Guard = DoubleWordGuard<0, 24>;
+typedef struct {
+	uint32_t init_half;
+	uint32_t lock_half;
+} guard_t;
+static_assert(sizeof(guard_t) == sizeof(uint64_t), "");
+static const uint32_t LOCKED = 1;
+static const uint32_t INITIALISED = static_cast<guard_lock_t>(1) << 24;
 #	endif
+#define LOCK_PART(guard) (&(guard)->lock_half)
+#define INIT_PART(guard) (&(guard)->init_half)
 #endif
-
-} // namespace
+static const guard_lock_t INITIAL = 0;
 
 /**
  * Acquires a lock on a guard, returning 0 if the object has already been
  * initialised, and 1 if it has not. If the object is already constructed then
  * this function just needs to read a byte from memory and return.
  */
-extern "C" int __cxa_guard_acquire(Guard *guard_object)
+extern "C" int __cxa_guard_acquire(volatile guard_t *guard_object)
 {
-	// Check if this is already initialised. If so, we don't have to do
-	// anything.
-	if (guard_object->is_initialised())
-	{
+	guard_lock_t old;
+	// Not an atomic read, doesn't establish a happens-before relationship, but
+	// if one is already established and we end up seeing an initialised state
+	// then it's a fast path, otherwise we'll do something more expensive than
+	// this test anyway...
+	if (INITIALISED == *INIT_PART(guard_object))
 		return 0;
-	}
-	// Spin trying to acquire the lock. If we fail to acquire the lock the
-	// first time then another thread will *probably* initialise it, but if the
-	// constructor throws an exception then we may have to try again in this
-	// thread.
+	// Spin trying to do the initialisation
	for (;;)
	{
-		// Try to acquire the lock.
-		switch (guard_object->try_lock())
-		{
-			// If we failed to acquire the lock but another thread has
-			// initialised the lock while we were waiting, return immediately
-			// indicating that initialisation is not required.
-			case GuardState::InitDone:
-				return 0;
-			// If we acquired the lock, return immediately to start
-			// initialisation.
-			case GuardState::InitLockSucceeded:
+		// Loop trying to move the value of the guard from 0 (not
+		// locked, not initialised) to the locked-uninitialised
+		// position.
+		old = __sync_val_compare_and_swap(LOCK_PART(guard_object),
+		    INITIAL, LOCKED);
+		if (old == INITIAL) {
+			// Lock obtained. If lock and init bit are
+			// in separate words, check for init race.
+			if (INIT_PART(guard_object) == LOCK_PART(guard_object))
+				return 1;
+			if (INITIALISED != *INIT_PART(guard_object))
 				return 1;
-			// If we didn't acquire the lock, pause and retry.
-			case GuardState::InitLockFailed:
-				break;
+
+			// No need for a memory barrier here,
+			// see first comment.
+			*LOCK_PART(guard_object) = INITIAL;
+			return 0;
 		}
+		// If lock and init bit are in the same word, check again
+		// if we are done.
+		if (INIT_PART(guard_object) == LOCK_PART(guard_object) &&
+		    old == INITIALISED)
+			return 0;
+
+		assert(old == LOCKED);
+		// Another thread holds the lock.
+		// If lock and init bit are in different words, check
+		// if we are done before yielding and looping.
+		if (INIT_PART(guard_object) != LOCK_PART(guard_object) &&
+		    INITIALISED == *INIT_PART(guard_object))
+			return 0;
		sched_yield();
	}
 }
@@ -346,16 +166,28 @@ extern "C" int __cxa_guard_acquire(Guard *guard_object)
 /**
  * Releases the lock without marking the object as initialised. This function
  * is called if initialising a static causes an exception to be thrown.
  */
-extern "C" void __cxa_guard_abort(Guard *guard_object)
+extern "C" void __cxa_guard_abort(volatile guard_t *guard_object)
 {
-	guard_object->unlock(false);
+	__attribute__((unused))
+	bool reset = __sync_bool_compare_and_swap(LOCK_PART(guard_object),
+	    LOCKED, INITIAL);
+	assert(reset);
 }
-
 /**
  * Releases the guard and marks the object as initialised. This function is
  * called after successful initialisation of a static.
  */
-extern "C" void __cxa_guard_release(Guard *guard_object)
+extern "C" void __cxa_guard_release(volatile guard_t *guard_object)
 {
-	guard_object->unlock(true);
+	guard_lock_t old;
+	if (INIT_PART(guard_object) == LOCK_PART(guard_object))
+		old = LOCKED;
+	else
+		old = INITIAL;
+	__attribute__((unused))
+	bool reset = __sync_bool_compare_and_swap(INIT_PART(guard_object),
+	    old, INITIALISED);
+	assert(reset);
+	if (INIT_PART(guard_object) != LOCK_PART(guard_object))
+		*LOCK_PART(guard_object) = INITIAL;
 }
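The rewritten guard.cc drops the SingleWordGuard/DoubleWordGuard templates in favour of LOCK_PART/INIT_PART macros over a plain guard_t, but the protocol the compiler depends on is unchanged. For context, this is roughly how a compiler lowers a function-local static onto these entry points; the Widget type, the explicit uint64_t guard, and the manual calls are illustrative assumptions (real compilers emit this inline, and the guard's width varies by ABI, e.g. 32-bit on ARM):

// guard_lowering_sketch.cc: an approximation of what `static Widget w;`
// inside a function lowers to under the Itanium C++ ABI.
#include <cstdint>
#include <new>

extern "C" int __cxa_guard_acquire(uint64_t *guard);
extern "C" void __cxa_guard_release(uint64_t *guard);
extern "C" void __cxa_guard_abort(uint64_t *guard);

struct Widget
{
    Widget() {}  // may throw in general; hence the catch below
};

Widget &instance()
{
    static uint64_t guard;  // zero-initialised guard word
    alignas(Widget) static unsigned char storage[sizeof(Widget)];

    // Fast path: a non-zero initialised byte means construction is done,
    // so no runtime call is needed on the overwhelmingly common path.
    if (*reinterpret_cast<unsigned char *>(&guard) == 0)
    {
        // Slow path: returns 1 if this thread must run the constructor.
        if (__cxa_guard_acquire(&guard))
        {
            try
            {
                new (storage) Widget();       // construct exactly once
                __cxa_guard_release(&guard);  // mark initialised, unlock
            }
            catch (...)
            {
                __cxa_guard_abort(&guard);    // unlock, stay uninitialised
                throw;                        // a later call retries
            }
        }
    }
    return *reinterpret_cast<Widget *>(storage);
}

int main()
{
    instance();  // first call constructs
    instance();  // later calls take the fast path
}

The byte test before __cxa_guard_acquire is the "read a byte from memory and return" fast path that the file's opening comment says the whole implementation is optimised around.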
diff --git a/contrib/libcxxrt/memory.cc b/contrib/libcxxrt/memory.cc
index 7beb048ae914..6dd43a5b897e 100644
--- a/contrib/libcxxrt/memory.cc
+++ b/contrib/libcxxrt/memory.cc
@@ -51,7 +51,7 @@ typedef void (*new_handler)();
  * The function to call when allocation fails. By default, there is no
  * handler and a bad allocation exception is thrown if an allocation fails.
  */
-static atomic<new_handler> new_handl{nullptr};
+static new_handler new_handl;
 
 namespace std
 {
@@ -61,13 +61,12 @@ namespace std
 	__attribute__((weak))
 	new_handler set_new_handler(new_handler handler)
 	{
-		return new_handl.exchange(handler);
+		return ATOMIC_SWAP(&new_handl, handler);
 	}
-
 	__attribute__((weak))
 	new_handler get_new_handler(void)
 	{
-		return new_handl.load();
+		return ATOMIC_LOAD(&new_handl);
 	}
 }
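memory.cc gets the same treatment: new_handl becomes a plain pointer accessed via ATOMIC_SWAP/ATOMIC_LOAD. What callers observe is the standard std::set_new_handler/std::get_new_handler contract, sketched below; the on_oom handler and the deliberately oversized allocation are our own illustration, not part of the patch.

// new_handler_demo.cc: the get/set contract memory.cc implements.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>

static void on_oom()
{
    // A new-handler must free memory, throw bad_alloc, or not return;
    // this one simply gives up.
    std::puts("allocation failed; aborting");
    std::abort();
}

int main()
{
    // set_new_handler atomically installs on_oom and returns the old
    // handler (null by default).
    std::new_handler prev = std::set_new_handler(on_oom);
    std::printf("previous handler: %p\n", reinterpret_cast<void *>(prev));

    // An absurd request: with a handler installed, operator new calls
    // on_oom on failure instead of throwing bad_alloc.
    char *p = new char[static_cast<std::size_t>(-1) / 2];
    (void)p;
}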