57 files changed, 572 insertions, 523 deletions
diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index e603dfa05356..f9e1bc805092 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -20,13 +20,18 @@ Here is the specification for this library:
 
 http://gcc.gnu.org/onlinedocs/gccint/Libgcc.html#Libgcc
 
+Please note that the libgcc specification states the actual types of arguments
+and return values in terms of machine modes.  Where it names particular C
+types such as "int", "unsigned", "long long", etc., these may be given merely
+as examples.
+
 Here is a synopsis of the contents of this library:
 
-typedef      int si_int;
-typedef unsigned su_int;
+typedef  int32_t si_int;
+typedef uint32_t su_int;
 
-typedef          long long di_int;
-typedef unsigned long long du_int;
+typedef  int64_t di_int;
+typedef uint64_t du_int;
 
 // Integral bit manipulation
 
@@ -38,24 +43,24 @@ ti_int __ashrti3(ti_int a, si_int b);      // a >> b  arithmetic (sign fill)
 di_int __lshrdi3(di_int a, si_int b);      // a >> b  logical    (zero fill)
 ti_int __lshrti3(ti_int a, si_int b);      // a >> b  logical    (zero fill)
 
-si_int __clzsi2(si_int a);  // count leading zeros
-si_int __clzdi2(di_int a);  // count leading zeros
-si_int __clzti2(ti_int a);  // count leading zeros
-si_int __ctzsi2(si_int a);  // count trailing zeros
-si_int __ctzdi2(di_int a);  // count trailing zeros
-si_int __ctzti2(ti_int a);  // count trailing zeros
+int __clzsi2(si_int a);  // count leading zeros
+int __clzdi2(di_int a);  // count leading zeros
+int __clzti2(ti_int a);  // count leading zeros
+int __ctzsi2(si_int a);  // count trailing zeros
+int __ctzdi2(di_int a);  // count trailing zeros
+int __ctzti2(ti_int a);  // count trailing zeros
 
-si_int __ffssi2(si_int a);  // find least significant 1 bit
-si_int __ffsdi2(di_int a);  // find least significant 1 bit
-si_int __ffsti2(ti_int a);  // find least significant 1 bit
+int __ffssi2(si_int a);  // find least significant 1 bit
+int __ffsdi2(di_int a);  // find least significant 1 bit
+int __ffsti2(ti_int a);  // find least significant 1 bit
 
-si_int __paritysi2(si_int a);  // bit parity
-si_int __paritydi2(di_int a);  // bit parity
-si_int __parityti2(ti_int a);  // bit parity
+int __paritysi2(si_int a);  // bit parity
+int __paritydi2(di_int a);  // bit parity
+int __parityti2(ti_int a);  // bit parity
 
-si_int __popcountsi2(si_int a);  // bit population
-si_int __popcountdi2(di_int a);  // bit population
-si_int __popcountti2(ti_int a);  // bit population
+int __popcountsi2(si_int a);  // bit population
+int __popcountdi2(di_int a);  // bit population
+int __popcountti2(ti_int a);  // bit population
 
 uint32_t __bswapsi2(uint32_t a);   // a byteswapped
 uint64_t __bswapdi2(uint64_t a);   // a byteswapped
@@ -169,10 +174,10 @@ long double __floatuntixf(tu_int a);
 
 //  Floating point raised to integer power
 
-float       __powisf2(      float a, si_int b);  // a ^ b
-double      __powidf2(     double a, si_int b);  // a ^ b
-long double __powixf2(long double a, si_int b);  // a ^ b
-long double __powitf2(long double a, si_int b);  // ppc only, a ^ b
+float       __powisf2(      float a, int b);  // a ^ b
+double      __powidf2(     double a, int b);  // a ^ b
+long double __powixf2(long double a, int b);  // a ^ b
+long double __powitf2(long double a, int b);  // ppc only, a ^ b
 
 //  Complex arithmetic
 
diff --git a/compiler-rt/lib/builtins/absvsi2.c b/compiler-rt/lib/builtins/absvsi2.c
index 44ada169e7e6..9d5de7e8a3f2 100644
--- a/compiler-rt/lib/builtins/absvsi2.c
+++ b/compiler-rt/lib/builtins/absvsi2.c
@@ -18,7 +18,7 @@
 
 COMPILER_RT_ABI si_int __absvsi2(si_int a) {
   const int N = (int)(sizeof(si_int) * CHAR_BIT);
-  if (a == (1 << (N - 1)))
+  if (a == ((si_int)1 << (N - 1)))
     compilerrt_abort();
   const si_int t = a >> (N - 1);
   return (a ^ t) - t;
diff --git a/compiler-rt/lib/builtins/ashldi3.c b/compiler-rt/lib/builtins/ashldi3.c
index 7c81057a2284..04f22228f11d 100644
--- a/compiler-rt/lib/builtins/ashldi3.c
+++ b/compiler-rt/lib/builtins/ashldi3.c
@@ -16,7 +16,7 @@
 
 // Precondition:  0 <= b < bits_in_dword
 
-COMPILER_RT_ABI di_int __ashldi3(di_int a, si_int b) {
+COMPILER_RT_ABI di_int __ashldi3(di_int a, int b) {
   const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT);
   dwords input;
   dwords result;
diff --git a/compiler-rt/lib/builtins/ashrdi3.c b/compiler-rt/lib/builtins/ashrdi3.c
index b9939132205c..934a5c47fd69 100644
--- a/compiler-rt/lib/builtins/ashrdi3.c
+++ b/compiler-rt/lib/builtins/ashrdi3.c
@@ -16,7 +16,7 @@
 
 // Precondition:  0 <= b < bits_in_dword
 
-COMPILER_RT_ABI di_int __ashrdi3(di_int a, si_int b) {
+COMPILER_RT_ABI di_int __ashrdi3(di_int a, int b) {
   const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT);
   dwords input;
   dwords result;
diff --git a/compiler-rt/lib/builtins/atomic.c b/compiler-rt/lib/builtins/atomic.c
index 32b3a0f9ad23..8634a72e77d1 100644
--- a/compiler-rt/lib/builtins/atomic.c
+++ b/compiler-rt/lib/builtins/atomic.c
@@ -23,6 +23,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
 
@@ -293,8 +294,8 @@ OPTIMISED_CASES
 #undef OPTIMISED_CASE
 
 #define OPTIMISED_CASE(n, lockfree, type)                                      \
-  int __atomic_compare_exchange_##n(type *ptr, type *expected, type desired,   \
-                                    int success, int failure) {                \
+  bool __atomic_compare_exchange_##n(type *ptr, type *expected, type desired,  \
+                                     int success, int failure) {               \
     if (lockfree)                                                              \
       return __c11_atomic_compare_exchange_strong(                             \
           (_Atomic(type) *)ptr, expected, desired, success, failure);          \
@@ -303,11 +304,11 @@ OPTIMISED_CASES
     if (*ptr == *expected) {                                                   \
       *ptr = desired;                                                          \
       unlock(l);                                                               \
-      return 1;                                                                \
+      return true;                                                             \
     }                                                                          \
     *expected = *ptr;                                                          \
     unlock(l);                                                                 \
-    return 0;                                                                  \
+    return false;                                                              \
   }
 OPTIMISED_CASES
 #undef OPTIMISED_CASE
diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c
index e83e21254e85..72e02e613de5 100644
--- a/compiler-rt/lib/builtins/clear_cache.c
+++ b/compiler-rt/lib/builtins/clear_cache.c
@@ -147,6 +147,16 @@ void __clear_cache(void *start, void *end) {
 
   for (uintptr_t dword = start_dword; dword < end_dword; dword += dword_size)
     __asm__ volatile("flush %0" : : "r"(dword));
+#elif defined(__riscv) && defined(__linux__)
+#define __NR_riscv_flush_icache (244 + 15)
+  register void *start_reg __asm("a0") = start;
+  const register void *end_reg __asm("a1") = end;
+  const register long flags __asm("a2") = 0;
+  const register long syscall_nr __asm("a7") = __NR_riscv_flush_icache;
+  __asm __volatile("ecall"
+                   : "=r"(start_reg)
+                   : "r"(start_reg), "r"(end_reg), "r"(flags), "r"(syscall_nr));
+  assert(start_reg == 0 && "Cache flush syscall failed.");
 #else
 #if __APPLE__
   // On Darwin, sys_icache_invalidate() provides this functionality
diff --git a/compiler-rt/lib/builtins/clzdi2.c b/compiler-rt/lib/builtins/clzdi2.c
index a0bacb2ae39e..12c17982a5cb 100644
--- a/compiler-rt/lib/builtins/clzdi2.c
+++ b/compiler-rt/lib/builtins/clzdi2.c
@@ -21,15 +21,15 @@
 // ctz instruction, gcc resolves __builtin_clz to __clzdi2 rather than
 // __clzsi2, leading to infinite recursion.
 #define __builtin_clz(a) __clzsi2(a)
-extern si_int __clzsi2(si_int);
+extern int __clzsi2(si_int);
 #endif
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __clzdi2(di_int a) {
+COMPILER_RT_ABI int __clzdi2(di_int a) {
   dwords x;
   x.all = a;
   const si_int f = -(x.s.high == 0);
-  return __builtin_clz((x.s.high & ~f) | (x.s.low & f)) +
+  return clzsi((x.s.high & ~f) | (x.s.low & f)) +
          (f & ((si_int)(sizeof(si_int) * CHAR_BIT)));
 }
diff --git a/compiler-rt/lib/builtins/clzsi2.c b/compiler-rt/lib/builtins/clzsi2.c
index 3f9f27f41331..d75f56d937b0 100644
--- a/compiler-rt/lib/builtins/clzsi2.c
+++ b/compiler-rt/lib/builtins/clzsi2.c
@@ -16,7 +16,7 @@
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __clzsi2(si_int a) {
+COMPILER_RT_ABI int __clzsi2(si_int a) {
   su_int x = (su_int)a;
   si_int t = ((x & 0xFFFF0000) == 0) << 4; // if (x is small) t = 16 else 0
   x >>= 16 - t;                            // x = [0 - 0xFFFF]
diff --git a/compiler-rt/lib/builtins/clzti2.c b/compiler-rt/lib/builtins/clzti2.c
index 0c787104caa2..25d30119f271 100644
--- a/compiler-rt/lib/builtins/clzti2.c
+++ b/compiler-rt/lib/builtins/clzti2.c
@@ -18,7 +18,7 @@
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __clzti2(ti_int a) {
+COMPILER_RT_ABI int __clzti2(ti_int a) {
   twords x;
   x.all = a;
   const di_int f = -(x.s.high == 0);
diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c
index fb619037d398..8346bb62dcfb 100644
--- a/compiler-rt/lib/builtins/cpu_model.c
+++ b/compiler-rt/lib/builtins/cpu_model.c
@@ -82,6 +82,8 @@ enum ProcessorSubtypes {
   INTEL_COREI7_ICELAKE_SERVER,
   AMDFAM17H_ZNVER2,
   INTEL_COREI7_CASCADELAKE,
+  INTEL_COREI7_TIGERLAKE,
+  INTEL_COREI7_COOPERLAKE,
   CPU_SUBTYPE_MAX
 };
 
@@ -122,7 +124,9 @@ enum ProcessorFeatures {
   FEATURE_VPCLMULQDQ,
   FEATURE_AVX512VNNI,
   FEATURE_AVX512BITALG,
-  FEATURE_AVX512BF16
+  FEATURE_AVX512BF16,
+  FEATURE_AVX512VP2INTERSECT,
+  CPU_FEATURE_MAX
 };
 
 // The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max).
@@ -268,13 +272,17 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
   }
 }
 
-static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
-                                            unsigned Brand_id,
-                                            unsigned Features,
-                                            unsigned Features2, unsigned *Type,
-                                            unsigned *Subtype) {
-  if (Brand_id != 0)
-    return;
+static const char *
+getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
+                                const unsigned *Features,
+                                unsigned *Type, unsigned *Subtype) {
+#define testFeature(F)                                                         \
+  ((Features[(F) / 32] & (1U << ((F) % 32))) != 0) // true iff bit F is set
+
+  // We select CPU strings to match the code in Host.cpp, but we don't use them
+  // in compiler-rt.
+  const char *CPU = 0;
+
   switch (Family) {
   case 6:
     switch (Model) {
@@ -285,13 +293,17 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
                // 0Fh. All processors are manufactured using the 65 nm process.
     case 0x16: // Intel Celeron processor model 16h. All processors are
                // manufactured using the 65 nm process
+      CPU = "core2";
+      *Type = INTEL_CORE2;
+      break;
     case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model
                // 17h. All processors are manufactured using the 45 nm process.
                //
                // 45nm: Penryn , Wolfdale, Yorkfield (XE)
     case 0x1d: // Intel Xeon processor MP. All processors are manufactured using
                // the 45 nm process.
-      *Type = INTEL_CORE2; // "penryn"
+      CPU = "penryn";
+      *Type = INTEL_CORE2;
       break;
     case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All
                // processors are manufactured using the 45 nm process.
@@ -299,25 +311,29 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
                // As found in a Summer 2010 model iMac.
     case 0x1f:
     case 0x2e:              // Nehalem EX
-      *Type = INTEL_COREI7; // "nehalem"
+      CPU = "nehalem";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_NEHALEM;
       break;
     case 0x25: // Intel Core i7, laptop version.
     case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All
                // processors are manufactured using the 32 nm process.
     case 0x2f: // Westmere EX
-      *Type = INTEL_COREI7; // "westmere"
+      CPU = "westmere";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_WESTMERE;
       break;
     case 0x2a: // Intel Core i7 processor. All processors are manufactured
                // using the 32 nm process.
     case 0x2d:
-      *Type = INTEL_COREI7; //"sandybridge"
+      CPU = "sandybridge";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_SANDYBRIDGE;
       break;
     case 0x3a:
     case 0x3e:              // Ivy Bridge EP
-      *Type = INTEL_COREI7; // "ivybridge"
+      CPU = "ivybridge";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_IVYBRIDGE;
       break;
 
@@ -326,7 +342,8 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x3f:
     case 0x45:
     case 0x46:
-      *Type = INTEL_COREI7; // "haswell"
+      CPU = "haswell";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_HASWELL;
       break;
 
@@ -335,7 +352,8 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x47:
     case 0x4f:
     case 0x56:
-      *Type = INTEL_COREI7; // "broadwell"
+      CPU = "broadwell";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_BROADWELL;
       break;
 
@@ -344,37 +362,49 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x5e:              // Skylake desktop
     case 0x8e:              // Kaby Lake mobile
     case 0x9e:              // Kaby Lake desktop
-      *Type = INTEL_COREI7; // "skylake"
+    case 0xa5:              // Comet Lake-H/S
+    case 0xa6:              // Comet Lake-U
+      CPU = "skylake";
+      *Type = INTEL_COREI7;
       *Subtype = INTEL_COREI7_SKYLAKE;
       break;
 
     // Skylake Xeon:
     case 0x55:
       *Type = INTEL_COREI7;
-      if (Features2 & (1 << (FEATURE_AVX512VNNI - 32)))
-        *Subtype = INTEL_COREI7_CASCADELAKE; // "cascadelake"
-      else
-        *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512"
+      if (testFeature(FEATURE_AVX512BF16)) {
+        CPU = "cooperlake";
+        *Subtype = INTEL_COREI7_COOPERLAKE;
+      } else if (testFeature(FEATURE_AVX512VNNI)) {
+        CPU = "cascadelake";
+        *Subtype = INTEL_COREI7_CASCADELAKE;
+      } else {
+        CPU = "skylake-avx512";
+        *Subtype = INTEL_COREI7_SKYLAKE_AVX512;
+      }
       break;
 
     // Cannonlake:
     case 0x66:
+      CPU = "cannonlake";
       *Type = INTEL_COREI7;
-      *Subtype = INTEL_COREI7_CANNONLAKE; // "cannonlake"
+      *Subtype = INTEL_COREI7_CANNONLAKE;
       break;
 
     // Icelake:
     case 0x7d:
     case 0x7e:
+      CPU = "icelake-client";
       *Type = INTEL_COREI7;
-      *Subtype = INTEL_COREI7_ICELAKE_CLIENT; // "icelake-client"
+      *Subtype = INTEL_COREI7_ICELAKE_CLIENT;
       break;
 
     // Icelake Xeon:
     case 0x6a:
     case 0x6c:
+      CPU = "icelake-server";
       *Type = INTEL_COREI7;
-      *Subtype = INTEL_COREI7_ICELAKE_SERVER; // "icelake-server"
+      *Subtype = INTEL_COREI7_ICELAKE_SERVER;
       break;
 
     case 0x1c: // Most 45 nm Intel Atom processors
@@ -382,8 +412,9 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x27: // 32 nm Atom Medfield
     case 0x35: // 32 nm Atom Midview
     case 0x36: // 32 nm Atom Midview
+      CPU = "bonnell";
       *Type = INTEL_BONNELL;
-      break; // "bonnell"
+      break;
 
     // Atom Silvermont codes from the Intel software optimization guide.
     case 0x37:
@@ -392,26 +423,32 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x5a:
     case 0x5d:
     case 0x4c: // really airmont
+      CPU = "silvermont";
       *Type = INTEL_SILVERMONT;
-      break; // "silvermont"
+      break;
     // Goldmont:
     case 0x5c: // Apollo Lake
     case 0x5f: // Denverton
+      CPU = "goldmont";
       *Type = INTEL_GOLDMONT;
       break; // "goldmont"
     case 0x7a:
+      CPU = "goldmont-plus";
       *Type = INTEL_GOLDMONT_PLUS;
       break;
     case 0x86:
+      CPU = "tremont";
       *Type = INTEL_TREMONT;
       break;
 
     case 0x57:
-      *Type = INTEL_KNL; // knl
+      CPU = "knl";
+      *Type = INTEL_KNL;
       break;
 
     case 0x85:
-      *Type = INTEL_KNM; // knm
+      CPU = "knm";
+      *Type = INTEL_KNM;
       break;
 
     default: // Unknown family 6 CPU.
@@ -421,17 +458,22 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
   default:
     break; // Unknown.
   }
+
+  return CPU;
 }
 
-static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
-                                          unsigned Features, unsigned Features2,
-                                          unsigned *Type, unsigned *Subtype) {
-  // FIXME: this poorly matches the generated SubtargetFeatureKV table.  There
-  // appears to be no way to generate the wide variety of AMD-specific targets
-  // from the information returned from CPUID.
+static const char *
+getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
+                              const unsigned *Features,
+                              unsigned *Type, unsigned *Subtype) {
+  // We select CPU strings to match the code in Host.cpp, but we don't use them
+  // in compiler-rt.
+  const char *CPU = 0;
+
   switch (Family) {
   case 16:
-    *Type = AMDFAM10H; // "amdfam10"
+    CPU = "amdfam10";
+    *Type = AMDFAM10H;
     switch (Model) {
     case 2:
       *Subtype = AMDFAM10H_BARCELONA;
@@ -445,60 +487,62 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 20:
+    CPU = "btver1";
     *Type = AMD_BTVER1;
-    break; // "btver1";
+    break;
   case 21:
+    CPU = "bdver1";
     *Type = AMDFAM15H;
     if (Model >= 0x60 && Model <= 0x7f) {
+      CPU = "bdver4";
       *Subtype = AMDFAM15H_BDVER4;
-      break; // "bdver4"; 60h-7Fh: Excavator
+      break; // 60h-7Fh: Excavator
     }
     if (Model >= 0x30 && Model <= 0x3f) {
+      CPU = "bdver3";
       *Subtype = AMDFAM15H_BDVER3;
-      break; // "bdver3"; 30h-3Fh: Steamroller
+      break; // 30h-3Fh: Steamroller
     }
     if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) {
+      CPU = "bdver2";
       *Subtype = AMDFAM15H_BDVER2;
-      break; // "bdver2"; 02h, 10h-1Fh: Piledriver
+      break; // 02h, 10h-1Fh: Piledriver
     }
     if (Model <= 0x0f) {
       *Subtype = AMDFAM15H_BDVER1;
-      break; // "bdver1"; 00h-0Fh: Bulldozer
+      break; // 00h-0Fh: Bulldozer
     }
     break;
   case 22:
+    CPU = "btver2";
     *Type = AMD_BTVER2;
-    break; // "btver2"
+    break;
   case 23:
+    CPU = "znver1";
     *Type = AMDFAM17H;
     if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) {
+      CPU = "znver2";
       *Subtype = AMDFAM17H_ZNVER2;
-      break; // "znver2"; 30h-3fh, 71h: Zen2
+      break; // 30h-3fh, 71h: Zen2
     }
     if (Model <= 0x0f) {
       *Subtype = AMDFAM17H_ZNVER1;
-      break; // "znver1"; 00h-0Fh: Zen1
+      break; // 00h-0Fh: Zen1
     }
     break;
   default:
-    break; // "generic"
+    break; // Unknown AMD CPU.
   }
+
+  return CPU;
 }
 
 static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
-                                 unsigned *FeaturesOut,
-                                 unsigned *Features2Out) {
-  unsigned Features = 0;
-  unsigned Features2 = 0;
+                                 unsigned *Features) {
   unsigned EAX, EBX;
 
 #define setFeature(F)                                                          \
-  do {                                                                         \
-    if (F < 32)                                                                \
-      Features |= 1U << (F & 0x1f);                                            \
-    else if (F < 64)                                                           \
-      Features2 |= 1U << ((F - 32) & 0x1f);                                    \
-  } while (0)
+  Features[F / 32] |= 1U << (F % 32)
 
   if ((EDX >> 15) & 1)
     setFeature(FEATURE_CMOV);
@@ -590,6 +634,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(FEATURE_AVX5124VNNIW);
   if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX5124FMAPS);
+  if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save)
+    setFeature(FEATURE_AVX512VP2INTERSECT);
 
   bool HasLeaf7Subleaf1 =
       MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
@@ -607,9 +653,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(FEATURE_XOP);
   if (HasExtLeaf1 && ((ECX >> 16) & 1))
     setFeature(FEATURE_FMA4);
-
-  *FeaturesOut = Features;
-  *Features2Out = Features2;
 #undef setFeature
 }
 
@@ -641,7 +684,7 @@ struct __processor_model {
 #ifndef _WIN32
 __attribute__((visibility("hidden")))
 #endif
-unsigned int __cpu_features2;
+unsigned int __cpu_features2 = 0;
 
 // A constructor function that is sets __cpu_model and __cpu_features2 with
 // the right values.  This needs to run only once.  This constructor is
@@ -653,40 +696,38 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   unsigned EAX, EBX, ECX, EDX;
   unsigned MaxLeaf = 5;
   unsigned Vendor;
-  unsigned Model, Family, Brand_id;
-  unsigned Features = 0;
-  unsigned Features2 = 0;
+  unsigned Model, Family;
+  unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
 
   // This function needs to run just once.
   if (__cpu_model.__cpu_vendor)
     return 0;
 
-  if (!isCpuIdSupported())
-    return -1;
-
-  // Assume cpuid insn present. Run in level 0 to get vendor id.
-  if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) {
+  if (!isCpuIdSupported() ||
+      getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) {
     __cpu_model.__cpu_vendor = VENDOR_OTHER;
     return -1;
   }
+
   getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
   detectX86FamilyModel(EAX, &Family, &Model);
-  Brand_id = EBX & 0xff;
 
   // Find available features.
-  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
-  __cpu_model.__cpu_features[0] = Features;
-  __cpu_features2 = Features2;
+  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]);
+
+  assert((sizeof(Features)/sizeof(Features[0])) == 2);
+  __cpu_model.__cpu_features[0] = Features[0];
+  __cpu_features2 = Features[1];
 
   if (Vendor == SIG_INTEL) {
     // Get CPU type.
-    getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
-                                    Features2, &(__cpu_model.__cpu_type),
+    getIntelProcessorTypeAndSubtype(Family, Model, &Features[0],
+                                    &(__cpu_model.__cpu_type),
                                     &(__cpu_model.__cpu_subtype));
     __cpu_model.__cpu_vendor = VENDOR_INTEL;
   } else if (Vendor == SIG_AMD) {
     // Get CPU type.
-    getAMDProcessorTypeAndSubtype(Family, Model, Features, Features2,
+    getAMDProcessorTypeAndSubtype(Family, Model, &Features[0],
                                   &(__cpu_model.__cpu_type),
                                   &(__cpu_model.__cpu_subtype));
     __cpu_model.__cpu_vendor = VENDOR_AMD;
diff --git a/compiler-rt/lib/builtins/ctzdi2.c b/compiler-rt/lib/builtins/ctzdi2.c
index 9384aa6055a1..26c908d876ac 100644
--- a/compiler-rt/lib/builtins/ctzdi2.c
+++ b/compiler-rt/lib/builtins/ctzdi2.c
@@ -21,15 +21,15 @@
 // ctz instruction, gcc resolves __builtin_ctz to __ctzdi2 rather than
 // __ctzsi2, leading to infinite recursion.
 #define __builtin_ctz(a) __ctzsi2(a)
-extern si_int __ctzsi2(si_int);
+extern int __ctzsi2(si_int);
 #endif
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __ctzdi2(di_int a) {
+COMPILER_RT_ABI int __ctzdi2(di_int a) {
   dwords x;
   x.all = a;
   const si_int f = -(x.s.low == 0);
-  return __builtin_ctz((x.s.high & f) | (x.s.low & ~f)) +
+  return ctzsi((x.s.high & f) | (x.s.low & ~f)) +
          (f & ((si_int)(sizeof(si_int) * CHAR_BIT)));
 }
diff --git a/compiler-rt/lib/builtins/ctzsi2.c b/compiler-rt/lib/builtins/ctzsi2.c
index 09c6863b74e3..ed95c6057933 100644
--- a/compiler-rt/lib/builtins/ctzsi2.c
+++ b/compiler-rt/lib/builtins/ctzsi2.c
@@ -16,7 +16,7 @@
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __ctzsi2(si_int a) {
+COMPILER_RT_ABI int __ctzsi2(si_int a) {
   su_int x = (su_int)a;
   si_int t = ((x & 0x0000FFFF) == 0)
              << 4; // if (x has no small bits) t = 16 else 0
diff --git a/compiler-rt/lib/builtins/ctzti2.c b/compiler-rt/lib/builtins/ctzti2.c
index 2a1312c8437d..fb136d0de1c0 100644
--- a/compiler-rt/lib/builtins/ctzti2.c
+++ b/compiler-rt/lib/builtins/ctzti2.c
@@ -18,7 +18,7 @@
 
 // Precondition: a != 0
 
-COMPILER_RT_ABI si_int __ctzti2(ti_int a) {
+COMPILER_RT_ABI int __ctzti2(ti_int a) {
   twords x;
   x.all = a;
   const di_int f = -(x.s.low == 0);
diff --git a/compiler-rt/lib/builtins/ffsdi2.c b/compiler-rt/lib/builtins/ffsdi2.c
index 9c1a24260956..beae5530430e 100644
--- a/compiler-rt/lib/builtins/ffsdi2.c
+++ b/compiler-rt/lib/builtins/ffsdi2.c
@@ -15,13 +15,13 @@
 // Returns: the index of the least significant 1-bit in a, or
 // the value zero if a is zero. The least significant bit is index one.
 
-COMPILER_RT_ABI si_int __ffsdi2(di_int a) {
+COMPILER_RT_ABI int __ffsdi2(di_int a) {
   dwords x;
   x.all = a;
   if (x.s.low == 0) {
     if (x.s.high == 0)
       return 0;
-    return __builtin_ctz(x.s.high) + (1 + sizeof(si_int) * CHAR_BIT);
+    return ctzsi(x.s.high) + (1 + sizeof(si_int) * CHAR_BIT);
   }
-  return __builtin_ctz(x.s.low) + 1;
+  return ctzsi(x.s.low) + 1;
 }
diff --git a/compiler-rt/lib/builtins/ffssi2.c b/compiler-rt/lib/builtins/ffssi2.c
index cba1f72fdc61..ddb52927f8db 100644
--- a/compiler-rt/lib/builtins/ffssi2.c
+++ b/compiler-rt/lib/builtins/ffssi2.c
@@ -15,9 +15,9 @@
 // Returns: the index of the least significant 1-bit in a, or
 // the value zero if a is zero. The least significant bit is index one.
 
-COMPILER_RT_ABI si_int __ffssi2(si_int a) {
+COMPILER_RT_ABI int __ffssi2(si_int a) {
   if (a == 0) {
     return 0;
   }
-  return __builtin_ctz(a) + 1;
+  return ctzsi(a) + 1;
 }
diff --git a/compiler-rt/lib/builtins/ffsti2.c b/compiler-rt/lib/builtins/ffsti2.c
index a2d7ce08ada1..a2177d148a09 100644
--- a/compiler-rt/lib/builtins/ffsti2.c
+++ b/compiler-rt/lib/builtins/ffsti2.c
@@ -17,7 +17,7 @@
 // Returns: the index of the least significant 1-bit in a, or
 // the value zero if a is zero. The least significant bit is index one.
 
-COMPILER_RT_ABI si_int __ffsti2(ti_int a) {
+COMPILER_RT_ABI int __ffsti2(ti_int a) {
   twords x;
   x.all = a;
   if (x.s.low == 0) {
diff --git a/compiler-rt/lib/builtins/floatdidf.c b/compiler-rt/lib/builtins/floatdidf.c
index 8f887314b9e1..b2d8f2b44b6d 100644
--- a/compiler-rt/lib/builtins/floatdidf.c
+++ b/compiler-rt/lib/builtins/floatdidf.c
@@ -87,7 +87,7 @@ COMPILER_RT_ABI double __floatdidf(di_int a) {
   }
   double_bits fb;
   fb.u.s.high = ((su_int)s & 0x80000000) |        // sign
-                ((e + 1023) << 20) |              // exponent
+                ((su_int)(e + 1023) << 20) |      // exponent
                 ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high
   fb.u.s.low = (su_int)a;                         // mantissa-low
   return fb.f;
diff --git a/compiler-rt/lib/builtins/floatdisf.c b/compiler-rt/lib/builtins/floatdisf.c
index cd9e0a3b78a5..faaa1bcb3c8e 100644
--- a/compiler-rt/lib/builtins/floatdisf.c
+++ b/compiler-rt/lib/builtins/floatdisf.c
@@ -26,7 +26,7 @@ COMPILER_RT_ABI float __floatdisf(di_int a) {
   const di_int s = a >> (N - 1);
   a = (a ^ s) - s;
   int sd = N - __builtin_clzll(a); // number of significant digits
-  int e = sd - 1;                  // exponent
+  si_int e = sd - 1;               // exponent
   if (sd > FLT_MANT_DIG) {
     //  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
     //  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
diff --git a/compiler-rt/lib/builtins/floatsidf.c b/compiler-rt/lib/builtins/floatsidf.c
index 2c66167d794d..28cf32f6388b 100644
--- a/compiler-rt/lib/builtins/floatsidf.c
+++ b/compiler-rt/lib/builtins/floatsidf.c
@@ -17,7 +17,7 @@
 
 #include "int_lib.h"
 
-COMPILER_RT_ABI fp_t __floatsidf(int a) {
+COMPILER_RT_ABI fp_t __floatsidf(si_int a) {
 
   const int aWidth = sizeof a * CHAR_BIT;
 
@@ -33,14 +33,14 @@ COMPILER_RT_ABI fp_t __floatsidf(int a) {
   }
 
   // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - __builtin_clz(a);
+  const int exponent = (aWidth - 1) - clzsi(a);
   rep_t result;
 
   // Shift a into the significand field and clear the implicit bit.  Extra
   // cast to unsigned int is necessary to get the correct behavior for
   // the input INT_MIN.
   const int shift = significandBits - exponent;
-  result = (rep_t)(unsigned int)a << shift ^ implicitBit;
+  result = (rep_t)(su_int)a << shift ^ implicitBit;
 
   // Insert the exponent
   result += (rep_t)(exponent + exponentBias) << significandBits;
@@ -50,7 +50,7 @@ COMPILER_RT_ABI fp_t __floatsidf(int a) {
 
 #if defined(__ARM_EABI__)
 #if defined(COMPILER_RT_ARMHF_TARGET)
-AEABI_RTABI fp_t __aeabi_i2d(int a) { return __floatsidf(a); }
+AEABI_RTABI fp_t __aeabi_i2d(si_int a) { return __floatsidf(a); }
 #else
 COMPILER_RT_ALIAS(__floatsidf, __aeabi_i2d)
 #endif
diff --git a/compiler-rt/lib/builtins/floatundidf.c b/compiler-rt/lib/builtins/floatundidf.c
index e7c6aae5ce38..4c445b118080 100644
--- a/compiler-rt/lib/builtins/floatundidf.c
+++ b/compiler-rt/lib/builtins/floatundidf.c
@@ -90,7 +90,7 @@ COMPILER_RT_ABI double __floatundidf(du_int a) {
     // a is now rounded to DBL_MANT_DIG bits
   }
   double_bits fb;
-  fb.u.s.high = ((e + 1023) << 20) |              // exponent
+  fb.u.s.high = ((su_int)(e + 1023) << 20) |      // exponent
                 ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high
   fb.u.s.low = (su_int)a;                         // mantissa-low
   return fb.f;
diff --git a/compiler-rt/lib/builtins/floatundisf.c b/compiler-rt/lib/builtins/floatundisf.c
index 87841b761ded..00d61b0c6310 100644
--- a/compiler-rt/lib/builtins/floatundisf.c
+++ b/compiler-rt/lib/builtins/floatundisf.c
@@ -24,7 +24,7 @@ COMPILER_RT_ABI float __floatundisf(du_int a) {
     return 0.0F;
   const unsigned N = sizeof(du_int) * CHAR_BIT;
   int sd = N - __builtin_clzll(a); // number of significant digits
-  int e = sd - 1;                  // 8 exponent
+  si_int e = sd - 1;               // 8 exponent
   if (sd > FLT_MANT_DIG) {
     //  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
     //  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
diff --git a/compiler-rt/lib/builtins/floatunsidf.c b/compiler-rt/lib/builtins/floatunsidf.c
index 2c01c3041434..9b3e5fea0e45 100644
--- a/compiler-rt/lib/builtins/floatunsidf.c
+++ b/compiler-rt/lib/builtins/floatunsidf.c
@@ -17,7 +17,7 @@
 
 #include "int_lib.h"
 
-COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) {
+COMPILER_RT_ABI fp_t __floatunsidf(su_int a) {
 
   const int aWidth = sizeof a * CHAR_BIT;
 
@@ -26,7 +26,7 @@ COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) {
     return fromRep(0);
 
   // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - __builtin_clz(a);
+  const int exponent = (aWidth - 1) - clzsi(a);
   rep_t result;
 
   // Shift a into the significand field and clear the implicit bit.
@@ -40,7 +40,7 @@ COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) {
 
 #if defined(__ARM_EABI__)
 #if defined(COMPILER_RT_ARMHF_TARGET)
-AEABI_RTABI fp_t __aeabi_ui2d(unsigned int a) { return __floatunsidf(a); }
+AEABI_RTABI fp_t __aeabi_ui2d(su_int a) { return __floatunsidf(a); }
 #else
 COMPILER_RT_ALIAS(__floatunsidf, __aeabi_ui2d)
 #endif
diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
index d2083c426722..fb512672e35e 100644
--- a/compiler-rt/lib/builtins/fp_extend.h
+++ b/compiler-rt/lib/builtins/fp_extend.h
@@ -21,7 +21,7 @@ typedef float src_t;
 typedef uint32_t src_rep_t;
 #define SRC_REP_C UINT32_C
 static const int srcSigBits = 23;
-#define src_rep_t_clz __builtin_clz
+#define src_rep_t_clz clzsi
 
 #elif defined SRC_DOUBLE
 typedef double src_t;
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
index e2a906681c46..bd1f180f499e 100644
--- a/compiler-rt/lib/builtins/fp_lib.h
+++ b/compiler-rt/lib/builtins/fp_lib.h
@@ -46,7 +46,7 @@ typedef float fp_t;
 #define REP_C UINT32_C
 #define significandBits 23
 
-static __inline int rep_clz(rep_t a) { return __builtin_clz(a); }
+static __inline int rep_clz(rep_t a) { return clzsi(a); }
 
 // 32x32 --> 64 bit multiply
 static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
@@ -69,9 +69,9 @@ static __inline int rep_clz(rep_t a) {
   return __builtin_clzl(a);
 #else
   if (a & REP_C(0xffffffff00000000))
-    return __builtin_clz(a >> 32);
+    return clzsi(a >> 32);
   else
-    return 32 + __builtin_clz(a & REP_C(0xffffffff));
+    return 32 + clzsi(a & REP_C(0xffffffff));
 #endif
 }
 
diff --git a/compiler-rt/lib/builtins/fp_mode.h b/compiler-rt/lib/builtins/fp_mode.h
index 51bec0431a40..4ba682c384f2 100644
--- a/compiler-rt/lib/builtins/fp_mode.h
+++ b/compiler-rt/lib/builtins/fp_mode.h
@@ -23,7 +23,7 @@ typedef enum {
   FE_TOWARDZERO
 } FE_ROUND_MODE;
 
-FE_ROUND_MODE __fe_getround();
-int __fe_raise_inexact();
+FE_ROUND_MODE __fe_getround(void);
+int __fe_raise_inexact(void);
 
 #endif // FP_MODE_H
diff --git a/compiler-rt/lib/builtins/hexagon/dffma.S b/compiler-rt/lib/builtins/hexagon/dffma.S
index c201d3d8be5e..843e88b3cab8 100644
--- a/compiler-rt/lib/builtins/hexagon/dffma.S
+++ b/compiler-rt/lib/builtins/hexagon/dffma.S
@@ -104,13 +104,11 @@
         .type __hexagon_fmadf4,@function
 	.global __hexagon_fmadf5
         .type __hexagon_fmadf5,@function
-	.global fma
-	.type fma,@function
 	Q6_ALIAS(fmadf5)
 	.p2align 5
 __hexagon_fmadf4:
 __hexagon_fmadf5:
-fma:
+.Lfma_begin:
 	{
 		P_TMP = dfclass(A,#2)
 		P_TMP = dfclass(B,#2)
@@ -561,7 +559,7 @@ fma:
 		B = insert(BTMP,#63,#0)
 		AH -= asl(TMP,#HI_MANTBITS)
 	}
-	jump fma
+	jump .Lfma_begin
 
 .Lfma_ab_tiny:
 	ATMP = combine(##0x00100000,#0)
@@ -569,7 +567,7 @@ fma:
 		A = insert(ATMP,#63,#0)
 		B = insert(ATMP,#63,#0)
 	}
-	jump fma
+	jump .Lfma_begin
 
 .Lab_inf:
 	{
diff --git a/compiler-rt/lib/builtins/hexagon/fabs_opt.S b/compiler-rt/lib/builtins/hexagon/fabs_opt.S
deleted file mode 100644
index 6bf9b84b3d20..000000000000
--- a/compiler-rt/lib/builtins/hexagon/fabs_opt.S
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------Hexagon builtin routine ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-.macro FUNCTION_BEGIN name
-.text
-.p2align 5
-.globl \name
-.type  \name, @function
-\name:
-.endm
-
-.macro FUNCTION_END name
-.size  \name, . - \name
-.endm
-
-FUNCTION_BEGIN fabs
-  {
-    r1 = clrbit(r1, #31)
-    jumpr r31
-  }
-FUNCTION_END fabs
-
-FUNCTION_BEGIN fabsf
-  {
-    r0 = clrbit(r0, #31)
-    jumpr r31
-  }
-FUNCTION_END fabsf
-
-  .globl fabsl
-  .set fabsl, fabs
diff --git a/compiler-rt/lib/builtins/hexagon/fma_opt.S b/compiler-rt/lib/builtins/hexagon/fma_opt.S
deleted file mode 100644
index 7f566adffd6a..000000000000
--- a/compiler-rt/lib/builtins/hexagon/fma_opt.S
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------Hexagon builtin routine ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-.macro FUNCTION_BEGIN name
-.text
-.p2align 5
-.globl \name
-.type  \name, @function
-\name:
-.endm
-
-.macro FUNCTION_END name
-.size  \name, . - \name
-.endm
-
-FUNCTION_BEGIN fmaf
-  r2 += sfmpy(r0, r1)
-  {
-    r0 = r2
-    jumpr r31
-  }
-FUNCTION_END fmaf
-
-  .globl fmal
-  .set fmal, fma
diff --git a/compiler-rt/lib/builtins/hexagon/fmax_opt.S b/compiler-rt/lib/builtins/hexagon/fmax_opt.S
deleted file mode 100644
index 81d711dff8d2..000000000000
--- a/compiler-rt/lib/builtins/hexagon/fmax_opt.S
+++ /dev/null
@@ -1,29 +0,0 @@
-//===----------------------Hexagon builtin routine ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-.macro FUNCTION_BEGIN name
-.text
-.p2align 5
-.globl \name
-.type  \name, @function
-\name:
-.endm
-
-.macro FUNCTION_END name
-.size  \name, . - \name
-.endm
-
-FUNCTION_BEGIN fmaxf
-  {
-    r0 = sfmax(r0, r1)
-    jumpr r31
-  }
-FUNCTION_END fmaxf
-
-  .globl fmaxl
-  .set fmaxl, fmax
diff --git a/compiler-rt/lib/builtins/hexagon/fmin_opt.S b/compiler-rt/lib/builtins/hexagon/fmin_opt.S
deleted file mode 100644
index d043f1d7a698..000000000000
--- a/compiler-rt/lib/builtins/hexagon/fmin_opt.S
+++ /dev/null
@@ -1,29 +0,0 @@
-//===----------------------Hexagon builtin routine ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-.macro FUNCTION_BEGIN name
-.text
-.p2align 5
-.globl \name
-.type  \name, @function
-\name:
-.endm
-
-.macro FUNCTION_END name
-.size  \name, . - \name
-.endm
-
-FUNCTION_BEGIN fminf
-  {
-    r0 = sfmin(r0, r1)
-    jumpr r31
-  }
-FUNCTION_END fminf
-
-  .globl fminl
-  .set fminl, fmin
diff --git a/compiler-rt/lib/builtins/i386/floatdidf.S b/compiler-rt/lib/builtins/i386/floatdidf.S
index ab7422c312dc..d588e770364e 100644
--- a/compiler-rt/lib/builtins/i386/floatdidf.S
+++ b/compiler-rt/lib/builtins/i386/floatdidf.S
@@ -4,7 +4,7 @@
 
 #include "../assembly.h"
 
-// double __floatundidf(du_int a);
+// double __floatdidf(di_int a);
 
 #ifdef __i386__
 
diff --git a/compiler-rt/lib/builtins/i386/floatdixf.S b/compiler-rt/lib/builtins/i386/floatdixf.S
index df70f5f9e6e3..19dd0835a9c5 100644
--- a/compiler-rt/lib/builtins/i386/floatdixf.S
+++ b/compiler-rt/lib/builtins/i386/floatdixf.S
@@ -4,7 +4,7 @@
 
 #include "../assembly.h"
 
-// float __floatdixf(di_int a);
+// long double __floatdixf(di_int a);
 
 #ifdef __i386__
 
diff --git a/compiler-rt/lib/builtins/int_div_impl.inc b/compiler-rt/lib/builtins/int_div_impl.inc
new file mode 100644
index 000000000000..de0373889078
--- /dev/null
+++ b/compiler-rt/lib/builtins/int_div_impl.inc
@@ -0,0 +1,70 @@
+//===-- int_div_impl.inc - Integer division ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helpers used by __udivsi3, __umodsi3, __udivdi3, and __umoddi3.
+//
+//===----------------------------------------------------------------------===//
+
+#define clz(a) (sizeof(a) == sizeof(unsigned long long) ? __builtin_clzll(a) : clzsi(a))
+
+// Returns n / d; after Figure 3-40 of The PowerPC Compiler Writer's Guide.
+static __inline fixuint_t __udivXi3(fixuint_t n, fixuint_t d) {
+  const unsigned N = sizeof(fixuint_t) * CHAR_BIT; // operand width in bits
+  // d == 0 cases are unspecified.
+  unsigned sr = (d ? clz(d) : N) - (n ? clz(n) : N);
+  // 0 <= sr <= N - 1, or the unsigned subtraction wrapped and sr is very large.
+  if (sr > N - 1) // n < d
+    return 0;
+  if (sr == N - 1) // d == 1
+    return n;
+  ++sr;
+  // 1 <= sr <= N - 1. Shifts do not trigger UB.
+  fixuint_t r = n >> sr; // running remainder, seeded with the top bits of n
+  n <<= N - sr; // remaining dividend bits; quotient bits are shifted in below
+  fixuint_t carry = 0;
+  for (; sr > 0; --sr) {
+    r = (r << 1) | (n >> (N - 1)); // r:n = (r:n << 1) | carry
+    n = (n << 1) | carry;
+    // Branch-less version of:
+    // carry = 0;
+    // if (r >= d) r -= d, carry = 1;
+    const fixint_t s = (fixint_t)(d - r - 1) >> (N - 1); // mask for r >= d
+    carry = s & 1;
+    r -= d & s;
+  }
+  n = (n << 1) | carry; // shift in the last quotient bit
+  return n;
+}
+
+// Returns n % d. Same algorithm as __udivXi3, but yields the remainder.
+static __inline fixuint_t __umodXi3(fixuint_t n, fixuint_t d) {
+  const unsigned N = sizeof(fixuint_t) * CHAR_BIT; // operand width in bits
+  // d == 0 cases are unspecified.
+  unsigned sr = (d ? clz(d) : N) - (n ? clz(n) : N);
+  // 0 <= sr <= N - 1, or the unsigned subtraction wrapped and sr is very large.
+  if (sr > N - 1) // n < d
+    return n;
+  if (sr == N - 1) // d == 1
+    return 0;
+  ++sr;
+  // 1 <= sr <= N - 1. Shifts do not trigger UB.
+  fixuint_t r = n >> sr; // running remainder, seeded with the top bits of n
+  n <<= N - sr; // remaining dividend bits; quotient bits are shifted in below
+  fixuint_t carry = 0;
+  for (; sr > 0; --sr) {
+    r = (r << 1) | (n >> (N - 1)); // r:n = (r:n << 1) | carry
+    n = (n << 1) | carry;
+    // Branch-less version of:
+    // carry = 0;
+    // if (r >= d) r -= d, carry = 1;
+    const fixint_t s = (fixint_t)(d - r - 1) >> (N - 1); // mask for r >= d
+    carry = s & 1;
+    r -= d & s;
+  }
+  return r; // the quotient bits accumulated in n are discarded
+}
diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h
index 3092f68c084a..991c4a99ea6e 100644
--- a/compiler-rt/lib/builtins/int_lib.h
+++ b/compiler-rt/lib/builtins/int_lib.h
@@ -48,12 +48,20 @@
 #define XSTR(a) STR(a)
 #define SYMBOL_NAME(name) XSTR(__USER_LABEL_PREFIX__) #name
 
-#if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__)
+#if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__) ||           \
+    defined(_AIX)
 #define COMPILER_RT_ALIAS(name, aliasname) \
   COMPILER_RT_ABI __typeof(name) aliasname __attribute__((__alias__(#name)));
 #elif defined(__APPLE__)
+#if defined(VISIBILITY_HIDDEN)
+#define COMPILER_RT_ALIAS_VISIBILITY(name) \
+  __asm__(".private_extern " SYMBOL_NAME(name));
+#else
+#define COMPILER_RT_ALIAS_VISIBILITY(name)
+#endif
 #define COMPILER_RT_ALIAS(name, aliasname) \
   __asm__(".globl " SYMBOL_NAME(aliasname)); \
+  COMPILER_RT_ALIAS_VISIBILITY(aliasname) \
   __asm__(SYMBOL_NAME(aliasname) " = " SYMBOL_NAME(name)); \
   COMPILER_RT_ABI __typeof(name) aliasname;
 #elif defined(_WIN32)
@@ -84,8 +92,8 @@
 // Include internal utility function declarations.
 #include "int_util.h"
 
-COMPILER_RT_ABI si_int __paritysi2(si_int a);
-COMPILER_RT_ABI si_int __paritydi2(di_int a);
+COMPILER_RT_ABI int __paritysi2(si_int a);
+COMPILER_RT_ABI int __paritydi2(di_int a);
 
 COMPILER_RT_ABI di_int __divdi3(di_int a, di_int b);
 COMPILER_RT_ABI si_int __divsi3(si_int a, si_int b);
@@ -94,7 +102,7 @@ COMPILER_RT_ABI su_int __udivsi3(su_int n, su_int d);
 COMPILER_RT_ABI su_int __udivmodsi4(su_int a, su_int b, su_int *rem);
 COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem);
 #ifdef CRT_HAS_128BIT
-COMPILER_RT_ABI si_int __clzti2(ti_int a);
+COMPILER_RT_ABI int __clzti2(ti_int a);
 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem);
 #endif
 
@@ -102,14 +110,14 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem);
 #if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>
 
-uint32_t __inline __builtin_ctz(uint32_t value) {
+int __inline __builtin_ctz(uint32_t value) {
   unsigned long trailing_zero = 0;
   if (_BitScanForward(&trailing_zero, value))
     return trailing_zero;
   return 32;
 }
 
-uint32_t __inline __builtin_clz(uint32_t value) {
+int __inline __builtin_clz(uint32_t value) {
   unsigned long leading_zero = 0;
   if (_BitScanReverse(&leading_zero, value))
     return 31 - leading_zero;
@@ -117,14 +125,14 @@ uint32_t __inline __builtin_clz(uint32_t value) {
 }
 
 #if defined(_M_ARM) || defined(_M_X64)
-uint32_t __inline __builtin_clzll(uint64_t value) {
+int __inline __builtin_clzll(uint64_t value) {
   unsigned long leading_zero = 0;
   if (_BitScanReverse64(&leading_zero, value))
     return 63 - leading_zero;
   return 64;
 }
 #else
-uint32_t __inline __builtin_clzll(uint64_t value) {
+int __inline __builtin_clzll(uint64_t value) {
   if (value == 0)
     return 64;
   uint32_t msh = (uint32_t)(value >> 32);
diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h
index f89220d54350..705355a4840d 100644
--- a/compiler-rt/lib/builtins/int_types.h
+++ b/compiler-rt/lib/builtins/int_types.h
@@ -22,11 +22,20 @@
 #ifdef si_int
 #undef si_int
 #endif
-typedef int si_int;
-typedef unsigned su_int;
+typedef int32_t si_int;
+typedef uint32_t su_int;
+#if UINT_MAX == 0xFFFFFFFF
+#define clzsi __builtin_clz
+#define ctzsi __builtin_ctz
+#elif ULONG_MAX == 0xFFFFFFFF
+#define clzsi __builtin_clzl
+#define ctzsi __builtin_ctzl
+#else
+#error could not determine appropriate clzsi macro for this system
+#endif
 
-typedef long long di_int;
-typedef unsigned long long du_int;
+typedef int64_t di_int;
+typedef uint64_t du_int;
 
 typedef union {
   di_int all;
@@ -135,9 +144,12 @@ typedef struct {
 // Check if the target supports 80 bit extended precision long doubles.
 // Notably, on x86 Windows, MSVC only provides a 64-bit long double, but GCC
 // still makes it 80 bits. Clang will match whatever compiler it is trying to
-// be compatible with.
-#if ((defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)) ||      \
-    defined(__m68k__) || defined(__ia64__)
+// be compatible with. On 32-bit x86 Android, long double is 64 bits, while on
+// x86_64 Android, long double is 128 bits.
+#if (defined(__i386__) || defined(__x86_64__)) &&                              \
+    !(defined(_MSC_VER) || defined(__ANDROID__))
+#define HAS_80_BIT_LONG_DOUBLE 1
+#elif defined(__m68k__) || defined(__ia64__)
 #define HAS_80_BIT_LONG_DOUBLE 1
 #else
 #define HAS_80_BIT_LONG_DOUBLE 0
diff --git a/compiler-rt/lib/builtins/lshrdi3.c b/compiler-rt/lib/builtins/lshrdi3.c
index 97e08e1e9ba0..6072152583ac 100644
--- a/compiler-rt/lib/builtins/lshrdi3.c
+++ b/compiler-rt/lib/builtins/lshrdi3.c
@@ -16,7 +16,7 @@
 
 // Precondition:  0 <= b < bits_in_dword
 
-COMPILER_RT_ABI di_int __lshrdi3(di_int a, si_int b) {
+COMPILER_RT_ABI di_int __lshrdi3(di_int a, int b) {
   const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT);
   udwords input;
   udwords result;
diff --git a/compiler-rt/lib/builtins/paritydi2.c b/compiler-rt/lib/builtins/paritydi2.c
index dd9d45e63ea4..58e85f89e043 100644
--- a/compiler-rt/lib/builtins/paritydi2.c
+++ b/compiler-rt/lib/builtins/paritydi2.c
@@ -14,7 +14,7 @@
 
 // Returns: 1 if number of bits is odd else returns 0
 
-COMPILER_RT_ABI si_int __paritydi2(di_int a) {
+COMPILER_RT_ABI int __paritydi2(di_int a) {
   dwords x;
   x.all = a;
   return __paritysi2(x.s.high ^ x.s.low);
diff --git a/compiler-rt/lib/builtins/paritysi2.c b/compiler-rt/lib/builtins/paritysi2.c
index 3efa961f2f85..a4b84e080632 100644
--- a/compiler-rt/lib/builtins/paritysi2.c
+++ b/compiler-rt/lib/builtins/paritysi2.c
@@ -14,7 +14,7 @@
 
 // Returns: 1 if number of bits is odd else returns 0
 
-COMPILER_RT_ABI si_int __paritysi2(si_int a) {
+COMPILER_RT_ABI int __paritysi2(si_int a) {
   su_int x = (su_int)a;
   x ^= x >> 16;
   x ^= x >> 8;
diff --git a/compiler-rt/lib/builtins/parityti2.c b/compiler-rt/lib/builtins/parityti2.c
index f3942ba8378c..79e920d8a02d 100644
--- a/compiler-rt/lib/builtins/parityti2.c
+++ b/compiler-rt/lib/builtins/parityti2.c
@@ -16,7 +16,7 @@
 
 // Returns: 1 if number of bits is odd else returns 0
 
-COMPILER_RT_ABI si_int __parityti2(ti_int a) {
+COMPILER_RT_ABI int __parityti2(ti_int a) {
   twords x;
   x.all = a;
   return __paritydi2(x.s.high ^ x.s.low);
diff --git a/compiler-rt/lib/builtins/popcountdi2.c b/compiler-rt/lib/builtins/popcountdi2.c
index 9bbc39c6608a..20dd0b0239ef 100644
--- a/compiler-rt/lib/builtins/popcountdi2.c
+++ b/compiler-rt/lib/builtins/popcountdi2.c
@@ -14,7 +14,7 @@
 
 // Returns: count of 1 bits
 
-COMPILER_RT_ABI si_int __popcountdi2(di_int a) {
+COMPILER_RT_ABI int __popcountdi2(di_int a) {
   du_int x2 = (du_int)a;
   x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL);
   // Every 2 bits holds the sum of every pair of bits (32)
diff --git a/compiler-rt/lib/builtins/popcountsi2.c b/compiler-rt/lib/builtins/popcountsi2.c
index 75e592a778d9..4d346c45d9ce 100644
--- a/compiler-rt/lib/builtins/popcountsi2.c
+++ b/compiler-rt/lib/builtins/popcountsi2.c
@@ -14,7 +14,7 @@
 
 // Returns: count of 1 bits
 
-COMPILER_RT_ABI si_int __popcountsi2(si_int a) {
+COMPILER_RT_ABI int __popcountsi2(si_int a) {
   su_int x = (su_int)a;
   x = x - ((x >> 1) & 0x55555555);
   // Every 2 bits holds the sum of every pair of bits
diff --git a/compiler-rt/lib/builtins/popcountti2.c b/compiler-rt/lib/builtins/popcountti2.c
index 853fd722309e..79cbb2fb34c0 100644
--- a/compiler-rt/lib/builtins/popcountti2.c
+++ b/compiler-rt/lib/builtins/popcountti2.c
@@ -17,7 +17,7 @@
 
 // Returns: count of 1 bits
 
-COMPILER_RT_ABI si_int __popcountti2(ti_int a) {
+COMPILER_RT_ABI int __popcountti2(ti_int a) {
   tu_int x3 = (tu_int)a;
   x3 = x3 - ((x3 >> 1) &
              (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
diff --git a/compiler-rt/lib/builtins/powidf2.c b/compiler-rt/lib/builtins/powidf2.c
index 9697588484e7..81058af50829 100644
--- a/compiler-rt/lib/builtins/powidf2.c
+++ b/compiler-rt/lib/builtins/powidf2.c
@@ -14,7 +14,7 @@
 
 // Returns: a ^ b
 
-COMPILER_RT_ABI double __powidf2(double a, si_int b) {
+COMPILER_RT_ABI double __powidf2(double a, int b) {
   const int recip = b < 0;
   double r = 1;
   while (1) {
diff --git a/compiler-rt/lib/builtins/powisf2.c b/compiler-rt/lib/builtins/powisf2.c
index 469402348825..d0ab26167bbd 100644
--- a/compiler-rt/lib/builtins/powisf2.c
+++ b/compiler-rt/lib/builtins/powisf2.c
@@ -14,7 +14,7 @@
 
 // Returns: a ^ b
 
-COMPILER_RT_ABI float __powisf2(float a, si_int b) {
+COMPILER_RT_ABI float __powisf2(float a, int b) {
   const int recip = b < 0;
   float r = 1;
   while (1) {
diff --git a/compiler-rt/lib/builtins/powitf2.c b/compiler-rt/lib/builtins/powitf2.c
index fcbdb4c2ee2a..8e639a03a3c4 100644
--- a/compiler-rt/lib/builtins/powitf2.c
+++ b/compiler-rt/lib/builtins/powitf2.c
@@ -10,13 +10,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "int_lib.h"
+#define QUAD_PRECISION
+#include "fp_lib.h"
 
-#if _ARCH_PPC
+#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT)
 
 // Returns: a ^ b
 
-COMPILER_RT_ABI long double __powitf2(long double a, si_int b) {
+COMPILER_RT_ABI long double __powitf2(long double a, int b) {
   const int recip = b < 0;
   long double r = 1;
   while (1) {
diff --git a/compiler-rt/lib/builtins/powixf2.c b/compiler-rt/lib/builtins/powixf2.c
index b7b52095afa1..3edfe9fd7af5 100644
--- a/compiler-rt/lib/builtins/powixf2.c
+++ b/compiler-rt/lib/builtins/powixf2.c
@@ -16,7 +16,7 @@
 
 // Returns: a ^ b
 
-COMPILER_RT_ABI long double __powixf2(long double a, si_int b) {
+COMPILER_RT_ABI long double __powixf2(long double a, int b) {
   const int recip = b < 0;
   long double r = 1;
   while (1) {
diff --git a/compiler-rt/lib/builtins/riscv/int_mul_impl.inc b/compiler-rt/lib/builtins/riscv/int_mul_impl.inc
new file mode 100644
index 000000000000..50951d5f4195
--- /dev/null
+++ b/compiler-rt/lib/builtins/riscv/int_mul_impl.inc
@@ -0,0 +1,31 @@
+//===-- int_mul_impl.inc - Integer multiplication -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helpers used by __mulsi3, __muldi3.
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(__riscv_mul)
+	.text
+	.align 2
+
+	.globl __mulxi3
+	.type  __mulxi3, @function
+__mulxi3: // shift-and-add multiply of a0 by a1; product is left in a0
+	mv     a2, a0 // a2 = multiplicand (a0 on entry)
+	mv     a0, zero // a0 = running product, starts at 0
+.L1:
+	andi   a3, a1, 1 // a3 = lowest bit of the multiplier
+	beqz   a3, .L2 // skip the add when that bit is clear
+	add    a0, a0, a2 // product += multiplicand
+.L2:
+	srli   a1, a1, 1 // multiplier >>= 1 (logical shift)
+	slli   a2, a2, 1 // multiplicand <<= 1
+	bnez   a1, .L1 // loop while multiplier bits remain
+	ret
+#endif
diff --git a/compiler-rt/lib/builtins/riscv/muldi3.S b/compiler-rt/lib/builtins/riscv/muldi3.S
new file mode 100644
index 000000000000..9e292e8dd8b9
--- /dev/null
+++ b/compiler-rt/lib/builtins/riscv/muldi3.S
@@ -0,0 +1,11 @@
+//===--- muldi3.S - Integer multiplication routines -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#if __riscv_xlen == 64
+#define __mulxi3 __muldi3
+#include "int_mul_impl.inc"
+#endif
diff --git a/compiler-rt/lib/builtins/riscv/mulsi3.S b/compiler-rt/lib/builtins/riscv/mulsi3.S
index 5464919b26b9..cfafb7a0d7b3 100644
--- a/compiler-rt/lib/builtins/riscv/mulsi3.S
+++ b/compiler-rt/lib/builtins/riscv/mulsi3.S
@@ -1,4 +1,4 @@
-//===--- mulsi3.S - Integer multiplication routines routines ---===//
+//===--- mulsi3.S - Integer multiplication routines -----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,22 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if !defined(__riscv_mul) && __riscv_xlen == 32
-	.text
-	.align 2
-
-	.globl __mulsi3
-	.type  __mulsi3, @function
-__mulsi3:
-	mv     a2, a0
-	mv     a0, zero
-.L1:
-	andi   a3, a1, 1
-	beqz   a3, .L2
-	add    a0, a0, a2
-.L2:
-	srli   a1, a1, 1
-	slli   a2, a2, 1
-	bnez   a1, .L1
-	ret
+#if __riscv_xlen == 32
+#define __mulxi3 __mulsi3
+#include "int_mul_impl.inc"
 #endif
diff --git a/compiler-rt/lib/builtins/udivdi3.c b/compiler-rt/lib/builtins/udivdi3.c
index a23139ec947f..74319cbe71c3 100644
--- a/compiler-rt/lib/builtins/udivdi3.c
+++ b/compiler-rt/lib/builtins/udivdi3.c
@@ -12,8 +12,12 @@
 
 #include "int_lib.h"
 
+typedef du_int fixuint_t;
+typedef di_int fixint_t;
+#include "int_div_impl.inc"
+
 // Returns: a / b
 
 COMPILER_RT_ABI du_int __udivdi3(du_int a, du_int b) {
-  return __udivmoddi4(a, b, 0);
+  return __udivXi3(a, b);
 }
diff --git a/compiler-rt/lib/builtins/udivmoddi4.c b/compiler-rt/lib/builtins/udivmoddi4.c
index 5b297c32d790..10b41df28f84 100644
--- a/compiler-rt/lib/builtins/udivmoddi4.c
+++ b/compiler-rt/lib/builtins/udivmoddi4.c
@@ -87,7 +87,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) {
     // K K
     // ---
     // K 0
-    sr = __builtin_clz(d.s.high) - __builtin_clz(n.s.high);
+    sr = clzsi(d.s.high) - clzsi(n.s.high);
     // 0 <= sr <= n_uword_bits - 2 or sr large
     if (sr > n_uword_bits - 2) {
       if (rem)
@@ -120,7 +120,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) {
       // K X
       // ---
       // 0 K
-      sr = 1 + n_uword_bits + __builtin_clz(d.s.low) - __builtin_clz(n.s.high);
+      sr = 1 + n_uword_bits + clzsi(d.s.low) - clzsi(n.s.high);
       // 2 <= sr <= n_udword_bits - 1
       // q.all = n.all << (n_udword_bits - sr);
       // r.all = n.all >> sr;
@@ -145,7 +145,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) {
       // K X
       // ---
       // K K
-      sr = __builtin_clz(d.s.high) - __builtin_clz(n.s.high);
+      sr = clzsi(d.s.high) - clzsi(n.s.high);
       // 0 <= sr <= n_uword_bits - 1 or sr large
       if (sr > n_uword_bits - 1) {
         if (rem)
diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c
index dd14a8b579ca..55def37c9e1f 100644
--- a/compiler-rt/lib/builtins/udivmodti4.c
+++ b/compiler-rt/lib/builtins/udivmodti4.c
@@ -14,182 +14,145 @@
 
 #ifdef CRT_HAS_128BIT
 
+// Divides the 128-bit value u1:u0 (u1 is the high half) by the 64-bit v and
+// returns the quotient, which must fit in 64 bits. The remainder is stored
+// in *r. Taken and adjusted from the libdivide libdivide_128_div_64_to_64
+// division fallback. For a correctness proof see the reference for this
+// algorithm in Knuth, Volume 2, section 4.3.1, Algorithm D.
+UNUSED
+static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v,
+                                            du_int *r) {
+  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
+  const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits)
+  du_int un1, un0;                                // Norm. dividend LSD's
+  du_int vn1, vn0;                                // Norm. divisor digits
+  du_int q1, q0;                                  // Quotient digits
+  du_int un64, un21, un10;                        // Dividend digit pairs
+  du_int rhat;                                    // A remainder
+  si_int s;                                       // Shift amount for normalization
+
+  s = __builtin_clzll(v); // leading zeros of the divisor; undefined if v == 0
+  if (s > 0) {
+    // Normalize the divisor.
+    v = v << s;
+    un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
+    un10 = u0 << s; // Shift dividend left
+  } else {
+    // Avoid undefined behavior of (u0 >> 64).
+    un64 = u1;
+    un10 = u0;
+  }
+
+  // Break divisor up into two 32-bit digits.
+  vn1 = v >> (n_udword_bits / 2);
+  vn0 = v & 0xFFFFFFFF;
+
+  // Break right half of dividend into two digits.
+  un1 = un10 >> (n_udword_bits / 2);
+  un0 = un10 & 0xFFFFFFFF;
+
+  // Compute the first quotient digit, q1.
+  q1 = un64 / vn1;
+  rhat = un64 - q1 * vn1;
+
+  // q1 has at most error 2. No more than 2 iterations.
+  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+    q1 = q1 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  un21 = un64 * b + un1 - q1 * v;
+
+  // Compute the second quotient digit.
+  q0 = un21 / vn1;
+  rhat = un21 - q0 * vn1;
+
+  // q0 has at most error 2. No more than 2 iterations.
+  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+    q0 = q0 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  *r = (un21 * b + un0 - q0 * v) >> s; // undo the normalization shift
+  return q1 * b + q0;
+}
+
+static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v,
+                                     du_int *r) { // u1:u0 / v, remainder in *r
+#if defined(__x86_64__)
+  du_int result;
+  __asm__("divq %[v]" // divides RDX:RAX by v: quotient -> RAX, remainder -> RDX
+          : "=a"(result), "=d"(*r)
+          : [ v ] "r"(v), "a"(u0), "d"(u1));
+  return result;
+#else
+  return udiv128by64to64default(u1, u0, v, r); // portable C fallback
+#endif
+}
+
 // Effects: if rem != 0, *rem = a % b
 // Returns: a / b
 
-// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide
-
 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) {
-  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
   const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
-  utwords n;
-  n.all = a;
-  utwords d;
-  d.all = b;
-  utwords q;
-  utwords r;
-  unsigned sr;
-  // special cases, X is unknown, K != 0
-  if (n.s.high == 0) {
-    if (d.s.high == 0) {
-      // 0 X
-      // ---
-      // 0 X
-      if (rem)
-        *rem = n.s.low % d.s.low;
-      return n.s.low / d.s.low;
-    }
-    // 0 X
-    // ---
-    // K X
+  utwords dividend;
+  dividend.all = a;
+  utwords divisor;
+  divisor.all = b;
+  utwords quotient;
+  utwords remainder;
+  if (divisor.all > dividend.all) {
     if (rem)
-      *rem = n.s.low;
+      *rem = dividend.all;
     return 0;
   }
-  // n.s.high != 0
-  if (d.s.low == 0) {
-    if (d.s.high == 0) {
-      // K X
-      // ---
-      // 0 0
-      if (rem)
-        *rem = n.s.high % d.s.low;
-      return n.s.high / d.s.low;
-    }
-    // d.s.high != 0
-    if (n.s.low == 0) {
-      // K 0
-      // ---
-      // K 0
-      if (rem) {
-        r.s.high = n.s.high % d.s.high;
-        r.s.low = 0;
-        *rem = r.all;
-      }
-      return n.s.high / d.s.high;
-    }
-    // K K
-    // ---
-    // K 0
-    if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ {
-      if (rem) {
-        r.s.low = n.s.low;
-        r.s.high = n.s.high & (d.s.high - 1);
-        *rem = r.all;
-      }
-      return n.s.high >> __builtin_ctzll(d.s.high);
-    }
-    // K K
-    // ---
-    // K 0
-    sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
-    // 0 <= sr <= n_udword_bits - 2 or sr large
-    if (sr > n_udword_bits - 2) {
-      if (rem)
-        *rem = n.all;
-      return 0;
-    }
-    ++sr;
-    // 1 <= sr <= n_udword_bits - 1
-    // q.all = n.all << (n_utword_bits - sr);
-    q.s.low = 0;
-    q.s.high = n.s.low << (n_udword_bits - sr);
-    // r.all = n.all >> sr;
-    r.s.high = n.s.high >> sr;
-    r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
-  } else /* d.s.low != 0 */ {
-    if (d.s.high == 0) {
-      // K X
-      // ---
-      // 0 K
-      if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ {
-        if (rem)
-          *rem = n.s.low & (d.s.low - 1);
-        if (d.s.low == 1)
-          return n.all;
-        sr = __builtin_ctzll(d.s.low);
-        q.s.high = n.s.high >> sr;
-        q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
-        return q.all;
-      }
-      // K X
-      // ---
-      // 0 K
-      sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) -
-           __builtin_clzll(n.s.high);
-      // 2 <= sr <= n_utword_bits - 1
-      // q.all = n.all << (n_utword_bits - sr);
-      // r.all = n.all >> sr;
-      if (sr == n_udword_bits) {
-        q.s.low = 0;
-        q.s.high = n.s.low;
-        r.s.high = 0;
-        r.s.low = n.s.high;
-      } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ {
-        q.s.low = 0;
-        q.s.high = n.s.low << (n_udword_bits - sr);
-        r.s.high = n.s.high >> sr;
-        r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
-      } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ {
-        q.s.low = n.s.low << (n_utword_bits - sr);
-        q.s.high = (n.s.high << (n_utword_bits - sr)) |
-                   (n.s.low >> (sr - n_udword_bits));
-        r.s.high = 0;
-        r.s.low = n.s.high >> (sr - n_udword_bits);
-      }
+  // When the divisor fits in 64 bits, we can use an optimized path.
+  if (divisor.s.high == 0) {
+    remainder.s.high = 0;
+    if (dividend.s.high < divisor.s.low) {
+      // The result fits in 64 bits.
+      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
+                                       divisor.s.low, &remainder.s.low);
+      quotient.s.high = 0;
     } else {
-      // K X
-      // ---
-      // K K
-      sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
-      // 0 <= sr <= n_udword_bits - 1 or sr large
-      if (sr > n_udword_bits - 1) {
-        if (rem)
-          *rem = n.all;
-        return 0;
-      }
-      ++sr;
-      // 1 <= sr <= n_udword_bits
-      // q.all = n.all << (n_utword_bits - sr);
-      // r.all = n.all >> sr;
-      q.s.low = 0;
-      if (sr == n_udword_bits) {
-        q.s.high = n.s.low;
-        r.s.high = 0;
-        r.s.low = n.s.high;
-      } else {
-        r.s.high = n.s.high >> sr;
-        r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
-        q.s.high = n.s.low << (n_udword_bits - sr);
-      }
+      // First, divide with the high part to get the remainder in dividend.s.high.
+      // After that dividend.s.high < divisor.s.low.
+      quotient.s.high = dividend.s.high / divisor.s.low;
+      dividend.s.high = dividend.s.high % divisor.s.low;
+      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
+                                       divisor.s.low, &remainder.s.low);
     }
+    if (rem)
+      *rem = remainder.all;
+    return quotient.all;
   }
-  // Not a special case
-  // q and r are initialized with:
-  // q.all = n.all << (n_utword_bits - sr);
-  // r.all = n.all >> sr;
-  // 1 <= sr <= n_utword_bits - 1
-  su_int carry = 0;
-  for (; sr > 0; --sr) {
-    // r:q = ((r:q)  << 1) | carry
-    r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1));
-    r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1));
-    q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1));
-    q.s.low = (q.s.low << 1) | carry;
-    // carry = 0;
-    // if (r.all >= d.all)
+  // 0 <= shift <= 63.
+  si_int shift =
+      __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
+  divisor.all <<= shift;
+  quotient.s.high = 0;
+  quotient.s.low = 0;
+  for (; shift >= 0; --shift) {
+    quotient.s.low <<= 1;
+    // Branch-free version of:
+    // if (dividend.all >= divisor.all)
     // {
-    //     r.all -= d.all;
-    //      carry = 1;
+    //    dividend.all -= divisor.all;
+    //    quotient.s.low |= 1;
     // }
-    const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1);
-    carry = s & 1;
-    r.all -= d.all & s;
+    const ti_int s =
+        (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
+    quotient.s.low |= s & 1;
+    dividend.all -= divisor.all & s;
+    divisor.all >>= 1;
   }
-  q.all = (q.all << 1) | carry;
   if (rem)
-    *rem = r.all;
-  return q.all;
+    *rem = dividend.all;
+  return quotient.all;
 }
 
 #endif // CRT_HAS_128BIT
diff --git a/compiler-rt/lib/builtins/udivsi3.c b/compiler-rt/lib/builtins/udivsi3.c
index 18cc96c1b2e0..3894e1597552 100644
--- a/compiler-rt/lib/builtins/udivsi3.c
+++ b/compiler-rt/lib/builtins/udivsi3.c
@@ -12,49 +12,14 @@
 
 #include "int_lib.h"
 
-// Returns: a / b
+typedef su_int fixuint_t;
+typedef si_int fixint_t;
+#include "int_div_impl.inc"
 
-// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide
+// Returns: a / b
 
-// This function should not call __divsi3!
-COMPILER_RT_ABI su_int __udivsi3(su_int n, su_int d) {
-  const unsigned n_uword_bits = sizeof(su_int) * CHAR_BIT;
-  su_int q;
-  su_int r;
-  unsigned sr;
-  // special cases
-  if (d == 0)
-    return 0; // ?!
-  if (n == 0)
-    return 0;
-  sr = __builtin_clz(d) - __builtin_clz(n);
-  // 0 <= sr <= n_uword_bits - 1 or sr large
-  if (sr > n_uword_bits - 1) // d > r
-    return 0;
-  if (sr == n_uword_bits - 1) // d == 1
-    return n;
-  ++sr;
-  // 1 <= sr <= n_uword_bits - 1
-  // Not a special case
-  q = n << (n_uword_bits - sr);
-  r = n >> sr;
-  su_int carry = 0;
-  for (; sr > 0; --sr) {
-    // r:q = ((r:q)  << 1) | carry
-    r = (r << 1) | (q >> (n_uword_bits - 1));
-    q = (q << 1) | carry;
-    // carry = 0;
-    // if (r.all >= d.all)
-    // {
-    //      r.all -= d.all;
-    //      carry = 1;
-    // }
-    const si_int s = (si_int)(d - r - 1) >> (n_uword_bits - 1);
-    carry = s & 1;
-    r -= d & s;
-  }
-  q = (q << 1) | carry;
-  return q;
+COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) {
+  return __udivXi3(a, b); // fixuint_t is su_int here (typedef above)
 }
 
 #if defined(__ARM_EABI__)
diff --git a/compiler-rt/lib/builtins/umoddi3.c b/compiler-rt/lib/builtins/umoddi3.c
index 965cf8fc01bd..e672da96ef62 100644
--- a/compiler-rt/lib/builtins/umoddi3.c
+++ b/compiler-rt/lib/builtins/umoddi3.c
@@ -12,10 +12,12 @@
 
 #include "int_lib.h"
 
+typedef du_int fixuint_t;
+typedef di_int fixint_t;
+#include "int_div_impl.inc"
+
 // Returns: a % b
 
 COMPILER_RT_ABI du_int __umoddi3(du_int a, du_int b) {
-  du_int r;
-  __udivmoddi4(a, b, &r);
-  return r;
+  return __umodXi3(a, b); // fixuint_t is du_int here (typedef above)
 }
diff --git a/compiler-rt/lib/builtins/umodsi3.c b/compiler-rt/lib/builtins/umodsi3.c
index ce9abcd94ef7..5383aea656a9 100644
--- a/compiler-rt/lib/builtins/umodsi3.c
+++ b/compiler-rt/lib/builtins/umodsi3.c
@@ -12,8 +12,12 @@
 
 #include "int_lib.h"
 
+typedef su_int fixuint_t;
+typedef si_int fixint_t;
+#include "int_div_impl.inc"
+
 // Returns: a % b
 
 COMPILER_RT_ABI su_int __umodsi3(su_int a, su_int b) {
-  return a - __udivsi3(a, b) * b;
+  return __umodXi3(a, b); // fixuint_t is su_int here (typedef above)
 }
diff --git a/compiler-rt/lib/builtins/ve/grow_stack.S b/compiler-rt/lib/builtins/ve/grow_stack.S
new file mode 100644
index 000000000000..f403798495af
--- /dev/null
+++ b/compiler-rt/lib/builtins/ve/grow_stack.S
@@ -0,0 +1,31 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "../assembly.h"
+
+// grow_stack routine
+// This routine is VE specific
+// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf
+
+// Clobbers only %s62 and %s63 (in addition to updating %sp).
+
+#ifdef __ve__
+
+.text
+.p2align        4
+DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack)
+        subu.l          %sp, %sp, %s0           # sp -= alloca size
+        and             %sp, -16, %sp           # align sp: sp &= -16
+        brge.l.t        %sp, %sl, 1f            # sp >= %sl: skip grow request
+        ld              %s63, 0x18(,%tp)        # load param area
+        lea             %s62, 0x13b             # syscall number of grow
+        shm.l           %s62, 0x0(%s63)         # stored at addr:0
+        shm.l           %sl, 0x8(%s63)          # old limit at addr:8
+        shm.l           %sp, 0x10(%s63)         # new limit at addr:16
+        monc                                    # monitor-call trap (TODO confirm)
+1:
+        b.l             (,%lr)                  # return via %lr
+END_COMPILERRT_FUNCTION(__ve_grow_stack)
+
+#endif // __ve__
diff --git a/compiler-rt/lib/builtins/ve/grow_stack_align.S b/compiler-rt/lib/builtins/ve/grow_stack_align.S
new file mode 100644
index 000000000000..19a1dfa8726c
--- /dev/null
+++ b/compiler-rt/lib/builtins/ve/grow_stack_align.S
@@ -0,0 +1,31 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "../assembly.h"
+
+// grow_stack_align routine
+// This routine is VE specific
+// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf
+
+// Clobbers only %s62 and %s63 (in addition to updating %sp).
+
+#ifdef __ve__
+
+.text
+.p2align        4
+DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack_align)
+        subu.l          %sp, %sp, %s0           # sp -= alloca size
+        and             %sp, %sp, %s1           # align sp: sp &= %s1
+        brge.l.t        %sp, %sl, 1f            # sp >= %sl: skip grow request
+        ld              %s63, 0x18(,%tp)        # load param area
+        lea             %s62, 0x13b             # syscall number of grow
+        shm.l           %s62, 0x0(%s63)         # stored at addr:0
+        shm.l           %sl, 0x8(%s63)          # old limit at addr:8
+        shm.l           %sp, 0x10(%s63)         # new limit at addr:16
+        monc                                    # monitor-call trap (TODO confirm)
+1:
+        b.l             (,%lr)                  # return via %lr
+END_COMPILERRT_FUNCTION(__ve_grow_stack_align)
+
+#endif // __ve__