diff options
Diffstat (limited to 'compiler-rt/lib/builtins')
57 files changed, 572 insertions, 523 deletions
diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index e603dfa05356..f9e1bc805092 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -20,13 +20,18 @@ Here is the specification for this library: http://gcc.gnu.org/onlinedocs/gccint/Libgcc.html#Libgcc +Please note that the libgcc specification explicitly mentions actual types of +arguments and returned values being expressed with machine modes. +In some cases particular types such as "int", "unsigned", "long long", etc. +may be specified just as examples there. + Here is a synopsis of the contents of this library: -typedef int si_int; -typedef unsigned su_int; +typedef int32_t si_int; +typedef uint32_t su_int; -typedef long long di_int; -typedef unsigned long long du_int; +typedef int64_t di_int; +typedef uint64_t du_int; // Integral bit manipulation @@ -38,24 +43,24 @@ ti_int __ashrti3(ti_int a, si_int b); // a >> b arithmetic (sign fill) di_int __lshrdi3(di_int a, si_int b); // a >> b logical (zero fill) ti_int __lshrti3(ti_int a, si_int b); // a >> b logical (zero fill) -si_int __clzsi2(si_int a); // count leading zeros -si_int __clzdi2(di_int a); // count leading zeros -si_int __clzti2(ti_int a); // count leading zeros -si_int __ctzsi2(si_int a); // count trailing zeros -si_int __ctzdi2(di_int a); // count trailing zeros -si_int __ctzti2(ti_int a); // count trailing zeros +int __clzsi2(si_int a); // count leading zeros +int __clzdi2(di_int a); // count leading zeros +int __clzti2(ti_int a); // count leading zeros +int __ctzsi2(si_int a); // count trailing zeros +int __ctzdi2(di_int a); // count trailing zeros +int __ctzti2(ti_int a); // count trailing zeros -si_int __ffssi2(si_int a); // find least significant 1 bit -si_int __ffsdi2(di_int a); // find least significant 1 bit -si_int __ffsti2(ti_int a); // find least significant 1 bit +int __ffssi2(si_int a); // find least significant 1 bit +int __ffsdi2(di_int a); // find least 
significant 1 bit +int __ffsti2(ti_int a); // find least significant 1 bit -si_int __paritysi2(si_int a); // bit parity -si_int __paritydi2(di_int a); // bit parity -si_int __parityti2(ti_int a); // bit parity +int __paritysi2(si_int a); // bit parity +int __paritydi2(di_int a); // bit parity +int __parityti2(ti_int a); // bit parity -si_int __popcountsi2(si_int a); // bit population -si_int __popcountdi2(di_int a); // bit population -si_int __popcountti2(ti_int a); // bit population +int __popcountsi2(si_int a); // bit population +int __popcountdi2(di_int a); // bit population +int __popcountti2(ti_int a); // bit population uint32_t __bswapsi2(uint32_t a); // a byteswapped uint64_t __bswapdi2(uint64_t a); // a byteswapped @@ -169,10 +174,10 @@ long double __floatuntixf(tu_int a); // Floating point raised to integer power -float __powisf2( float a, si_int b); // a ^ b -double __powidf2( double a, si_int b); // a ^ b -long double __powixf2(long double a, si_int b); // a ^ b -long double __powitf2(long double a, si_int b); // ppc only, a ^ b +float __powisf2( float a, int b); // a ^ b +double __powidf2( double a, int b); // a ^ b +long double __powixf2(long double a, int b); // a ^ b +long double __powitf2(long double a, int b); // ppc only, a ^ b // Complex arithmetic diff --git a/compiler-rt/lib/builtins/absvsi2.c b/compiler-rt/lib/builtins/absvsi2.c index 44ada169e7e6..9d5de7e8a3f2 100644 --- a/compiler-rt/lib/builtins/absvsi2.c +++ b/compiler-rt/lib/builtins/absvsi2.c @@ -18,7 +18,7 @@ COMPILER_RT_ABI si_int __absvsi2(si_int a) { const int N = (int)(sizeof(si_int) * CHAR_BIT); - if (a == (1 << (N - 1))) + if (a == ((si_int)1 << (N - 1))) compilerrt_abort(); const si_int t = a >> (N - 1); return (a ^ t) - t; diff --git a/compiler-rt/lib/builtins/ashldi3.c b/compiler-rt/lib/builtins/ashldi3.c index 7c81057a2284..04f22228f11d 100644 --- a/compiler-rt/lib/builtins/ashldi3.c +++ b/compiler-rt/lib/builtins/ashldi3.c @@ -16,7 +16,7 @@ // Precondition: 0 <= b < 
bits_in_dword -COMPILER_RT_ABI di_int __ashldi3(di_int a, si_int b) { +COMPILER_RT_ABI di_int __ashldi3(di_int a, int b) { const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT); dwords input; dwords result; diff --git a/compiler-rt/lib/builtins/ashrdi3.c b/compiler-rt/lib/builtins/ashrdi3.c index b9939132205c..934a5c47fd69 100644 --- a/compiler-rt/lib/builtins/ashrdi3.c +++ b/compiler-rt/lib/builtins/ashrdi3.c @@ -16,7 +16,7 @@ // Precondition: 0 <= b < bits_in_dword -COMPILER_RT_ABI di_int __ashrdi3(di_int a, si_int b) { +COMPILER_RT_ABI di_int __ashrdi3(di_int a, int b) { const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT); dwords input; dwords result; diff --git a/compiler-rt/lib/builtins/atomic.c b/compiler-rt/lib/builtins/atomic.c index 32b3a0f9ad23..8634a72e77d1 100644 --- a/compiler-rt/lib/builtins/atomic.c +++ b/compiler-rt/lib/builtins/atomic.c @@ -23,6 +23,7 @@ // //===----------------------------------------------------------------------===// +#include <stdbool.h> #include <stdint.h> #include <string.h> @@ -293,8 +294,8 @@ OPTIMISED_CASES #undef OPTIMISED_CASE #define OPTIMISED_CASE(n, lockfree, type) \ - int __atomic_compare_exchange_##n(type *ptr, type *expected, type desired, \ - int success, int failure) { \ + bool __atomic_compare_exchange_##n(type *ptr, type *expected, type desired, \ + int success, int failure) { \ if (lockfree) \ return __c11_atomic_compare_exchange_strong( \ (_Atomic(type) *)ptr, expected, desired, success, failure); \ @@ -303,11 +304,11 @@ OPTIMISED_CASES if (*ptr == *expected) { \ *ptr = desired; \ unlock(l); \ - return 1; \ + return true; \ } \ *expected = *ptr; \ unlock(l); \ - return 0; \ + return false; \ } OPTIMISED_CASES #undef OPTIMISED_CASE diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index e83e21254e85..72e02e613de5 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -147,6 +147,16 @@ void __clear_cache(void 
*start, void *end) { for (uintptr_t dword = start_dword; dword < end_dword; dword += dword_size) __asm__ volatile("flush %0" : : "r"(dword)); +#elif defined(__riscv) && defined(__linux__) +#define __NR_riscv_flush_icache (244 + 15) + register void *start_reg __asm("a0") = start; + const register void *end_reg __asm("a1") = end; + const register long flags __asm("a2") = 0; + const register long syscall_nr __asm("a7") = __NR_riscv_flush_icache; + __asm __volatile("ecall" + : "=r"(start_reg) + : "r"(start_reg), "r"(end_reg), "r"(flags), "r"(syscall_nr)); + assert(start_reg == 0 && "Cache flush syscall failed."); #else #if __APPLE__ // On Darwin, sys_icache_invalidate() provides this functionality diff --git a/compiler-rt/lib/builtins/clzdi2.c b/compiler-rt/lib/builtins/clzdi2.c index a0bacb2ae39e..12c17982a5cb 100644 --- a/compiler-rt/lib/builtins/clzdi2.c +++ b/compiler-rt/lib/builtins/clzdi2.c @@ -21,15 +21,15 @@ // ctz instruction, gcc resolves __builtin_clz to __clzdi2 rather than // __clzsi2, leading to infinite recursion. 
#define __builtin_clz(a) __clzsi2(a) -extern si_int __clzsi2(si_int); +extern int __clzsi2(si_int); #endif // Precondition: a != 0 -COMPILER_RT_ABI si_int __clzdi2(di_int a) { +COMPILER_RT_ABI int __clzdi2(di_int a) { dwords x; x.all = a; const si_int f = -(x.s.high == 0); - return __builtin_clz((x.s.high & ~f) | (x.s.low & f)) + + return clzsi((x.s.high & ~f) | (x.s.low & f)) + (f & ((si_int)(sizeof(si_int) * CHAR_BIT))); } diff --git a/compiler-rt/lib/builtins/clzsi2.c b/compiler-rt/lib/builtins/clzsi2.c index 3f9f27f41331..d75f56d937b0 100644 --- a/compiler-rt/lib/builtins/clzsi2.c +++ b/compiler-rt/lib/builtins/clzsi2.c @@ -16,7 +16,7 @@ // Precondition: a != 0 -COMPILER_RT_ABI si_int __clzsi2(si_int a) { +COMPILER_RT_ABI int __clzsi2(si_int a) { su_int x = (su_int)a; si_int t = ((x & 0xFFFF0000) == 0) << 4; // if (x is small) t = 16 else 0 x >>= 16 - t; // x = [0 - 0xFFFF] diff --git a/compiler-rt/lib/builtins/clzti2.c b/compiler-rt/lib/builtins/clzti2.c index 0c787104caa2..25d30119f271 100644 --- a/compiler-rt/lib/builtins/clzti2.c +++ b/compiler-rt/lib/builtins/clzti2.c @@ -18,7 +18,7 @@ // Precondition: a != 0 -COMPILER_RT_ABI si_int __clzti2(ti_int a) { +COMPILER_RT_ABI int __clzti2(ti_int a) { twords x; x.all = a; const di_int f = -(x.s.high == 0); diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c index fb619037d398..8346bb62dcfb 100644 --- a/compiler-rt/lib/builtins/cpu_model.c +++ b/compiler-rt/lib/builtins/cpu_model.c @@ -82,6 +82,8 @@ enum ProcessorSubtypes { INTEL_COREI7_ICELAKE_SERVER, AMDFAM17H_ZNVER2, INTEL_COREI7_CASCADELAKE, + INTEL_COREI7_TIGERLAKE, + INTEL_COREI7_COOPERLAKE, CPU_SUBTYPE_MAX }; @@ -122,7 +124,9 @@ enum ProcessorFeatures { FEATURE_VPCLMULQDQ, FEATURE_AVX512VNNI, FEATURE_AVX512BITALG, - FEATURE_AVX512BF16 + FEATURE_AVX512BF16, + FEATURE_AVX512VP2INTERSECT, + CPU_FEATURE_MAX }; // The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max). 
@@ -268,13 +272,17 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family, } } -static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, - unsigned Brand_id, - unsigned Features, - unsigned Features2, unsigned *Type, - unsigned *Subtype) { - if (Brand_id != 0) - return; +static const char * +getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, + const unsigned *Features, + unsigned *Type, unsigned *Subtype) { +#define testFeature(F) \ + ((Features[(F) / 32] & (1U << ((F) % 32))) != 0) // nonzero iff bit F of the Features word array is set; mirrors setFeature + + // We select CPU strings to match the code in Host.cpp, but we don't use them + // in compiler-rt. + const char *CPU = 0; + switch (Family) { case 6: switch (Model) { @@ -285,13 +293,17 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, // 0Fh. All processors are manufactured using the 65 nm process. case 0x16: // Intel Celeron processor model 16h. All processors are // manufactured using the 65 nm process + CPU = "core2"; + *Type = INTEL_CORE2; + break; case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model // 17h. All processors are manufactured using the 45 nm process. // // 45nm: Penryn , Wolfdale, Yorkfield (XE) case 0x1d: // Intel Xeon processor MP. All processors are manufactured using // the 45 nm process. - *Type = INTEL_CORE2; // "penryn" + CPU = "penryn"; + *Type = INTEL_CORE2; break; case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All // processors are manufactured using the 45 nm process. @@ -299,25 +311,29 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, // As found in a Summer 2010 model iMac. case 0x1f: case 0x2e: // Nehalem EX - *Type = INTEL_COREI7; // "nehalem" + CPU = "nehalem"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_NEHALEM; break; case 0x25: // Intel Core i7, laptop version. case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All // processors are manufactured using the 32 nm process. 
case 0x2f: // Westmere EX - *Type = INTEL_COREI7; // "westmere" + CPU = "westmere"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_WESTMERE; break; case 0x2a: // Intel Core i7 processor. All processors are manufactured // using the 32 nm process. case 0x2d: - *Type = INTEL_COREI7; //"sandybridge" + CPU = "sandybridge"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_SANDYBRIDGE; break; case 0x3a: case 0x3e: // Ivy Bridge EP - *Type = INTEL_COREI7; // "ivybridge" + CPU = "ivybridge"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_IVYBRIDGE; break; @@ -326,7 +342,8 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 0x3f: case 0x45: case 0x46: - *Type = INTEL_COREI7; // "haswell" + CPU = "haswell"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_HASWELL; break; @@ -335,7 +352,8 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 0x47: case 0x4f: case 0x56: - *Type = INTEL_COREI7; // "broadwell" + CPU = "broadwell"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_BROADWELL; break; @@ -344,37 +362,49 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 0x5e: // Skylake desktop case 0x8e: // Kaby Lake mobile case 0x9e: // Kaby Lake desktop - *Type = INTEL_COREI7; // "skylake" + case 0xa5: // Comet Lake-H/S + case 0xa6: // Comet Lake-U + CPU = "skylake"; + *Type = INTEL_COREI7; *Subtype = INTEL_COREI7_SKYLAKE; break; // Skylake Xeon: case 0x55: *Type = INTEL_COREI7; - if (Features2 & (1 << (FEATURE_AVX512VNNI - 32))) - *Subtype = INTEL_COREI7_CASCADELAKE; // "cascadelake" - else - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512" + if (testFeature(FEATURE_AVX512BF16)) { + CPU = "cooperlake"; + *Subtype = INTEL_COREI7_COOPERLAKE; + } else if (testFeature(FEATURE_AVX512VNNI)) { + CPU = "cascadelake"; + *Subtype = INTEL_COREI7_CASCADELAKE; + } else { + CPU = "skylake-avx512"; + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; + } break; // Cannonlake: case 0x66: + CPU = 
"cannonlake"; *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_CANNONLAKE; // "cannonlake" + *Subtype = INTEL_COREI7_CANNONLAKE; break; // Icelake: case 0x7d: case 0x7e: + CPU = "icelake-client"; *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_CLIENT; // "icelake-client" + *Subtype = INTEL_COREI7_ICELAKE_CLIENT; break; // Icelake Xeon: case 0x6a: case 0x6c: + CPU = "icelake-server"; *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_SERVER; // "icelake-server" + *Subtype = INTEL_COREI7_ICELAKE_SERVER; break; case 0x1c: // Most 45 nm Intel Atom processors @@ -382,8 +412,9 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 0x27: // 32 nm Atom Medfield case 0x35: // 32 nm Atom Midview case 0x36: // 32 nm Atom Midview + CPU = "bonnell"; *Type = INTEL_BONNELL; - break; // "bonnell" + break; // Atom Silvermont codes from the Intel software optimization guide. case 0x37: @@ -392,26 +423,32 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 0x5a: case 0x5d: case 0x4c: // really airmont + CPU = "silvermont"; *Type = INTEL_SILVERMONT; - break; // "silvermont" + break; // Goldmont: case 0x5c: // Apollo Lake case 0x5f: // Denverton + CPU = "goldmont"; *Type = INTEL_GOLDMONT; break; // "goldmont" case 0x7a: + CPU = "goldmont-plus"; *Type = INTEL_GOLDMONT_PLUS; break; case 0x86: + CPU = "tremont"; *Type = INTEL_TREMONT; break; case 0x57: - *Type = INTEL_KNL; // knl + CPU = "knl"; + *Type = INTEL_KNL; break; case 0x85: - *Type = INTEL_KNM; // knm + CPU = "knm"; + *Type = INTEL_KNM; break; default: // Unknown family 6 CPU. @@ -421,17 +458,22 @@ static void getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, default: break; // Unknown. } + + return CPU; } -static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, - unsigned Features, unsigned Features2, - unsigned *Type, unsigned *Subtype) { - // FIXME: this poorly matches the generated SubtargetFeatureKV table. 
There - // appears to be no way to generate the wide variety of AMD-specific targets - // from the information returned from CPUID. +static const char * +getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, + const unsigned *Features, + unsigned *Type, unsigned *Subtype) { + // We select CPU strings to match the code in Host.cpp, but we don't use them + // in compiler-rt. + const char *CPU = 0; + switch (Family) { case 16: - *Type = AMDFAM10H; // "amdfam10" + CPU = "amdfam10"; + *Type = AMDFAM10H; switch (Model) { case 2: *Subtype = AMDFAM10H_BARCELONA; @@ -445,60 +487,62 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, } break; case 20: + CPU = "btver1"; *Type = AMD_BTVER1; - break; // "btver1"; + break; case 21: + CPU = "bdver1"; *Type = AMDFAM15H; if (Model >= 0x60 && Model <= 0x7f) { + CPU = "bdver4"; *Subtype = AMDFAM15H_BDVER4; - break; // "bdver4"; 60h-7Fh: Excavator + break; // 60h-7Fh: Excavator } if (Model >= 0x30 && Model <= 0x3f) { + CPU = "bdver3"; *Subtype = AMDFAM15H_BDVER3; - break; // "bdver3"; 30h-3Fh: Steamroller + break; // 30h-3Fh: Steamroller } if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) { + CPU = "bdver2"; *Subtype = AMDFAM15H_BDVER2; - break; // "bdver2"; 02h, 10h-1Fh: Piledriver + break; // 02h, 10h-1Fh: Piledriver } if (Model <= 0x0f) { *Subtype = AMDFAM15H_BDVER1; - break; // "bdver1"; 00h-0Fh: Bulldozer + break; // 00h-0Fh: Bulldozer } break; case 22: + CPU = "btver2"; *Type = AMD_BTVER2; - break; // "btver2" + break; case 23: + CPU = "znver1"; *Type = AMDFAM17H; if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) { + CPU = "znver2"; *Subtype = AMDFAM17H_ZNVER2; - break; // "znver2"; 30h-3fh, 71h: Zen2 + break; // 30h-3fh, 71h: Zen2 } if (Model <= 0x0f) { *Subtype = AMDFAM17H_ZNVER1; - break; // "znver1"; 00h-0Fh: Zen1 + break; // 00h-0Fh: Zen1 } break; default: - break; // "generic" + break; // Unknown AMD CPU. 
} + + return CPU; } static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, - unsigned *FeaturesOut, - unsigned *Features2Out) { - unsigned Features = 0; - unsigned Features2 = 0; + unsigned *Features) { unsigned EAX, EBX; #define setFeature(F) \ - do { \ - if (F < 32) \ - Features |= 1U << (F & 0x1f); \ - else if (F < 64) \ - Features2 |= 1U << ((F - 32) & 0x1f); \ - } while (0) + Features[F / 32] |= 1U << (F % 32) if ((EDX >> 15) & 1) setFeature(FEATURE_CMOV); @@ -590,6 +634,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(FEATURE_AVX5124VNNIW); if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) setFeature(FEATURE_AVX5124FMAPS); + if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VP2INTERSECT); bool HasLeaf7Subleaf1 = MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); @@ -607,9 +653,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(FEATURE_XOP); if (HasExtLeaf1 && ((ECX >> 16) & 1)) setFeature(FEATURE_FMA4); - - *FeaturesOut = Features; - *Features2Out = Features2; #undef setFeature } @@ -641,7 +684,7 @@ struct __processor_model { #ifndef _WIN32 __attribute__((visibility("hidden"))) #endif -unsigned int __cpu_features2; +unsigned int __cpu_features2 = 0; // A constructor function that is sets __cpu_model and __cpu_features2 with // the right values. This needs to run only once. This constructor is @@ -653,40 +696,38 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { unsigned EAX, EBX, ECX, EDX; unsigned MaxLeaf = 5; unsigned Vendor; - unsigned Model, Family, Brand_id; - unsigned Features = 0; - unsigned Features2 = 0; + unsigned Model, Family; + unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; // This function needs to run just once. if (__cpu_model.__cpu_vendor) return 0; - if (!isCpuIdSupported()) - return -1; - - // Assume cpuid insn present. Run in level 0 to get vendor id. 
- if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { + if (!isCpuIdSupported() || + getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { __cpu_model.__cpu_vendor = VENDOR_OTHER; return -1; } + getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX); detectX86FamilyModel(EAX, &Family, &Model); - Brand_id = EBX & 0xff; // Find available features. - getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2); - __cpu_model.__cpu_features[0] = Features; - __cpu_features2 = Features2; + getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]); + + assert((sizeof(Features)/sizeof(Features[0])) == 2); + __cpu_model.__cpu_features[0] = Features[0]; + __cpu_features2 = Features[1]; if (Vendor == SIG_INTEL) { // Get CPU type. - getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features, - Features2, &(__cpu_model.__cpu_type), + getIntelProcessorTypeAndSubtype(Family, Model, &Features[0], + &(__cpu_model.__cpu_type), &(__cpu_model.__cpu_subtype)); __cpu_model.__cpu_vendor = VENDOR_INTEL; } else if (Vendor == SIG_AMD) { // Get CPU type. - getAMDProcessorTypeAndSubtype(Family, Model, Features, Features2, + getAMDProcessorTypeAndSubtype(Family, Model, &Features[0], &(__cpu_model.__cpu_type), &(__cpu_model.__cpu_subtype)); __cpu_model.__cpu_vendor = VENDOR_AMD; diff --git a/compiler-rt/lib/builtins/ctzdi2.c b/compiler-rt/lib/builtins/ctzdi2.c index 9384aa6055a1..26c908d876ac 100644 --- a/compiler-rt/lib/builtins/ctzdi2.c +++ b/compiler-rt/lib/builtins/ctzdi2.c @@ -21,15 +21,15 @@ // ctz instruction, gcc resolves __builtin_ctz to __ctzdi2 rather than // __ctzsi2, leading to infinite recursion. 
#define __builtin_ctz(a) __ctzsi2(a) -extern si_int __ctzsi2(si_int); +extern int __ctzsi2(si_int); #endif // Precondition: a != 0 -COMPILER_RT_ABI si_int __ctzdi2(di_int a) { +COMPILER_RT_ABI int __ctzdi2(di_int a) { dwords x; x.all = a; const si_int f = -(x.s.low == 0); - return __builtin_ctz((x.s.high & f) | (x.s.low & ~f)) + + return ctzsi((x.s.high & f) | (x.s.low & ~f)) + (f & ((si_int)(sizeof(si_int) * CHAR_BIT))); } diff --git a/compiler-rt/lib/builtins/ctzsi2.c b/compiler-rt/lib/builtins/ctzsi2.c index 09c6863b74e3..ed95c6057933 100644 --- a/compiler-rt/lib/builtins/ctzsi2.c +++ b/compiler-rt/lib/builtins/ctzsi2.c @@ -16,7 +16,7 @@ // Precondition: a != 0 -COMPILER_RT_ABI si_int __ctzsi2(si_int a) { +COMPILER_RT_ABI int __ctzsi2(si_int a) { su_int x = (su_int)a; si_int t = ((x & 0x0000FFFF) == 0) << 4; // if (x has no small bits) t = 16 else 0 diff --git a/compiler-rt/lib/builtins/ctzti2.c b/compiler-rt/lib/builtins/ctzti2.c index 2a1312c8437d..fb136d0de1c0 100644 --- a/compiler-rt/lib/builtins/ctzti2.c +++ b/compiler-rt/lib/builtins/ctzti2.c @@ -18,7 +18,7 @@ // Precondition: a != 0 -COMPILER_RT_ABI si_int __ctzti2(ti_int a) { +COMPILER_RT_ABI int __ctzti2(ti_int a) { twords x; x.all = a; const di_int f = -(x.s.low == 0); diff --git a/compiler-rt/lib/builtins/ffsdi2.c b/compiler-rt/lib/builtins/ffsdi2.c index 9c1a24260956..beae5530430e 100644 --- a/compiler-rt/lib/builtins/ffsdi2.c +++ b/compiler-rt/lib/builtins/ffsdi2.c @@ -15,13 +15,13 @@ // Returns: the index of the least significant 1-bit in a, or // the value zero if a is zero. The least significant bit is index one. 
-COMPILER_RT_ABI si_int __ffsdi2(di_int a) { +COMPILER_RT_ABI int __ffsdi2(di_int a) { dwords x; x.all = a; if (x.s.low == 0) { if (x.s.high == 0) return 0; - return __builtin_ctz(x.s.high) + (1 + sizeof(si_int) * CHAR_BIT); + return ctzsi(x.s.high) + (1 + sizeof(si_int) * CHAR_BIT); } - return __builtin_ctz(x.s.low) + 1; + return ctzsi(x.s.low) + 1; } diff --git a/compiler-rt/lib/builtins/ffssi2.c b/compiler-rt/lib/builtins/ffssi2.c index cba1f72fdc61..ddb52927f8db 100644 --- a/compiler-rt/lib/builtins/ffssi2.c +++ b/compiler-rt/lib/builtins/ffssi2.c @@ -15,9 +15,9 @@ // Returns: the index of the least significant 1-bit in a, or // the value zero if a is zero. The least significant bit is index one. -COMPILER_RT_ABI si_int __ffssi2(si_int a) { +COMPILER_RT_ABI int __ffssi2(si_int a) { if (a == 0) { return 0; } - return __builtin_ctz(a) + 1; + return ctzsi(a) + 1; } diff --git a/compiler-rt/lib/builtins/ffsti2.c b/compiler-rt/lib/builtins/ffsti2.c index a2d7ce08ada1..a2177d148a09 100644 --- a/compiler-rt/lib/builtins/ffsti2.c +++ b/compiler-rt/lib/builtins/ffsti2.c @@ -17,7 +17,7 @@ // Returns: the index of the least significant 1-bit in a, or // the value zero if a is zero. The least significant bit is index one. 
-COMPILER_RT_ABI si_int __ffsti2(ti_int a) { +COMPILER_RT_ABI int __ffsti2(ti_int a) { twords x; x.all = a; if (x.s.low == 0) { diff --git a/compiler-rt/lib/builtins/floatdidf.c b/compiler-rt/lib/builtins/floatdidf.c index 8f887314b9e1..b2d8f2b44b6d 100644 --- a/compiler-rt/lib/builtins/floatdidf.c +++ b/compiler-rt/lib/builtins/floatdidf.c @@ -87,7 +87,7 @@ COMPILER_RT_ABI double __floatdidf(di_int a) { } double_bits fb; fb.u.s.high = ((su_int)s & 0x80000000) | // sign - ((e + 1023) << 20) | // exponent + ((su_int)(e + 1023) << 20) | // exponent ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high fb.u.s.low = (su_int)a; // mantissa-low return fb.f; diff --git a/compiler-rt/lib/builtins/floatdisf.c b/compiler-rt/lib/builtins/floatdisf.c index cd9e0a3b78a5..faaa1bcb3c8e 100644 --- a/compiler-rt/lib/builtins/floatdisf.c +++ b/compiler-rt/lib/builtins/floatdisf.c @@ -26,7 +26,7 @@ COMPILER_RT_ABI float __floatdisf(di_int a) { const di_int s = a >> (N - 1); a = (a ^ s) - s; int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // exponent + si_int e = sd - 1; // exponent if (sd > FLT_MANT_DIG) { // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR diff --git a/compiler-rt/lib/builtins/floatsidf.c b/compiler-rt/lib/builtins/floatsidf.c index 2c66167d794d..28cf32f6388b 100644 --- a/compiler-rt/lib/builtins/floatsidf.c +++ b/compiler-rt/lib/builtins/floatsidf.c @@ -17,7 +17,7 @@ #include "int_lib.h" -COMPILER_RT_ABI fp_t __floatsidf(int a) { +COMPILER_RT_ABI fp_t __floatsidf(si_int a) { const int aWidth = sizeof a * CHAR_BIT; @@ -33,14 +33,14 @@ COMPILER_RT_ABI fp_t __floatsidf(int a) { } // Exponent of (fp_t)a is the width of abs(a). - const int exponent = (aWidth - 1) - __builtin_clz(a); + const int exponent = (aWidth - 1) - clzsi(a); rep_t result; // Shift a into the significand field and clear the implicit bit. 
Extra // cast to unsigned int is necessary to get the correct behavior for // the input INT_MIN. const int shift = significandBits - exponent; - result = (rep_t)(unsigned int)a << shift ^ implicitBit; + result = (rep_t)(su_int)a << shift ^ implicitBit; // Insert the exponent result += (rep_t)(exponent + exponentBias) << significandBits; @@ -50,7 +50,7 @@ COMPILER_RT_ABI fp_t __floatsidf(int a) { #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) -AEABI_RTABI fp_t __aeabi_i2d(int a) { return __floatsidf(a); } +AEABI_RTABI fp_t __aeabi_i2d(si_int a) { return __floatsidf(a); } #else COMPILER_RT_ALIAS(__floatsidf, __aeabi_i2d) #endif diff --git a/compiler-rt/lib/builtins/floatundidf.c b/compiler-rt/lib/builtins/floatundidf.c index e7c6aae5ce38..4c445b118080 100644 --- a/compiler-rt/lib/builtins/floatundidf.c +++ b/compiler-rt/lib/builtins/floatundidf.c @@ -90,7 +90,7 @@ COMPILER_RT_ABI double __floatundidf(du_int a) { // a is now rounded to DBL_MANT_DIG bits } double_bits fb; - fb.u.s.high = ((e + 1023) << 20) | // exponent + fb.u.s.high = ((su_int)(e + 1023) << 20) | // exponent ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high fb.u.s.low = (su_int)a; // mantissa-low return fb.f; diff --git a/compiler-rt/lib/builtins/floatundisf.c b/compiler-rt/lib/builtins/floatundisf.c index 87841b761ded..00d61b0c6310 100644 --- a/compiler-rt/lib/builtins/floatundisf.c +++ b/compiler-rt/lib/builtins/floatundisf.c @@ -24,7 +24,7 @@ COMPILER_RT_ABI float __floatundisf(du_int a) { return 0.0F; const unsigned N = sizeof(du_int) * CHAR_BIT; int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // 8 exponent + si_int e = sd - 1; // 8 exponent if (sd > FLT_MANT_DIG) { // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR diff --git a/compiler-rt/lib/builtins/floatunsidf.c b/compiler-rt/lib/builtins/floatunsidf.c index 2c01c3041434..9b3e5fea0e45 100644 
--- a/compiler-rt/lib/builtins/floatunsidf.c +++ b/compiler-rt/lib/builtins/floatunsidf.c @@ -17,7 +17,7 @@ #include "int_lib.h" -COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) { +COMPILER_RT_ABI fp_t __floatunsidf(su_int a) { const int aWidth = sizeof a * CHAR_BIT; @@ -26,7 +26,7 @@ COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) { return fromRep(0); // Exponent of (fp_t)a is the width of abs(a). - const int exponent = (aWidth - 1) - __builtin_clz(a); + const int exponent = (aWidth - 1) - clzsi(a); rep_t result; // Shift a into the significand field and clear the implicit bit. @@ -40,7 +40,7 @@ COMPILER_RT_ABI fp_t __floatunsidf(unsigned int a) { #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) -AEABI_RTABI fp_t __aeabi_ui2d(unsigned int a) { return __floatunsidf(a); } +AEABI_RTABI fp_t __aeabi_ui2d(su_int a) { return __floatunsidf(a); } #else COMPILER_RT_ALIAS(__floatunsidf, __aeabi_ui2d) #endif diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h index d2083c426722..fb512672e35e 100644 --- a/compiler-rt/lib/builtins/fp_extend.h +++ b/compiler-rt/lib/builtins/fp_extend.h @@ -21,7 +21,7 @@ typedef float src_t; typedef uint32_t src_rep_t; #define SRC_REP_C UINT32_C static const int srcSigBits = 23; -#define src_rep_t_clz __builtin_clz +#define src_rep_t_clz clzsi #elif defined SRC_DOUBLE typedef double src_t; diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index e2a906681c46..bd1f180f499e 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -46,7 +46,7 @@ typedef float fp_t; #define REP_C UINT32_C #define significandBits 23 -static __inline int rep_clz(rep_t a) { return __builtin_clz(a); } +static __inline int rep_clz(rep_t a) { return clzsi(a); } // 32x32 --> 64 bit multiply static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { @@ -69,9 +69,9 @@ static __inline int rep_clz(rep_t a) { return __builtin_clzl(a); #else 
if (a & REP_C(0xffffffff00000000)) - return __builtin_clz(a >> 32); + return clzsi(a >> 32); else - return 32 + __builtin_clz(a & REP_C(0xffffffff)); + return 32 + clzsi(a & REP_C(0xffffffff)); #endif } diff --git a/compiler-rt/lib/builtins/fp_mode.h b/compiler-rt/lib/builtins/fp_mode.h index 51bec0431a40..4ba682c384f2 100644 --- a/compiler-rt/lib/builtins/fp_mode.h +++ b/compiler-rt/lib/builtins/fp_mode.h @@ -23,7 +23,7 @@ typedef enum { FE_TOWARDZERO } FE_ROUND_MODE; -FE_ROUND_MODE __fe_getround(); -int __fe_raise_inexact(); +FE_ROUND_MODE __fe_getround(void); +int __fe_raise_inexact(void); #endif // FP_MODE_H diff --git a/compiler-rt/lib/builtins/hexagon/dffma.S b/compiler-rt/lib/builtins/hexagon/dffma.S index c201d3d8be5e..843e88b3cab8 100644 --- a/compiler-rt/lib/builtins/hexagon/dffma.S +++ b/compiler-rt/lib/builtins/hexagon/dffma.S @@ -104,13 +104,11 @@ .type __hexagon_fmadf4,@function .global __hexagon_fmadf5 .type __hexagon_fmadf5,@function - .global fma - .type fma,@function Q6_ALIAS(fmadf5) .p2align 5 __hexagon_fmadf4: __hexagon_fmadf5: -fma: +.Lfma_begin: { P_TMP = dfclass(A,#2) P_TMP = dfclass(B,#2) @@ -561,7 +559,7 @@ fma: B = insert(BTMP,#63,#0) AH -= asl(TMP,#HI_MANTBITS) } - jump fma + jump .Lfma_begin .Lfma_ab_tiny: ATMP = combine(##0x00100000,#0) @@ -569,7 +567,7 @@ fma: A = insert(ATMP,#63,#0) B = insert(ATMP,#63,#0) } - jump fma + jump .Lfma_begin .Lab_inf: { diff --git a/compiler-rt/lib/builtins/hexagon/fabs_opt.S b/compiler-rt/lib/builtins/hexagon/fabs_opt.S deleted file mode 100644 index 6bf9b84b3d20..000000000000 --- a/compiler-rt/lib/builtins/hexagon/fabs_opt.S +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------Hexagon builtin routine ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -.macro FUNCTION_BEGIN name -.text -.p2align 5 -.globl \name -.type \name, @function -\name: -.endm - -.macro FUNCTION_END name -.size \name, . - \name -.endm - -FUNCTION_BEGIN fabs - { - r1 = clrbit(r1, #31) - jumpr r31 - } -FUNCTION_END fabs - -FUNCTION_BEGIN fabsf - { - r0 = clrbit(r0, #31) - jumpr r31 - } -FUNCTION_END fabsf - - .globl fabsl - .set fabsl, fabs diff --git a/compiler-rt/lib/builtins/hexagon/fma_opt.S b/compiler-rt/lib/builtins/hexagon/fma_opt.S deleted file mode 100644 index 7f566adffd6a..000000000000 --- a/compiler-rt/lib/builtins/hexagon/fma_opt.S +++ /dev/null @@ -1,30 +0,0 @@ -//===----------------------Hexagon builtin routine ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -.macro FUNCTION_BEGIN name -.text -.p2align 5 -.globl \name -.type \name, @function -\name: -.endm - -.macro FUNCTION_END name -.size \name, . - \name -.endm - -FUNCTION_BEGIN fmaf - r2 += sfmpy(r0, r1) - { - r0 = r2 - jumpr r31 - } -FUNCTION_END fmaf - - .globl fmal - .set fmal, fma diff --git a/compiler-rt/lib/builtins/hexagon/fmax_opt.S b/compiler-rt/lib/builtins/hexagon/fmax_opt.S deleted file mode 100644 index 81d711dff8d2..000000000000 --- a/compiler-rt/lib/builtins/hexagon/fmax_opt.S +++ /dev/null @@ -1,29 +0,0 @@ -//===----------------------Hexagon builtin routine ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -.macro FUNCTION_BEGIN name -.text -.p2align 5 -.globl \name -.type \name, @function -\name: -.endm - -.macro FUNCTION_END name -.size \name, . - \name -.endm - -FUNCTION_BEGIN fmaxf - { - r0 = sfmax(r0, r1) - jumpr r31 - } -FUNCTION_END fmaxf - - .globl fmaxl - .set fmaxl, fmax diff --git a/compiler-rt/lib/builtins/hexagon/fmin_opt.S b/compiler-rt/lib/builtins/hexagon/fmin_opt.S deleted file mode 100644 index d043f1d7a698..000000000000 --- a/compiler-rt/lib/builtins/hexagon/fmin_opt.S +++ /dev/null @@ -1,29 +0,0 @@ -//===----------------------Hexagon builtin routine ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -.macro FUNCTION_BEGIN name -.text -.p2align 5 -.globl \name -.type \name, @function -\name: -.endm - -.macro FUNCTION_END name -.size \name, . 
- \name -.endm - -FUNCTION_BEGIN fminf - { - r0 = sfmin(r0, r1) - jumpr r31 - } -FUNCTION_END fminf - - .globl fminl - .set fminl, fmin diff --git a/compiler-rt/lib/builtins/i386/floatdidf.S b/compiler-rt/lib/builtins/i386/floatdidf.S index ab7422c312dc..d588e770364e 100644 --- a/compiler-rt/lib/builtins/i386/floatdidf.S +++ b/compiler-rt/lib/builtins/i386/floatdidf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// double __floatundidf(du_int a); +// double __floatdidf(di_int a); #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/i386/floatdixf.S b/compiler-rt/lib/builtins/i386/floatdixf.S index df70f5f9e6e3..19dd0835a9c5 100644 --- a/compiler-rt/lib/builtins/i386/floatdixf.S +++ b/compiler-rt/lib/builtins/i386/floatdixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// float __floatdixf(di_int a); +// long double __floatdixf(di_int a); #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/int_div_impl.inc b/compiler-rt/lib/builtins/int_div_impl.inc new file mode 100644 index 000000000000..de0373889078 --- /dev/null +++ b/compiler-rt/lib/builtins/int_div_impl.inc @@ -0,0 +1,70 @@ +//===-- int_div_impl.inc - Integer division ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helpers used by __udivsi3, __umodsi3, __udivdi3, and __umoddi3. +// +//===----------------------------------------------------------------------===// + +#define clz(a) (sizeof(a) == sizeof(unsigned long long) ? __builtin_clzll(a) : clzsi(a)) + +// Adapted from Figure 3-40 of The PowerPC Compiler Writer's Guide +static __inline fixuint_t __udivXi3(fixuint_t n, fixuint_t d) { + const unsigned N = sizeof(fixuint_t) * CHAR_BIT; + // d == 0 cases are unspecified. + unsigned sr = (d ? clz(d) : N) - (n ?
clz(n) : N); + // 0 <= sr <= N - 1 or sr is very large. + if (sr > N - 1) // n < d + return 0; + if (sr == N - 1) // d == 1 + return n; + ++sr; + // 1 <= sr <= N - 1. Shifts do not trigger UB. + fixuint_t r = n >> sr; + n <<= N - sr; + fixuint_t carry = 0; + for (; sr > 0; --sr) { + r = (r << 1) | (n >> (N - 1)); + n = (n << 1) | carry; + // Branch-less version of: + // carry = 0; + // if (r >= d) r -= d, carry = 1; + const fixint_t s = (fixint_t)(d - r - 1) >> (N - 1); + carry = s & 1; + r -= d & s; + } + n = (n << 1) | carry; + return n; +} + +// Mostly identical to __udivXi3 but the return values are different. +static __inline fixuint_t __umodXi3(fixuint_t n, fixuint_t d) { + const unsigned N = sizeof(fixuint_t) * CHAR_BIT; + // d == 0 cases are unspecified. + unsigned sr = (d ? clz(d) : N) - (n ? clz(n) : N); + // 0 <= sr <= N - 1 or sr is very large. + if (sr > N - 1) // n < d + return n; + if (sr == N - 1) // d == 1 + return 0; + ++sr; + // 1 <= sr <= N - 1. Shifts do not trigger UB. 
+ fixuint_t r = n >> sr; + n <<= N - sr; + fixuint_t carry = 0; + for (; sr > 0; --sr) { + r = (r << 1) | (n >> (N - 1)); + n = (n << 1) | carry; + // Branch-less version of: + // carry = 0; + // if (r >= d) r -= d, carry = 1; + const fixint_t s = (fixint_t)(d - r - 1) >> (N - 1); + carry = s & 1; + r -= d & s; + } + return r; +} diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h index 3092f68c084a..991c4a99ea6e 100644 --- a/compiler-rt/lib/builtins/int_lib.h +++ b/compiler-rt/lib/builtins/int_lib.h @@ -48,12 +48,20 @@ #define XSTR(a) STR(a) #define SYMBOL_NAME(name) XSTR(__USER_LABEL_PREFIX__) #name -#if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__) +#if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__) || \ + defined(_AIX) #define COMPILER_RT_ALIAS(name, aliasname) \ COMPILER_RT_ABI __typeof(name) aliasname __attribute__((__alias__(#name))); #elif defined(__APPLE__) +#if defined(VISIBILITY_HIDDEN) +#define COMPILER_RT_ALIAS_VISIBILITY(name) \ + __asm__(".private_extern " SYMBOL_NAME(name)); +#else +#define COMPILER_RT_ALIAS_VISIBILITY(name) +#endif #define COMPILER_RT_ALIAS(name, aliasname) \ __asm__(".globl " SYMBOL_NAME(aliasname)); \ + COMPILER_RT_ALIAS_VISIBILITY(aliasname) \ __asm__(SYMBOL_NAME(aliasname) " = " SYMBOL_NAME(name)); \ COMPILER_RT_ABI __typeof(name) aliasname; #elif defined(_WIN32) @@ -84,8 +92,8 @@ // Include internal utility function declarations. 
#include "int_util.h" -COMPILER_RT_ABI si_int __paritysi2(si_int a); -COMPILER_RT_ABI si_int __paritydi2(di_int a); +COMPILER_RT_ABI int __paritysi2(si_int a); +COMPILER_RT_ABI int __paritydi2(di_int a); COMPILER_RT_ABI di_int __divdi3(di_int a, di_int b); COMPILER_RT_ABI si_int __divsi3(si_int a, si_int b); @@ -94,7 +102,7 @@ COMPILER_RT_ABI su_int __udivsi3(su_int n, su_int d); COMPILER_RT_ABI su_int __udivmodsi4(su_int a, su_int b, su_int *rem); COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem); #ifdef CRT_HAS_128BIT -COMPILER_RT_ABI si_int __clzti2(ti_int a); +COMPILER_RT_ABI int __clzti2(ti_int a); COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem); #endif @@ -102,14 +110,14 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem); #if defined(_MSC_VER) && !defined(__clang__) #include <intrin.h> -uint32_t __inline __builtin_ctz(uint32_t value) { +int __inline __builtin_ctz(uint32_t value) { unsigned long trailing_zero = 0; if (_BitScanForward(&trailing_zero, value)) return trailing_zero; return 32; } -uint32_t __inline __builtin_clz(uint32_t value) { +int __inline __builtin_clz(uint32_t value) { unsigned long leading_zero = 0; if (_BitScanReverse(&leading_zero, value)) return 31 - leading_zero; @@ -117,14 +125,14 @@ uint32_t __inline __builtin_clz(uint32_t value) { } #if defined(_M_ARM) || defined(_M_X64) -uint32_t __inline __builtin_clzll(uint64_t value) { +int __inline __builtin_clzll(uint64_t value) { unsigned long leading_zero = 0; if (_BitScanReverse64(&leading_zero, value)) return 63 - leading_zero; return 64; } #else -uint32_t __inline __builtin_clzll(uint64_t value) { +int __inline __builtin_clzll(uint64_t value) { if (value == 0) return 64; uint32_t msh = (uint32_t)(value >> 32); diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index f89220d54350..705355a4840d 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -22,11 
+22,20 @@ #ifdef si_int #undef si_int #endif -typedef int si_int; -typedef unsigned su_int; +typedef int32_t si_int; +typedef uint32_t su_int; +#if UINT_MAX == 0xFFFFFFFF +#define clzsi __builtin_clz +#define ctzsi __builtin_ctz +#elif ULONG_MAX == 0xFFFFFFFF +#define clzsi __builtin_clzl +#define ctzsi __builtin_ctzl +#else +#error could not determine appropriate clzsi macro for this system +#endif -typedef long long di_int; -typedef unsigned long long du_int; +typedef int64_t di_int; +typedef uint64_t du_int; typedef union { di_int all; @@ -135,9 +144,12 @@ typedef struct { // Check if the target supports 80 bit extended precision long doubles. // Notably, on x86 Windows, MSVC only provides a 64-bit long double, but GCC // still makes it 80 bits. Clang will match whatever compiler it is trying to -// be compatible with. -#if ((defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)) || \ - defined(__m68k__) || defined(__ia64__) +// be compatible with. On 32-bit x86 Android, long double is 64 bits, while on +// x86_64 Android, long double is 128 bits. 
+#if (defined(__i386__) || defined(__x86_64__)) && \ + !(defined(_MSC_VER) || defined(__ANDROID__)) +#define HAS_80_BIT_LONG_DOUBLE 1 +#elif defined(__m68k__) || defined(__ia64__) #define HAS_80_BIT_LONG_DOUBLE 1 #else #define HAS_80_BIT_LONG_DOUBLE 0 diff --git a/compiler-rt/lib/builtins/lshrdi3.c b/compiler-rt/lib/builtins/lshrdi3.c index 97e08e1e9ba0..6072152583ac 100644 --- a/compiler-rt/lib/builtins/lshrdi3.c +++ b/compiler-rt/lib/builtins/lshrdi3.c @@ -16,7 +16,7 @@ // Precondition: 0 <= b < bits_in_dword -COMPILER_RT_ABI di_int __lshrdi3(di_int a, si_int b) { +COMPILER_RT_ABI di_int __lshrdi3(di_int a, int b) { const int bits_in_word = (int)(sizeof(si_int) * CHAR_BIT); udwords input; udwords result; diff --git a/compiler-rt/lib/builtins/paritydi2.c b/compiler-rt/lib/builtins/paritydi2.c index dd9d45e63ea4..58e85f89e043 100644 --- a/compiler-rt/lib/builtins/paritydi2.c +++ b/compiler-rt/lib/builtins/paritydi2.c @@ -14,7 +14,7 @@ // Returns: 1 if number of bits is odd else returns 0 -COMPILER_RT_ABI si_int __paritydi2(di_int a) { +COMPILER_RT_ABI int __paritydi2(di_int a) { dwords x; x.all = a; return __paritysi2(x.s.high ^ x.s.low); diff --git a/compiler-rt/lib/builtins/paritysi2.c b/compiler-rt/lib/builtins/paritysi2.c index 3efa961f2f85..a4b84e080632 100644 --- a/compiler-rt/lib/builtins/paritysi2.c +++ b/compiler-rt/lib/builtins/paritysi2.c @@ -14,7 +14,7 @@ // Returns: 1 if number of bits is odd else returns 0 -COMPILER_RT_ABI si_int __paritysi2(si_int a) { +COMPILER_RT_ABI int __paritysi2(si_int a) { su_int x = (su_int)a; x ^= x >> 16; x ^= x >> 8; diff --git a/compiler-rt/lib/builtins/parityti2.c b/compiler-rt/lib/builtins/parityti2.c index f3942ba8378c..79e920d8a02d 100644 --- a/compiler-rt/lib/builtins/parityti2.c +++ b/compiler-rt/lib/builtins/parityti2.c @@ -16,7 +16,7 @@ // Returns: 1 if number of bits is odd else returns 0 -COMPILER_RT_ABI si_int __parityti2(ti_int a) { +COMPILER_RT_ABI int __parityti2(ti_int a) { twords x; x.all = a; return 
__paritydi2(x.s.high ^ x.s.low); diff --git a/compiler-rt/lib/builtins/popcountdi2.c b/compiler-rt/lib/builtins/popcountdi2.c index 9bbc39c6608a..20dd0b0239ef 100644 --- a/compiler-rt/lib/builtins/popcountdi2.c +++ b/compiler-rt/lib/builtins/popcountdi2.c @@ -14,7 +14,7 @@ // Returns: count of 1 bits -COMPILER_RT_ABI si_int __popcountdi2(di_int a) { +COMPILER_RT_ABI int __popcountdi2(di_int a) { du_int x2 = (du_int)a; x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL); // Every 2 bits holds the sum of every pair of bits (32) diff --git a/compiler-rt/lib/builtins/popcountsi2.c b/compiler-rt/lib/builtins/popcountsi2.c index 75e592a778d9..4d346c45d9ce 100644 --- a/compiler-rt/lib/builtins/popcountsi2.c +++ b/compiler-rt/lib/builtins/popcountsi2.c @@ -14,7 +14,7 @@ // Returns: count of 1 bits -COMPILER_RT_ABI si_int __popcountsi2(si_int a) { +COMPILER_RT_ABI int __popcountsi2(si_int a) { su_int x = (su_int)a; x = x - ((x >> 1) & 0x55555555); // Every 2 bits holds the sum of every pair of bits diff --git a/compiler-rt/lib/builtins/popcountti2.c b/compiler-rt/lib/builtins/popcountti2.c index 853fd722309e..79cbb2fb34c0 100644 --- a/compiler-rt/lib/builtins/popcountti2.c +++ b/compiler-rt/lib/builtins/popcountti2.c @@ -17,7 +17,7 @@ // Returns: count of 1 bits -COMPILER_RT_ABI si_int __popcountti2(ti_int a) { +COMPILER_RT_ABI int __popcountti2(ti_int a) { tu_int x3 = (tu_int)a; x3 = x3 - ((x3 >> 1) & (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL)); diff --git a/compiler-rt/lib/builtins/powidf2.c b/compiler-rt/lib/builtins/powidf2.c index 9697588484e7..81058af50829 100644 --- a/compiler-rt/lib/builtins/powidf2.c +++ b/compiler-rt/lib/builtins/powidf2.c @@ -14,7 +14,7 @@ // Returns: a ^ b -COMPILER_RT_ABI double __powidf2(double a, si_int b) { +COMPILER_RT_ABI double __powidf2(double a, int b) { const int recip = b < 0; double r = 1; while (1) { diff --git a/compiler-rt/lib/builtins/powisf2.c b/compiler-rt/lib/builtins/powisf2.c index 469402348825..d0ab26167bbd 
100644 --- a/compiler-rt/lib/builtins/powisf2.c +++ b/compiler-rt/lib/builtins/powisf2.c @@ -14,7 +14,7 @@ // Returns: a ^ b -COMPILER_RT_ABI float __powisf2(float a, si_int b) { +COMPILER_RT_ABI float __powisf2(float a, int b) { const int recip = b < 0; float r = 1; while (1) { diff --git a/compiler-rt/lib/builtins/powitf2.c b/compiler-rt/lib/builtins/powitf2.c index fcbdb4c2ee2a..8e639a03a3c4 100644 --- a/compiler-rt/lib/builtins/powitf2.c +++ b/compiler-rt/lib/builtins/powitf2.c @@ -10,13 +10,14 @@ // //===----------------------------------------------------------------------===// -#include "int_lib.h" +#define QUAD_PRECISION +#include "fp_lib.h" -#if _ARCH_PPC +#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) // Returns: a ^ b -COMPILER_RT_ABI long double __powitf2(long double a, si_int b) { +COMPILER_RT_ABI long double __powitf2(long double a, int b) { const int recip = b < 0; long double r = 1; while (1) { diff --git a/compiler-rt/lib/builtins/powixf2.c b/compiler-rt/lib/builtins/powixf2.c index b7b52095afa1..3edfe9fd7af5 100644 --- a/compiler-rt/lib/builtins/powixf2.c +++ b/compiler-rt/lib/builtins/powixf2.c @@ -16,7 +16,7 @@ // Returns: a ^ b -COMPILER_RT_ABI long double __powixf2(long double a, si_int b) { +COMPILER_RT_ABI long double __powixf2(long double a, int b) { const int recip = b < 0; long double r = 1; while (1) { diff --git a/compiler-rt/lib/builtins/riscv/int_mul_impl.inc b/compiler-rt/lib/builtins/riscv/int_mul_impl.inc new file mode 100644 index 000000000000..50951d5f4195 --- /dev/null +++ b/compiler-rt/lib/builtins/riscv/int_mul_impl.inc @@ -0,0 +1,31 @@ +//===-- int_mul_impl.inc - Integer multiplication -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helpers used by __mulsi3, __muldi3. +// +//===----------------------------------------------------------------------===// + +#if !defined(__riscv_mul) + .text + .align 2 + + .globl __mulxi3 + .type __mulxi3, @function +__mulxi3: + mv a2, a0 + mv a0, zero +.L1: + andi a3, a1, 1 + beqz a3, .L2 + add a0, a0, a2 +.L2: + srli a1, a1, 1 + slli a2, a2, 1 + bnez a1, .L1 + ret +#endif diff --git a/compiler-rt/lib/builtins/riscv/muldi3.S b/compiler-rt/lib/builtins/riscv/muldi3.S new file mode 100644 index 000000000000..9e292e8dd8b9 --- /dev/null +++ b/compiler-rt/lib/builtins/riscv/muldi3.S @@ -0,0 +1,11 @@ +//===--- muldi3.S - Integer multiplication routines -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#if __riscv_xlen == 64 +#define __mulxi3 __muldi3 +#include "int_mul_impl.inc" +#endif diff --git a/compiler-rt/lib/builtins/riscv/mulsi3.S b/compiler-rt/lib/builtins/riscv/mulsi3.S index 5464919b26b9..cfafb7a0d7b3 100644 --- a/compiler-rt/lib/builtins/riscv/mulsi3.S +++ b/compiler-rt/lib/builtins/riscv/mulsi3.S @@ -1,4 +1,4 @@ -//===--- mulsi3.S - Integer multiplication routines routines ---===// +//===--- mulsi3.S - Integer multiplication routines -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,22 +6,7 @@ // //===----------------------------------------------------------------------===// -#if !defined(__riscv_mul) && __riscv_xlen == 32 - .text - .align 2 - - .globl __mulsi3 - .type __mulsi3, @function -__mulsi3: - mv a2, a0 - mv a0, zero -.L1: - andi a3, a1, 1 - beqz a3, .L2 - add a0, a0, a2 -.L2: - srli a1, a1, 1 - slli a2, a2, 1 - bnez a1, .L1 - ret +#if __riscv_xlen == 32 +#define __mulxi3 __mulsi3 +#include "int_mul_impl.inc" #endif diff --git a/compiler-rt/lib/builtins/udivdi3.c b/compiler-rt/lib/builtins/udivdi3.c index a23139ec947f..74319cbe71c3 100644 --- a/compiler-rt/lib/builtins/udivdi3.c +++ b/compiler-rt/lib/builtins/udivdi3.c @@ -12,8 +12,12 @@ #include "int_lib.h" +typedef du_int fixuint_t; +typedef di_int fixint_t; +#include "int_div_impl.inc" + // Returns: a / b COMPILER_RT_ABI du_int __udivdi3(du_int a, du_int b) { - return __udivmoddi4(a, b, 0); + return __udivXi3(a, b); } diff --git a/compiler-rt/lib/builtins/udivmoddi4.c b/compiler-rt/lib/builtins/udivmoddi4.c index 5b297c32d790..10b41df28f84 100644 --- a/compiler-rt/lib/builtins/udivmoddi4.c +++ b/compiler-rt/lib/builtins/udivmoddi4.c @@ -87,7 +87,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) { // K K // --- // K 0 - sr = __builtin_clz(d.s.high) - __builtin_clz(n.s.high); + sr = clzsi(d.s.high) - clzsi(n.s.high); // 0 <= sr <= n_uword_bits - 2 or sr large if (sr > n_uword_bits - 2) { if (rem) @@ -120,7 +120,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) { // K X // --- // 0 K - sr = 1 + n_uword_bits + __builtin_clz(d.s.low) - __builtin_clz(n.s.high); + sr = 1 + n_uword_bits + clzsi(d.s.low) - clzsi(n.s.high); // 2 <= sr <= n_udword_bits - 1 // q.all = n.all << (n_udword_bits - sr); // r.all = n.all >> sr; @@ -145,7 +145,7 @@ COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem) { // K X // --- // K K - sr = __builtin_clz(d.s.high) - __builtin_clz(n.s.high); + sr = clzsi(d.s.high) - clzsi(n.s.high); // 0 <= sr 
<= n_uword_bits - 1 or sr large if (sr > n_uword_bits - 1) { if (rem) diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c index dd14a8b579ca..55def37c9e1f 100644 --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -14,182 +14,145 @@ #ifdef CRT_HAS_128BIT +// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. +// Remainder stored in r. +// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division +// fallback. For a correctness proof see the reference for this algorithm +// in Knuth, Volume 2, section 4.3.1, Algorithm D. +UNUSED +static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, + du_int *r) { + const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; + const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits) + du_int un1, un0; // Norm. dividend LSD's + du_int vn1, vn0; // Norm. divisor digits + du_int q1, q0; // Quotient digits + du_int un64, un21, un10; // Dividend digit pairs + du_int rhat; // A remainder + si_int s; // Shift amount for normalization + + s = __builtin_clzll(v); + if (s > 0) { + // Normalize the divisor. + v = v << s; + un64 = (u1 << s) | (u0 >> (n_udword_bits - s)); + un10 = u0 << s; // Shift dividend left + } else { + // Avoid undefined behavior of (u0 >> 64). + un64 = u1; + un10 = u0; + } + + // Break divisor up into two 32-bit digits. + vn1 = v >> (n_udword_bits / 2); + vn0 = v & 0xFFFFFFFF; + + // Break right half of dividend into two digits. + un1 = un10 >> (n_udword_bits / 2); + un0 = un10 & 0xFFFFFFFF; + + // Compute the first quotient digit, q1. + q1 = un64 / vn1; + rhat = un64 - q1 * vn1; + + // q1 has at most error 2. No more than 2 iterations. + while (q1 >= b || q1 * vn0 > b * rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + un21 = un64 * b + un1 - q1 * v; + + // Compute the second quotient digit. 
+ q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + // q0 has at most error 2. No more than 2 iterations. + while (q0 >= b || q0 * vn0 > b * rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + *r = (un21 * b + un0 - q0 * v) >> s; + return q1 * b + q0; +} + +static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, + du_int *r) { +#if defined(__x86_64__) + du_int result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*r) + : [ v ] "r"(v), "a"(u0), "d"(u1)); + return result; +#else + return udiv128by64to64default(u1, u0, v, r); +#endif +} + // Effects: if rem != 0, *rem = a % b // Returns: a / b -// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide - COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) { - const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; - utwords n; - n.all = a; - utwords d; - d.all = b; - utwords q; - utwords r; - unsigned sr; - // special cases, X is unknown, K != 0 - if (n.s.high == 0) { - if (d.s.high == 0) { - // 0 X - // --- - // 0 X - if (rem) - *rem = n.s.low % d.s.low; - return n.s.low / d.s.low; - } - // 0 X - // --- - // K X + utwords dividend; + dividend.all = a; + utwords divisor; + divisor.all = b; + utwords quotient; + utwords remainder; + if (divisor.all > dividend.all) { if (rem) - *rem = n.s.low; + *rem = dividend.all; return 0; } - // n.s.high != 0 - if (d.s.low == 0) { - if (d.s.high == 0) { - // K X - // --- - // 0 0 - if (rem) - *rem = n.s.high % d.s.low; - return n.s.high / d.s.low; - } - // d.s.high != 0 - if (n.s.low == 0) { - // K 0 - // --- - // K 0 - if (rem) { - r.s.high = n.s.high % d.s.high; - r.s.low = 0; - *rem = r.all; - } - return n.s.high / d.s.high; - } - // K K - // --- - // K 0 - if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ { - if (rem) { - r.s.low = n.s.low; - r.s.high = n.s.high & (d.s.high - 1); - *rem = r.all; - } - return n.s.high >> 
__builtin_ctzll(d.s.high); - } - // K K - // --- - // K 0 - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 2 or sr large - if (sr > n_udword_bits - 2) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - // r.all = n.all >> sr; - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* d.s.low != 0 */ { - if (d.s.high == 0) { - // K X - // --- - // 0 K - if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ { - if (rem) - *rem = n.s.low & (d.s.low - 1); - if (d.s.low == 1) - return n.all; - sr = __builtin_ctzll(d.s.low); - q.s.high = n.s.high >> sr; - q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - return q.all; - } - // K X - // --- - // 0 K - sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) - - __builtin_clzll(n.s.high); - // 2 <= sr <= n_utword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - if (sr == n_udword_bits) { - q.s.low = 0; - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ { - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ { - q.s.low = n.s.low << (n_utword_bits - sr); - q.s.high = (n.s.high << (n_utword_bits - sr)) | - (n.s.low >> (sr - n_udword_bits)); - r.s.high = 0; - r.s.low = n.s.high >> (sr - n_udword_bits); - } + // When the divisor fits in 64 bits, we can use an optimized path. + if (divisor.s.high == 0) { + remainder.s.high = 0; + if (dividend.s.high < divisor.s.low) { + // The result fits in 64 bits. 
+ quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); + quotient.s.high = 0; } else { - // K X - // --- - // K K - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 1 or sr large - if (sr > n_udword_bits - 1) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - q.s.low = 0; - if (sr == n_udword_bits) { - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else { - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - q.s.high = n.s.low << (n_udword_bits - sr); - } + // First, divide with the high part to get the remainder in dividend.s.high. + // After that dividend.s.high < divisor.s.low. + quotient.s.high = dividend.s.high / divisor.s.low; + dividend.s.high = dividend.s.high % divisor.s.low; + quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); } + if (rem) + *rem = remainder.all; + return quotient.all; } - // Not a special case - // q and r are initialized with: - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - // 1 <= sr <= n_utword_bits - 1 - su_int carry = 0; - for (; sr > 0; --sr) { - // r:q = ((r:q) << 1) | carry - r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1)); - r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1)); - q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1)); - q.s.low = (q.s.low << 1) | carry; - // carry = 0; - // if (r.all >= d.all) + // 0 <= shift <= 63. + si_int shift = + __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high); + divisor.all <<= shift; + quotient.s.high = 0; + quotient.s.low = 0; + for (; shift >= 0; --shift) { + quotient.s.low <<= 1; + // Branch free version of. 
+ // if (dividend.all >= divisor.all) // { - r.all -= d.all; - carry = 1; + // dividend.all -= divisor.all; + // quotient.s.low |= 1; // } - const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1); - carry = s & 1; - r.all -= d.all & s; + const ti_int s = + (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1); + quotient.s.low |= s & 1; + dividend.all -= divisor.all & s; + divisor.all >>= 1; } - q.all = (q.all << 1) | carry; if (rem) - *rem = r.all; - return q.all; + *rem = dividend.all; + return quotient.all; } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/udivsi3.c b/compiler-rt/lib/builtins/udivsi3.c index 18cc96c1b2e0..3894e1597552 100644 --- a/compiler-rt/lib/builtins/udivsi3.c +++ b/compiler-rt/lib/builtins/udivsi3.c @@ -12,49 +12,14 @@ #include "int_lib.h" -// Returns: a / b +typedef su_int fixuint_t; +typedef si_int fixint_t; +#include "int_div_impl.inc" -// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide +// Returns: a / b -// This function should not call __divsi3! -COMPILER_RT_ABI su_int __udivsi3(su_int n, su_int d) { - const unsigned n_uword_bits = sizeof(su_int) * CHAR_BIT; - su_int q; - su_int r; - unsigned sr; - // special cases - if (d == 0) - return 0; // ?!
- if (n == 0) - return 0; - sr = __builtin_clz(d) - __builtin_clz(n); - // 0 <= sr <= n_uword_bits - 1 or sr large - if (sr > n_uword_bits - 1) // d > r - return 0; - if (sr == n_uword_bits - 1) // d == 1 - return n; - ++sr; - // 1 <= sr <= n_uword_bits - 1 - // Not a special case - q = n << (n_uword_bits - sr); - r = n >> sr; - su_int carry = 0; - for (; sr > 0; --sr) { - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (n_uword_bits - 1)); - q = (q << 1) | carry; - // carry = 0; - // if (r.all >= d.all) - // { - // r.all -= d.all; - // carry = 1; - // } - const si_int s = (si_int)(d - r - 1) >> (n_uword_bits - 1); - carry = s & 1; - r -= d & s; - } - q = (q << 1) | carry; - return q; +COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) { + return __udivXi3(a, b); } #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/umoddi3.c b/compiler-rt/lib/builtins/umoddi3.c index 965cf8fc01bd..e672da96ef62 100644 --- a/compiler-rt/lib/builtins/umoddi3.c +++ b/compiler-rt/lib/builtins/umoddi3.c @@ -12,10 +12,12 @@ #include "int_lib.h" +typedef du_int fixuint_t; +typedef di_int fixint_t; +#include "int_div_impl.inc" + // Returns: a % b COMPILER_RT_ABI du_int __umoddi3(du_int a, du_int b) { - du_int r; - __udivmoddi4(a, b, &r); - return r; + return __umodXi3(a, b); } diff --git a/compiler-rt/lib/builtins/umodsi3.c b/compiler-rt/lib/builtins/umodsi3.c index ce9abcd94ef7..5383aea656a9 100644 --- a/compiler-rt/lib/builtins/umodsi3.c +++ b/compiler-rt/lib/builtins/umodsi3.c @@ -12,8 +12,12 @@ #include "int_lib.h" +typedef su_int fixuint_t; +typedef si_int fixint_t; +#include "int_div_impl.inc" + // Returns: a % b COMPILER_RT_ABI su_int __umodsi3(su_int a, su_int b) { - return a - __udivsi3(a, b) * b; + return __umodXi3(a, b); } diff --git a/compiler-rt/lib/builtins/ve/grow_stack.S b/compiler-rt/lib/builtins/ve/grow_stack.S new file mode 100644 index 000000000000..f403798495af --- /dev/null +++ b/compiler-rt/lib/builtins/ve/grow_stack.S @@ -0,0 +1,31 @@ +// Part of 
the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../assembly.h" + +// grow_stack routine +// This routine is VE specific +// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf + +// destroy %s62 and %s63 only + +#ifdef __ve__ + +.text +.p2align 4 +DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack) + subu.l %sp, %sp, %s0 # sp -= alloca size + and %sp, -16, %sp # align sp + brge.l.t %sp, %sl, 1f + ld %s63, 0x18(,%tp) # load param area + lea %s62, 0x13b # syscall # of grow + shm.l %s62, 0x0(%s63) # stored at addr:0 + shm.l %sl, 0x8(%s63) # old limit at addr:8 + shm.l %sp, 0x10(%s63) # new limit at addr:16 + monc +1: + b.l (,%lr) +END_COMPILERRT_FUNCTION(__ve_grow_stack) + +#endif // __ve__ diff --git a/compiler-rt/lib/builtins/ve/grow_stack_align.S b/compiler-rt/lib/builtins/ve/grow_stack_align.S new file mode 100644 index 000000000000..19a1dfa8726c --- /dev/null +++ b/compiler-rt/lib/builtins/ve/grow_stack_align.S @@ -0,0 +1,31 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../assembly.h" + +// grow_stack_align routine +// This routine is VE specific +// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf + +// destroy %s62 and %s63 only + +#ifdef __ve__ + +.text +.p2align 4 +DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack_align) + subu.l %sp, %sp, %s0 # sp -= alloca size + and %sp, %sp, %s1 # align sp + brge.l.t %sp, %sl, 1f + ld %s63, 0x18(,%tp) # load param area + lea %s62, 0x13b # syscall # of grow + shm.l %s62, 0x0(%s63) # stored at addr:0 + shm.l %sl, 0x8(%s63) # old limit at addr:8 + shm.l %sp, 0x10(%s63) # new limit at addr:16 + monc +1: + b.l (,%lr) +END_COMPILERRT_FUNCTION(__ve_grow_stack_align) + +#endif // __ve__ |