Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S')
| -rw-r--r-- | contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S | 694 |
1 file changed, 694 insertions, 0 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
new file mode 100644
index 000000000000..843e88b3cab8
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
@@ -0,0 +1,694 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+// Double Precision Fused Multiply-Add
+
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+#define C r5:4
+#define CH r5
+#define CL r4
+
+
+
+#define BTMP r15:14
+#define BTMPH r15
+#define BTMPL r14
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define CTMP r11:10
+#define CTMPH r11
+#define CTMPL r10
+
+#define PP_LL r9:8
+#define PP_LL_H r9
+#define PP_LL_L r8
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+
+#define PP_HH r17:16
+#define PP_HH_H r17
+#define PP_HH_L r16
+
+#define EXPA r18
+#define EXPB r19
+#define EXPBA r19:18
+
+#define TMP r28
+
+#define P_TMP p0
+#define PROD_NEG p3
+#define EXACT p2
+#define SWAP p1
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1023
+#define STACKSPACE 32
+
+#define ADJUST 4
+
+#define FUDGE 7
+#define FUDGE2 3
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+    // First, classify for normal values, and abort if abnormal
+    //
+    // Next, unpack the mantissa into 0x1000_0000_0000_0000 + mant<<8
+    //
+    // Since we know that the 2 MSBs of the H registers are zero, we should never
+    // carry out of the partial products that involve the H registers
+    //
+    // Try to buy X slots, at the expense of latency if needed
+    //
+    // We will have PP_HH with the upper bits of the product, PP_LL with the lower
+    // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
+    // PP_HH can have a minimum of 0x0100_0000_0000_0000
+    //
+    // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
+    //
+    // We need to align CTMP.
+    // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow the normal add
+    // If CTMP << PP, align CTMP and add 128 bits.  Then compute sticky
+    // If CTMP ~= PP, align CTMP and add 128 bits.  May have massive cancellation.
+    //
+    // Convert the partial product and CTMP to 2's complement prior to addition
+    //
+    // After we add, we need to normalize into the upper 64 bits, then compute sticky.
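
The comment block above describes the whole pipeline in terms of an unpacked mantissa format. The following C fragment is only a rough sketch of that representation (unpack_mant is an illustrative name, not a symbol in this file): a normal double becomes 0x1000_0000_0000_0000 + (mant << 8), with the implicit bit at bit 60 and the top two bits clear.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch of the unpacked form described in the comments
     * above: implicit bit at bit 60, 52-bit mantissa starting at bit 8.
     * The top two bits stay clear, which is why the partial products that
     * involve the high 32-bit halves cannot carry out. */
    static uint64_t unpack_mant(double x)
    {
        uint64_t bits;
        memcpy(&bits, &x, sizeof bits);               /* raw IEEE-754 encoding */
        uint64_t mant = bits & 0x000FFFFFFFFFFFFFULL; /* low 52 bits */
        return 0x1000000000000000ULL | (mant << 8);   /* 0x1000_..._0000 + mant<<8 */
    }

Two values in this form lie in [2^60, 2^61), so their 128-bit product lies in [2^120, 2^122) and its upper 64 bits fall between 0x0100_0000_0000_0000 and just under 0x0400_0000_0000_0000, which is the PP_HH range the comments quote.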
+
+    .text
+    .global __hexagon_fmadf4
+    .type __hexagon_fmadf4,@function
+    .global __hexagon_fmadf5
+    .type __hexagon_fmadf5,@function
+    Q6_ALIAS(fmadf5)
+    .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+.Lfma_begin:
+    {
+        P_TMP = dfclass(A,#2)
+        P_TMP = dfclass(B,#2)
+        ATMP = #0
+        BTMP = #0
+    }
+    {
+        ATMP = insert(A,#MANTBITS,#EXPBITS-3)
+        BTMP = insert(B,#MANTBITS,#EXPBITS-3)
+        PP_ODD_H = ##0x10000000
+        allocframe(#STACKSPACE)
+    }
+    {
+        PP_LL = mpyu(ATMPL,BTMPL)
+        if (!P_TMP) jump .Lfma_abnormal_ab
+        ATMPH = or(ATMPH,PP_ODD_H)
+        BTMPH = or(BTMPH,PP_ODD_H)
+    }
+    {
+        P_TMP = dfclass(C,#2)
+        if (!P_TMP.new) jump:nt .Lfma_abnormal_c
+        CTMP = combine(PP_ODD_H,#0)
+        PP_ODD = combine(#0,PP_LL_H)
+    }
+.Lfma_abnormal_c_restart:
+    {
+        PP_ODD += mpyu(BTMPL,ATMPH)
+        CTMP = insert(C,#MANTBITS,#EXPBITS-3)
+        memd(r29+#0) = PP_HH
+        memd(r29+#8) = EXPBA
+    }
+    {
+        PP_ODD += mpyu(ATMPL,BTMPH)
+        EXPBA = neg(CTMP)
+        P_TMP = cmp.gt(CH,#-1)
+        TMP = xor(AH,BH)
+    }
+    {
+        EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+        EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+        PP_HH = combine(#0,PP_ODD_H)
+        if (!P_TMP) CTMP = EXPBA
+    }
+    {
+        PP_HH += mpyu(ATMPH,BTMPH)
+        PP_LL = combine(PP_ODD_L,PP_LL_L)
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
+#define RIGHTLEFTSHIFT r13:12
+#define RIGHTSHIFT r13
+#define LEFTSHIFT r12
+
+        EXPA = add(EXPA,EXPB)
+#undef EXPB
+#undef EXPBA
+#define EXPC r19
+#define EXPCA r19:18
+        EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
+    }
+    // PP_HH:PP_LL now has product
+    // CTMP is negated
+    // EXPA,B,C are extracted
+    // We need to negate PP
+    // Since we will be adding with carry later, if we need to negate,
+    // just invert all bits now, which we can do conditionally and in parallel
+#define PP_HH_TMP r15:14
+#define PP_LL_TMP r7:6
+    {
+        EXPA = add(EXPA,#-BIAS+(ADJUST))
+        PROD_NEG = !cmp.gt(TMP,#-1)
+        PP_LL_TMP = #0
+        PP_HH_TMP = #0
+    }
+    {
+        PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
+        P_TMP = !cmp.gt(TMP,#-1)
+        SWAP = cmp.gt(EXPC,EXPA)                // If C >> PP
+        if (SWAP.new) EXPCA = combine(EXPA,EXPC)
+    }
+    {
+        PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
+        if (P_TMP) PP_LL = PP_LL_TMP
+#undef PP_LL_TMP
+#define CTMP2 r7:6
+#define CTMP2H r7
+#define CTMP2L r6
+        CTMP2 = #0
+        EXPC = sub(EXPA,EXPC)
+    }
+    {
+        if (P_TMP) PP_HH = PP_HH_TMP
+        P_TMP = cmp.gt(EXPC,#63)
+        if (SWAP) PP_LL = CTMP2
+        if (SWAP) CTMP2 = PP_LL
+    }
+#undef PP_HH_TMP
+//#define ONE r15:14
+//#define S_ONE r14
+#define ZERO r15:14
+#define S_ZERO r15
+#undef PROD_NEG
+#define P_CARRY p3
+    {
+        if (SWAP) PP_HH = CTMP                  // Swap C and PP
+        if (SWAP) CTMP = PP_HH
+        if (P_TMP) EXPC = add(EXPC,#-64)
+        TMP = #63
+    }
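
The mpyu packets above build the 128-bit product of the two unpacked mantissas out of four 32x32 partial products. A C sketch of the same scheme follows (u128_t and mul64x64 are illustrative names, and it assumes what the header comment states: the top two bits of each high half are clear, so the middle accumulation cannot overflow 64 bits):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128_t;

    /* Mirrors how PP_LL, PP_ODD and PP_HH are assembled above. */
    static u128_t mul64x64(uint64_t a, uint64_t b)
    {
        uint64_t al = (uint32_t)a, ah = a >> 32;   /* ah < 2^30 by assumption */
        uint64_t bl = (uint32_t)b, bh = b >> 32;   /* bh < 2^30 by assumption */

        uint64_t pp_ll  = al * bl;                 /* PP_LL  = mpyu(ATMPL,BTMPL)    */
        uint64_t pp_odd = (pp_ll >> 32)            /* PP_ODD = combine(#0,PP_LL_H)  */
                        + bl * ah                  /*        += mpyu(BTMPL,ATMPH)   */
                        + al * bh;                 /*        += mpyu(ATMPL,BTMPH)   */
        uint64_t pp_hh  = (pp_odd >> 32)           /* PP_HH  = combine(#0,PP_ODD_H) */
                        + ah * bh;                 /*        += mpyu(ATMPH,BTMPH)   */

        u128_t r;
        r.lo = (pp_odd << 32) | (uint32_t)pp_ll;   /* PP_LL = combine(PP_ODD_L,PP_LL_L) */
        r.hi = pp_hh;
        return r;
    }

The sub(...):carry packets that follow then conditionally negate this 128-bit product when A and B have opposite signs, so the later addition can treat everything as signed 128-bit arithmetic.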
+    {
+        // If diff > 63, pre-shift-right by 64...
+        if (P_TMP) CTMP2 = CTMP
+        TMP = asr(CTMPH,#31)
+        RIGHTSHIFT = min(EXPC,TMP)
+        LEFTSHIFT = #0
+    }
+#undef C
+#undef CH
+#undef CL
+#define STICKIES r5:4
+#define STICKIESH r5
+#define STICKIESL r4
+    {
+        if (P_TMP) CTMP = combine(TMP,TMP)      // sign extension of pre-shift-right-64
+        STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
+        CTMP2 = lsr(CTMP2,RIGHTSHIFT)
+        LEFTSHIFT = sub(#64,RIGHTSHIFT)
+    }
+    {
+        ZERO = #0
+        TMP = #-2
+        CTMP2 |= lsl(CTMP,LEFTSHIFT)
+        CTMP = asr(CTMP,RIGHTSHIFT)
+    }
+    {
+        P_CARRY = cmp.gtu(STICKIES,ZERO)        // If we have sticky bits from C shift
+        if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP)   // make sure adding 1 == OR
+#undef ZERO
+#define ONE r15:14
+#define S_ONE r14
+        ONE = #1
+        STICKIES = #0
+    }
+    {
+        PP_LL = add(CTMP2,PP_LL,P_CARRY):carry  // use the carry to add the sticky
+    }
+    {
+        PP_HH = add(CTMP,PP_HH,P_CARRY):carry
+        TMP = #62
+    }
+    // PP_HH:PP_LL now holds the sum
+    // We may need to normalize left, up to ??? bits.
+    //
+    // I think that if we have massive cancellation, the range we normalize by
+    // is still limited
+    {
+        LEFTSHIFT = add(clb(PP_HH),#-2)
+        if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f   // all sign bits?
+    }
+    // We had all sign bits, shift left by 62.
+    {
+        CTMP = extractu(PP_LL,#62,#2)
+        PP_LL = asl(PP_LL,#62)
+        EXPA = add(EXPA,#-62)                   // And adjust exponent of result
+    }
+    {
+        PP_HH = insert(CTMP,#62,#0)             // Then shift 63
+    }
+    {
+        LEFTSHIFT = add(clb(PP_HH),#-2)
+    }
+    .falign
+1:
+    {
+        CTMP = asl(PP_HH,LEFTSHIFT)
+        STICKIES |= asl(PP_LL,LEFTSHIFT)
+        RIGHTSHIFT = sub(#64,LEFTSHIFT)
+        EXPA = sub(EXPA,LEFTSHIFT)
+    }
+    {
+        CTMP |= lsr(PP_LL,RIGHTSHIFT)
+        EXACT = cmp.gtu(ONE,STICKIES)
+        TMP = #BIAS+BIAS-2
+    }
+    {
+        if (!EXACT) CTMPL = or(CTMPL,S_ONE)
+        // If EXPA is overflow/underflow, jump to ovf_unf
+        P_TMP = !cmp.gt(EXPA,TMP)
+        P_TMP = cmp.gt(EXPA,#1)
+        if (!P_TMP.new) jump:nt .Lfma_ovf_unf
+    }
+    {
+        // XXX: FIXME: should PP_HH for check of zero be CTMP?
+        P_TMP = cmp.gtu(ONE,CTMP)               // is result true zero?
+        A = convert_d2df(CTMP)
+        EXPA = add(EXPA,#-BIAS-60)
+        PP_HH = memd(r29+#0)
+    }
+    {
+        AH += asl(EXPA,#HI_MANTBITS)
+        EXPCA = memd(r29+#8)
+        if (!P_TMP) dealloc_return              // not zero, return
+    }
+.Ladd_yields_zero:
+    // We had full cancellation.  Return +/- zero (-0 when round-down)
+    {
+        TMP = USR
+        A = #0
+    }
+    {
+        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+        PP_HH = memd(r29+#0)
+        EXPCA = memd(r29+#8)
+    }
+    {
+        p0 = cmp.eq(TMP,#2)
+        if (p0.new) AH = ##0x80000000
+        dealloc_return
+    }
+
+#undef RIGHTLEFTSHIFT
+#undef RIGHTSHIFT
+#undef LEFTSHIFT
+#undef CTMP2
+#undef CTMP2H
+#undef CTMP2L
+
+.Lfma_ovf_unf:
+    {
+        p0 = cmp.gtu(ONE,CTMP)
+        if (p0.new) jump:nt .Ladd_yields_zero
+    }
+    {
+        A = convert_d2df(CTMP)
+        EXPA = add(EXPA,#-BIAS-60)
+        TMP = EXPA
+    }
+#define NEW_EXPB r7
+#define NEW_EXPA r6
+    {
+        AH += asl(EXPA,#HI_MANTBITS)
+        NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
+    }
+    {
+        NEW_EXPA = add(EXPA,NEW_EXPB)
+        PP_HH = memd(r29+#0)
+        EXPCA = memd(r29+#8)
+#undef PP_HH
+#undef PP_HH_H
+#undef PP_HH_L
+#undef EXPCA
+#undef EXPC
+#undef EXPA
+#undef PP_LL
+#undef PP_LL_H
+#undef PP_LL_L
+#define EXPA r6
+#define EXPB r7
+#define EXPBA r7:6
+#define ATMP r9:8
+#define ATMPH r9
+#define ATMPL r8
+#undef NEW_EXPB
+#undef NEW_EXPA
+        ATMP = abs(CTMP)
+    }
+    {
+        p0 = cmp.gt(EXPA,##BIAS+BIAS)
+        if (p0.new) jump:nt .Lfma_ovf
+    }
+    {
+        p0 = cmp.gt(EXPA,#0)
+        if (p0.new) jump:nt .Lpossible_unf
+    }
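
The alignment code above collapses every bit shifted out of the addend into STICKIES, and the `if (!EXACT) CTMPL = or(CTMPL,S_ONE)` packet later folds the same information into the low bit before convert_d2df rounds. A minimal C sketch of that sticky-preserving right shift (illustrative only, not code from this file):

    #include <stdint.h>

    /* Shift x right by n bits, remembering in *sticky whether any nonzero
     * bit was discarded.  That single flag is all the final rounding step
     * needs to know about the lost bits. */
    static uint64_t shift_right_sticky(uint64_t x, unsigned n, unsigned *sticky)
    {
        if (n == 0)
            return x;
        if (n >= 64) {                       /* everything shifts out */
            *sticky |= (x != 0);
            return 0;
        }
        *sticky |= ((x << (64 - n)) != 0);   /* any lost bit sets sticky */
        return x >> n;
    }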
+    {
+        // TMP has original EXPA.
+        // ATMP is corresponding value
+        // Normalize ATMP and shift right to correct location
+        EXPB = add(clb(ATMP),#-2)               // Amount to left shift to normalize
+        EXPA = sub(#1+5,TMP)                    // Amount to right shift to denormalize
+        p3 = cmp.gt(CTMPH,#-1)
+    }
+    // Underflow
+    // We know that the infinite range exponent should be EXPA
+    // CTMP is 2's complement, ATMP is abs(CTMP)
+    {
+        EXPA = add(EXPA,EXPB)                   // how much to shift back right
+        ATMP = asl(ATMP,EXPB)                   // shift left
+        AH = USR
+        TMP = #63
+    }
+    {
+        EXPB = min(EXPA,TMP)
+        EXPA = #0
+        AL = #0x0030
+    }
+    {
+        B = extractu(ATMP,EXPBA)
+        ATMP = asr(ATMP,EXPB)
+    }
+    {
+        p0 = cmp.gtu(ONE,B)
+        if (!p0.new) ATMPL = or(ATMPL,S_ONE)
+        ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
+    }
+    {
+        CTMP = neg(ATMP)
+        p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
+        if (!p1.new) AH = or(AH,AL)
+        B = #0
+    }
+    {
+        if (p3) CTMP = ATMP
+        USR = AH
+        TMP = #-BIAS-(MANTBITS+FUDGE2)
+    }
+    {
+        A = convert_d2df(CTMP)
+    }
+    {
+        AH += asl(TMP,#HI_MANTBITS)
+        dealloc_return
+    }
+.Lpossible_unf:
+    {
+        TMP = ##0x7fefffff
+        ATMP = abs(CTMP)
+    }
+    {
+        p0 = cmp.eq(AL,#0)
+        p0 = bitsclr(AH,TMP)
+        if (!p0.new) dealloc_return:t
+        TMP = #0x7fff
+    }
+    {
+        p0 = bitsset(ATMPH,TMP)
+        BH = USR
+        BL = #0x0030
+    }
+    {
+        if (p0) BH = or(BH,BL)
+    }
+    {
+        USR = BH
+    }
+    {
+        p0 = dfcmp.eq(A,A)
+        dealloc_return
+    }
+.Lfma_ovf:
+    {
+        TMP = USR
+        CTMP = combine(##0x7fefffff,#-1)
+        A = CTMP
+    }
+    {
+        ATMP = combine(##0x7ff00000,#0)
+        BH = extractu(TMP,#2,#SR_ROUND_OFF)
+        TMP = or(TMP,#0x28)
+    }
+    {
+        USR = TMP
+        BH ^= lsr(AH,#31)
+        BL = BH
+    }
+    {
+        p0 = !cmp.eq(BL,#1)
+        p0 = !cmp.eq(BH,#2)
+    }
+    {
+        p0 = dfcmp.eq(ATMP,ATMP)
+        if (p0.new) CTMP = ATMP
+    }
+    {
+        A = insert(CTMP,#63,#0)
+        dealloc_return
+    }
+#undef CTMP
+#undef CTMPH
+#undef CTMPL
+#define BTMP r11:10
+#define BTMPH r11
+#define BTMPL r10
+
+#undef STICKIES
+#undef STICKIESH
+#undef STICKIESL
+#define C r5:4
+#define CH r5
+#define CL r4
+
+.Lfma_abnormal_ab:
+    {
+        ATMP = extractu(A,#63,#0)
+        BTMP = extractu(B,#63,#0)
+        deallocframe
+    }
+    {
+        p3 = cmp.gtu(ATMP,BTMP)
+        if (!p3.new) A = B                      // sort values
+        if (!p3.new) B = A
+    }
+    {
+        p0 = dfclass(A,#0x0f)                   // A NaN?
+        if (!p0.new) jump:nt .Lnan
+        if (!p3) ATMP = BTMP
+        if (!p3) BTMP = ATMP
+    }
+    {
+        p1 = dfclass(A,#0x08)                   // A is infinity
+        p1 = dfclass(B,#0x0e)                   // B is nonzero
+    }
+    {
+        p0 = dfclass(A,#0x08)                   // a is inf
+        p0 = dfclass(B,#0x01)                   // b is zero
+    }
+    {
+        if (p1) jump .Lab_inf
+        p2 = dfclass(B,#0x01)
+    }
+    {
+        if (p0) jump .Linvalid
+        if (p2) jump .Lab_true_zero
+        TMP = ##0x7c000000
+    }
+    // We are left with a normal or subnormal times a subnormal, A > B
+    // If A and B are both very small, we will go to a single sticky bit; replace
+    // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
+    // if A and B might multiply to something bigger, decrease A exp and increase B exp
+    // and start over
+    {
+        p0 = bitsclr(AH,TMP)
+        if (p0.new) jump:nt .Lfma_ab_tiny
+    }
+    {
+        TMP = add(clb(BTMP),#-EXPBITS)
+    }
+    {
+        BTMP = asl(BTMP,TMP)
+    }
+    {
+        B = insert(BTMP,#63,#0)
+        AH -= asl(TMP,#HI_MANTBITS)
+    }
+    jump .Lfma_begin
+
+.Lfma_ab_tiny:
+    ATMP = combine(##0x00100000,#0)
+    {
+        A = insert(ATMP,#63,#0)
+        B = insert(ATMP,#63,#0)
+    }
+    jump .Lfma_begin
+
+.Lab_inf:
+    {
+        B = lsr(B,#63)
+        p0 = dfclass(C,#0x10)
+    }
+    {
+        A ^= asl(B,#63)
+        if (p0) jump .Lnan
+    }
+    {
+        p1 = dfclass(C,#0x08)
+        if (p1.new) jump:nt .Lfma_inf_plus_inf
+    }
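
For .Lfma_abnormal_ab above, the comments describe rescaling: when the smaller operand of the product is subnormal, normalize it with a left shift and take the same amount out of the other operand's exponent, then restart. A hedged C sketch of that idea using libm scaling (rescale_pair is an illustrative name; the case where both operands are tiny is handled separately by .Lfma_ab_tiny and is not covered here):

    #include <float.h>
    #include <math.h>

    /* Scale a subnormal *b up to a normal value and compensate in *a,
     * leaving the product a*b (and therefore fma(a,b,c)) unchanged as
     * long as the scaled-down *a stays normal. */
    static void rescale_pair(double *a, double *b)
    {
        if (*b != 0.0 && fabs(*b) < DBL_MIN) {        /* *b is subnormal */
            int k = (DBL_MIN_EXP - 1) - ilogb(*b);    /* left shift that normalizes *b */
            *b = ldexp(*b, k);
            *a = ldexp(*a, -k);                       /* take k out of *a's exponent */
        }
    }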
+    // A*B is +/- inf, C is finite.  Return A
+    {
+        jumpr r31
+    }
+    .falign
+.Lfma_inf_plus_inf:
+    {   // adding infinities of different signs is invalid
+        p0 = dfcmp.eq(A,C)
+        if (!p0.new) jump:nt .Linvalid
+    }
+    {
+        jumpr r31
+    }
+
+.Lnan:
+    {
+        p0 = dfclass(B,#0x10)
+        p1 = dfclass(C,#0x10)
+        if (!p0.new) B = A
+        if (!p1.new) C = A
+    }
+    {   // find sNaNs
+        BH = convert_df2sf(B)
+        BL = convert_df2sf(C)
+    }
+    {
+        BH = convert_df2sf(A)
+        A = #-1
+        jumpr r31
+    }
+
+.Linvalid:
+    {
+        TMP = ##0x7f800001                      // sp snan
+    }
+    {
+        A = convert_sf2df(TMP)
+        jumpr r31
+    }
+
+.Lab_true_zero:
+    // B is zero, A is finite number
+    {
+        p0 = dfclass(C,#0x10)
+        if (p0.new) jump:nt .Lnan
+        if (p0.new) A = C
+    }
+    {
+        p0 = dfcmp.eq(B,C)                      // is C also zero?
+        AH = lsr(AH,#31)                        // get sign
+    }
+    {
+        BH ^= asl(AH,#31)                       // form correctly signed zero in B
+        if (!p0) A = C                          // If C is not zero, return C
+        if (!p0) jumpr r31
+    }
+    // B has correctly signed zero, C is also zero
+.Lzero_plus_zero:
+    {
+        p0 = cmp.eq(B,C)                        // yes, scalar equals.  +0++0 or -0+-0
+        if (p0.new) jumpr:t r31
+        A = B
+    }
+    {
+        TMP = USR
+    }
+    {
+        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+        A = #0
+    }
+    {
+        p0 = cmp.eq(TMP,#2)
+        if (p0.new) AH = ##0x80000000
+        jumpr r31
+    }
+#undef BTMP
+#undef BTMPH
+#undef BTMPL
+#define CTMP r11:10
+    .falign
+.Lfma_abnormal_c:
+    // We know that AB is normal * normal
+    // C is not normal: zero, subnormal, inf, or NaN.
+    {
+        p0 = dfclass(C,#0x10)                   // is C NaN?
+        if (p0.new) jump:nt .Lnan
+        if (p0.new) A = C                       // move NaN to A
+        deallocframe
+    }
+    {
+        p0 = dfclass(C,#0x08)                   // is C inf?
+        if (p0.new) A = C                       // return C
+        if (p0.new) jumpr:nt r31
+    }
+    // zero or subnormal
+    // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
+    {
+        p0 = dfclass(C,#0x01)                   // is C zero?
+        if (p0.new) jump:nt __hexagon_muldf3
+        TMP = #1
+    }
+    // Left with: subnormal
+    // Adjust C and jump back to restart
+    {
+        allocframe(#STACKSPACE)                 // oops, deallocated above, re-allocate frame
+        CTMP = #0
+        CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
+        jump .Lfma_abnormal_c_restart
+    }
+END(fma)
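
Finally, a hypothetical smoke test for the routine in this diff. __hexagon_fmadf4 is the double-precision FMA entry point defined above; on a Hexagon target it should agree with libm's fma() on cases where an unfused a*b+c would lose the benefit of fusing (the specific test values are illustrative, not part of any test suite):

    #include <math.h>
    #include <stdio.h>

    double __hexagon_fmadf4(double a, double b, double c);

    int main(void)
    {
        double cases[][3] = {
            { 1.0e308, 2.0, -1.0e308 },  /* a*b alone overflows, the fused result does not */
            { 3.0, 1.0 / 3.0, -1.0 },    /* exposes the rounding error of 1.0/3.0 */
            { 5.0e-324, 0.5, 0.0 },      /* subnormal operand */
        };
        for (unsigned i = 0; i < sizeof cases / sizeof cases[0]; i++) {
            double a = cases[i][0], b = cases[i][1], c = cases[i][2];
            printf("fma(%g, %g, %g) = %g (libm: %g)\n",
                   a, b, c, __hexagon_fmadf4(a, b, c), fma(a, b, c));
        }
        return 0;
    }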
