aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S')
-rw-r--r--contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S694
1 files changed, 694 insertions, 0 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
new file mode 100644
index 000000000000..843e88b3cab8
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
@@ -0,0 +1,694 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+// Double Precision Multiply
+
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+#define C r5:4
+#define CH r5
+#define CL r4
+
+
+
+#define BTMP r15:14
+#define BTMPH r15
+#define BTMPL r14
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define CTMP r11:10
+#define CTMPH r11
+#define CTMPL r10
+
+#define PP_LL r9:8
+#define PP_LL_H r9
+#define PP_LL_L r8
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+
+#define PP_HH r17:16
+#define PP_HH_H r17
+#define PP_HH_L r16
+
+#define EXPA r18
+#define EXPB r19
+#define EXPBA r19:18
+
+#define TMP r28
+
+#define P_TMP p0
+#define PROD_NEG p3
+#define EXACT p2
+#define SWAP p1
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1023
+#define STACKSPACE 32
+
+#define ADJUST 4
+
+#define FUDGE 7
+#define FUDGE2 3
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+ // First, classify for normal values, and abort if abnormal
+ //
+ // Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
+ //
+ // Since we know that the 2 MSBs of the H registers is zero, we should never carry
+ // the partial products that involve the H registers
+ //
+ // Try to buy X slots, at the expense of latency if needed
+ //
+ // We will have PP_HH with the upper bits of the product, PP_LL with the lower
+ // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
+ // PP_HH can have a minimum of 0x0100_0000_0000_0000
+ //
+ // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
+ //
+ // We need to align CTMP.
+ // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
+ // If CTMP << PP align CTMP and add 128 bits. Then compute sticky
+ // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
+ //
+ // Convert partial product and CTMP to 2's complement prior to addition
+ //
+ // After we add, we need to normalize into upper 64 bits, then compute sticky.
+
+ .text
+ .global __hexagon_fmadf4
+ .type __hexagon_fmadf4,@function
+ .global __hexagon_fmadf5
+ .type __hexagon_fmadf5,@function
+ Q6_ALIAS(fmadf5)
+ .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+.Lfma_begin:
+ {
+ P_TMP = dfclass(A,#2)
+ P_TMP = dfclass(B,#2)
+ ATMP = #0
+ BTMP = #0
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-3)
+ BTMP = insert(B,#MANTBITS,#EXPBITS-3)
+ PP_ODD_H = ##0x10000000
+ allocframe(#STACKSPACE)
+ }
+ {
+ PP_LL = mpyu(ATMPL,BTMPL)
+ if (!P_TMP) jump .Lfma_abnormal_ab
+ ATMPH = or(ATMPH,PP_ODD_H)
+ BTMPH = or(BTMPH,PP_ODD_H)
+ }
+ {
+ P_TMP = dfclass(C,#2)
+ if (!P_TMP.new) jump:nt .Lfma_abnormal_c
+ CTMP = combine(PP_ODD_H,#0)
+ PP_ODD = combine(#0,PP_LL_H)
+ }
+.Lfma_abnormal_c_restart:
+ {
+ PP_ODD += mpyu(BTMPL,ATMPH)
+ CTMP = insert(C,#MANTBITS,#EXPBITS-3)
+ memd(r29+#0) = PP_HH
+ memd(r29+#8) = EXPBA
+ }
+ {
+ PP_ODD += mpyu(ATMPL,BTMPH)
+ EXPBA = neg(CTMP)
+ P_TMP = cmp.gt(CH,#-1)
+ TMP = xor(AH,BH)
+ }
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ PP_HH = combine(#0,PP_ODD_H)
+ if (!P_TMP) CTMP = EXPBA
+ }
+ {
+ PP_HH += mpyu(ATMPH,BTMPH)
+ PP_LL = combine(PP_ODD_L,PP_LL_L)
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
+#define RIGHTLEFTSHIFT r13:12
+#define RIGHTSHIFT r13
+#define LEFTSHIFT r12
+
+ EXPA = add(EXPA,EXPB)
+#undef EXPB
+#undef EXPBA
+#define EXPC r19
+#define EXPCA r19:18
+ EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
+ }
+ // PP_HH:PP_LL now has product
+ // CTMP is negated
+ // EXPA,B,C are extracted
+ // We need to negate PP
+ // Since we will be adding with carry later, if we need to negate,
+ // just invert all bits now, which we can do conditionally and in parallel
+#define PP_HH_TMP r15:14
+#define PP_LL_TMP r7:6
+ {
+ EXPA = add(EXPA,#-BIAS+(ADJUST))
+ PROD_NEG = !cmp.gt(TMP,#-1)
+ PP_LL_TMP = #0
+ PP_HH_TMP = #0
+ }
+ {
+ PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
+ P_TMP = !cmp.gt(TMP,#-1)
+ SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
+ if (SWAP.new) EXPCA = combine(EXPA,EXPC)
+ }
+ {
+ PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
+ if (P_TMP) PP_LL = PP_LL_TMP
+#undef PP_LL_TMP
+#define CTMP2 r7:6
+#define CTMP2H r7
+#define CTMP2L r6
+ CTMP2 = #0
+ EXPC = sub(EXPA,EXPC)
+ }
+ {
+ if (P_TMP) PP_HH = PP_HH_TMP
+ P_TMP = cmp.gt(EXPC,#63)
+ if (SWAP) PP_LL = CTMP2
+ if (SWAP) CTMP2 = PP_LL
+ }
+#undef PP_HH_TMP
+//#define ONE r15:14
+//#define S_ONE r14
+#define ZERO r15:14
+#define S_ZERO r15
+#undef PROD_NEG
+#define P_CARRY p3
+ {
+ if (SWAP) PP_HH = CTMP // Swap C and PP
+ if (SWAP) CTMP = PP_HH
+ if (P_TMP) EXPC = add(EXPC,#-64)
+ TMP = #63
+ }
+ {
+ // If diff > 63, pre-shift-right by 64...
+ if (P_TMP) CTMP2 = CTMP
+ TMP = asr(CTMPH,#31)
+ RIGHTSHIFT = min(EXPC,TMP)
+ LEFTSHIFT = #0
+ }
+#undef C
+#undef CH
+#undef CL
+#define STICKIES r5:4
+#define STICKIESH r5
+#define STICKIESL r4
+ {
+ if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
+ STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
+ CTMP2 = lsr(CTMP2,RIGHTSHIFT)
+ LEFTSHIFT = sub(#64,RIGHTSHIFT)
+ }
+ {
+ ZERO = #0
+ TMP = #-2
+ CTMP2 |= lsl(CTMP,LEFTSHIFT)
+ CTMP = asr(CTMP,RIGHTSHIFT)
+ }
+ {
+ P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
+ if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
+#undef ZERO
+#define ONE r15:14
+#define S_ONE r14
+ ONE = #1
+ STICKIES = #0
+ }
+ {
+ PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
+ }
+ {
+ PP_HH = add(CTMP,PP_HH,P_CARRY):carry
+ TMP = #62
+ }
+ // PP_HH:PP_LL now holds the sum
+ // We may need to normalize left, up to ??? bits.
+ //
+ // I think that if we have massive cancellation, the range we normalize by
+ // is still limited
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
+ }
+ // We had all sign bits, shift left by 62.
+ {
+ CTMP = extractu(PP_LL,#62,#2)
+ PP_LL = asl(PP_LL,#62)
+ EXPA = add(EXPA,#-62) // And adjust exponent of result
+ }
+ {
+ PP_HH = insert(CTMP,#62,#0) // Then shift 63
+ }
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ }
+ .falign
+1:
+ {
+ CTMP = asl(PP_HH,LEFTSHIFT)
+ STICKIES |= asl(PP_LL,LEFTSHIFT)
+ RIGHTSHIFT = sub(#64,LEFTSHIFT)
+ EXPA = sub(EXPA,LEFTSHIFT)
+ }
+ {
+ CTMP |= lsr(PP_LL,RIGHTSHIFT)
+ EXACT = cmp.gtu(ONE,STICKIES)
+ TMP = #BIAS+BIAS-2
+ }
+ {
+ if (!EXACT) CTMPL = or(CTMPL,S_ONE)
+ // If EXPA is overflow/underflow, jump to ovf_unf
+ P_TMP = !cmp.gt(EXPA,TMP)
+ P_TMP = cmp.gt(EXPA,#1)
+ if (!P_TMP.new) jump:nt .Lfma_ovf_unf
+ }
+ {
+ // XXX: FIXME: should PP_HH for check of zero be CTMP?
+ P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ PP_HH = memd(r29+#0)
+ }
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ EXPCA = memd(r29+#8)
+ if (!P_TMP) dealloc_return // not zero, return
+ }
+.Ladd_yields_zero:
+ // We had full cancellation. Return +/- zero (-0 when round-down)
+ {
+ TMP = USR
+ A = #0
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ dealloc_return
+ }
+
+#undef RIGHTLEFTSHIFT
+#undef RIGHTSHIFT
+#undef LEFTSHIFT
+#undef CTMP2
+#undef CTMP2H
+#undef CTMP2L
+
+.Lfma_ovf_unf:
+ {
+ p0 = cmp.gtu(ONE,CTMP)
+ if (p0.new) jump:nt .Ladd_yields_zero
+ }
+ {
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ TMP = EXPA
+ }
+#define NEW_EXPB r7
+#define NEW_EXPA r6
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ NEW_EXPA = add(EXPA,NEW_EXPB)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+#undef PP_HH
+#undef PP_HH_H
+#undef PP_HH_L
+#undef EXPCA
+#undef EXPC
+#undef EXPA
+#undef PP_LL
+#undef PP_LL_H
+#undef PP_LL_L
+#define EXPA r6
+#define EXPB r7
+#define EXPBA r7:6
+#define ATMP r9:8
+#define ATMPH r9
+#define ATMPL r8
+#undef NEW_EXPB
+#undef NEW_EXPA
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.gt(EXPA,##BIAS+BIAS)
+ if (p0.new) jump:nt .Lfma_ovf
+ }
+ {
+ p0 = cmp.gt(EXPA,#0)
+ if (p0.new) jump:nt .Lpossible_unf
+ }
+ {
+ // TMP has original EXPA.
+ // ATMP is corresponding value
+ // Normalize ATMP and shift right to correct location
+ EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
+ EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
+ p3 = cmp.gt(CTMPH,#-1)
+ }
+ // Underflow
+ // We know that the infinte range exponent should be EXPA
+ // CTMP is 2's complement, ATMP is abs(CTMP)
+ {
+ EXPA = add(EXPA,EXPB) // how much to shift back right
+ ATMP = asl(ATMP,EXPB) // shift left
+ AH = USR
+ TMP = #63
+ }
+ {
+ EXPB = min(EXPA,TMP)
+ EXPA = #0
+ AL = #0x0030
+ }
+ {
+ B = extractu(ATMP,EXPBA)
+ ATMP = asr(ATMP,EXPB)
+ }
+ {
+ p0 = cmp.gtu(ONE,B)
+ if (!p0.new) ATMPL = or(ATMPL,S_ONE)
+ ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
+ }
+ {
+ CTMP = neg(ATMP)
+ p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
+ if (!p1.new) AH = or(AH,AL)
+ B = #0
+ }
+ {
+ if (p3) CTMP = ATMP
+ USR = AH
+ TMP = #-BIAS-(MANTBITS+FUDGE2)
+ }
+ {
+ A = convert_d2df(CTMP)
+ }
+ {
+ AH += asl(TMP,#HI_MANTBITS)
+ dealloc_return
+ }
+.Lpossible_unf:
+ {
+ TMP = ##0x7fefffff
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.eq(AL,#0)
+ p0 = bitsclr(AH,TMP)
+ if (!p0.new) dealloc_return:t
+ TMP = #0x7fff
+ }
+ {
+ p0 = bitsset(ATMPH,TMP)
+ BH = USR
+ BL = #0x0030
+ }
+ {
+ if (p0) BH = or(BH,BL)
+ }
+ {
+ USR = BH
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ dealloc_return
+ }
+.Lfma_ovf:
+ {
+ TMP = USR
+ CTMP = combine(##0x7fefffff,#-1)
+ A = CTMP
+ }
+ {
+ ATMP = combine(##0x7ff00000,#0)
+ BH = extractu(TMP,#2,#SR_ROUND_OFF)
+ TMP = or(TMP,#0x28)
+ }
+ {
+ USR = TMP
+ BH ^= lsr(AH,#31)
+ BL = BH
+ }
+ {
+ p0 = !cmp.eq(BL,#1)
+ p0 = !cmp.eq(BH,#2)
+ }
+ {
+ p0 = dfcmp.eq(ATMP,ATMP)
+ if (p0.new) CTMP = ATMP
+ }
+ {
+ A = insert(CTMP,#63,#0)
+ dealloc_return
+ }
+#undef CTMP
+#undef CTMPH
+#undef CTMPL
+#define BTMP r11:10
+#define BTMPH r11
+#define BTMPL r10
+
+#undef STICKIES
+#undef STICKIESH
+#undef STICKIESL
+#define C r5:4
+#define CH r5
+#define CL r4
+
+.Lfma_abnormal_ab:
+ {
+ ATMP = extractu(A,#63,#0)
+ BTMP = extractu(B,#63,#0)
+ deallocframe
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A
+ }
+ {
+ p0 = dfclass(A,#0x0f) // A NaN?
+ if (!p0.new) jump:nt .Lnan
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ p1 = dfclass(A,#0x08) // A is infinity
+ p1 = dfclass(B,#0x0e) // B is nonzero
+ }
+ {
+ p0 = dfclass(A,#0x08) // a is inf
+ p0 = dfclass(B,#0x01) // b is zero
+ }
+ {
+ if (p1) jump .Lab_inf
+ p2 = dfclass(B,#0x01)
+ }
+ {
+ if (p0) jump .Linvalid
+ if (p2) jump .Lab_true_zero
+ TMP = ##0x7c000000
+ }
+ // We are left with a normal or subnormal times a subnormal, A > B
+ // If A and B are both very small, we will go to a single sticky bit; replace
+ // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
+ // if A and B might multiply to something bigger, decrease A exp and increase B exp
+ // and start over
+ {
+ p0 = bitsclr(AH,TMP)
+ if (p0.new) jump:nt .Lfma_ab_tiny
+ }
+ {
+ TMP = add(clb(BTMP),#-EXPBITS)
+ }
+ {
+ BTMP = asl(BTMP,TMP)
+ }
+ {
+ B = insert(BTMP,#63,#0)
+ AH -= asl(TMP,#HI_MANTBITS)
+ }
+ jump .Lfma_begin
+
+.Lfma_ab_tiny:
+ ATMP = combine(##0x00100000,#0)
+ {
+ A = insert(ATMP,#63,#0)
+ B = insert(ATMP,#63,#0)
+ }
+ jump .Lfma_begin
+
+.Lab_inf:
+ {
+ B = lsr(B,#63)
+ p0 = dfclass(C,#0x10)
+ }
+ {
+ A ^= asl(B,#63)
+ if (p0) jump .Lnan
+ }
+ {
+ p1 = dfclass(C,#0x08)
+ if (p1.new) jump:nt .Lfma_inf_plus_inf
+ }
+ // A*B is +/- inf, C is finite. Return A
+ {
+ jumpr r31
+ }
+ .falign
+.Lfma_inf_plus_inf:
+ { // adding infinities of different signs is invalid
+ p0 = dfcmp.eq(A,C)
+ if (!p0.new) jump:nt .Linvalid
+ }
+ {
+ jumpr r31
+ }
+
+.Lnan:
+ {
+ p0 = dfclass(B,#0x10)
+ p1 = dfclass(C,#0x10)
+ if (!p0.new) B = A
+ if (!p1.new) C = A
+ }
+ { // find sNaNs
+ BH = convert_df2sf(B)
+ BL = convert_df2sf(C)
+ }
+ {
+ BH = convert_df2sf(A)
+ A = #-1
+ jumpr r31
+ }
+
+.Linvalid:
+ {
+ TMP = ##0x7f800001 // sp snan
+ }
+ {
+ A = convert_sf2df(TMP)
+ jumpr r31
+ }
+
+.Lab_true_zero:
+ // B is zero, A is finite number
+ {
+ p0 = dfclass(C,#0x10)
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C
+ }
+ {
+ p0 = dfcmp.eq(B,C) // is C also zero?
+ AH = lsr(AH,#31) // get sign
+ }
+ {
+ BH ^= asl(AH,#31) // form correctly signed zero in B
+ if (!p0) A = C // If C is not zero, return C
+ if (!p0) jumpr r31
+ }
+ // B has correctly signed zero, C is also zero
+.Lzero_plus_zero:
+ {
+ p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
+ if (p0.new) jumpr:t r31
+ A = B
+ }
+ {
+ TMP = USR
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ A = #0
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ jumpr r31
+ }
+#undef BTMP
+#undef BTMPH
+#undef BTMPL
+#define CTMP r11:10
+ .falign
+.Lfma_abnormal_c:
+ // We know that AB is normal * normal
+ // C is not normal: zero, subnormal, inf, or NaN.
+ {
+ p0 = dfclass(C,#0x10) // is C NaN?
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C // move NaN to A
+ deallocframe
+ }
+ {
+ p0 = dfclass(C,#0x08) // is C inf?
+ if (p0.new) A = C // return C
+ if (p0.new) jumpr:nt r31
+ }
+ // zero or subnormal
+ // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
+ {
+ p0 = dfclass(C,#0x01) // is C zero?
+ if (p0.new) jump:nt __hexagon_muldf3
+ TMP = #1
+ }
+ // Left with: subnormal
+ // Adjust C and jump back to restart
+ {
+ allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
+ CTMP = #0
+ CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
+ jump .Lfma_abnormal_c_restart
+ }
+END(fma)