Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S')
| -rw-r--r-- | contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S | 694 |
1 file changed, 694 insertions, 0 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
new file mode 100644
index 000000000000..843e88b3cab8
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S
@@ -0,0 +1,694 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+// Double Precision Fused Multiply-Add
+
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+#define C r5:4
+#define CH r5
+#define CL r4
+
+
+
+#define BTMP r15:14
+#define BTMPH r15
+#define BTMPL r14
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define CTMP r11:10
+#define CTMPH r11
+#define CTMPL r10
+
+#define PP_LL r9:8
+#define PP_LL_H r9
+#define PP_LL_L r8
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+
+#define PP_HH r17:16
+#define PP_HH_H r17
+#define PP_HH_L r16
+
+#define EXPA r18
+#define EXPB r19
+#define EXPBA r19:18
+
+#define TMP r28
+
+#define P_TMP p0
+#define PROD_NEG p3
+#define EXACT p2
+#define SWAP p1
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1023
+#define STACKSPACE 32
+
+#define ADJUST 4
+
+#define FUDGE 7
+#define FUDGE2 3
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+    // First, classify for normal values, and abort if abnormal
+    //
+    // Next, unpack the mantissa into 0x1000_0000_0000_0000 + mant<<8
+    //
+    // Since we know that the 2 MSBs of the H registers are zero, we should never
+    // carry out of the partial products that involve the H registers
+    //
+    // Try to buy X slots, at the expense of latency if needed
+    //
+    // We will have PP_HH with the upper bits of the product, PP_LL with the lower
+    // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
+    // PP_HH can have a minimum of 0x0100_0000_0000_0000
+    //
+    // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
+    //
+    // We need to align CTMP.
+    // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow the normal add
+    // If CTMP << PP, align CTMP and add 128 bits.  Then compute sticky
+    // If CTMP ~= PP, align CTMP and add 128 bits.  May have massive cancellation.
+    //
+    // Convert the partial product and CTMP to 2's complement prior to addition
+    //
+    // After we add, we need to normalize into the upper 64 bits, then compute sticky.
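
The comment block above describes the whole pipeline in terms of an unpacked mantissa format. The following C fragment is only a rough sketch of that representation (unpack_mant is an illustrative name, not a symbol in this file): a normal double becomes 0x1000_0000_0000_0000 + (mant << 8), with the implicit bit at bit 60 and the top two bits clear.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch of the unpacked form described in the comments
     * above: implicit bit at bit 60, 52-bit mantissa starting at bit 8.
     * The top two bits stay clear, which is why the partial products that
     * involve the high 32-bit halves cannot carry out. */
    static uint64_t unpack_mant(double x)
    {
        uint64_t bits;
        memcpy(&bits, &x, sizeof bits);               /* raw IEEE-754 encoding */
        uint64_t mant = bits & 0x000FFFFFFFFFFFFFULL; /* low 52 bits */
        return 0x1000000000000000ULL | (mant << 8);   /* 0x1000_..._0000 + mant<<8 */
    }

Two values in this form lie in [2^60, 2^61), so their 128-bit product lies in [2^120, 2^122) and its upper 64 bits fall between 0x0100_0000_0000_0000 and just under 0x0400_0000_0000_0000, which is the PP_HH range the comments quote.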
+
+    .text
+    .global __hexagon_fmadf4
+    .type __hexagon_fmadf4,@function
+    .global __hexagon_fmadf5
+    .type __hexagon_fmadf5,@function
+    Q6_ALIAS(fmadf5)
+    .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+.Lfma_begin:
+    {
+        P_TMP = dfclass(A,#2)
+        P_TMP = dfclass(B,#2)
+        ATMP = #0
+        BTMP = #0
+    }
+    {
+        ATMP = insert(A,#MANTBITS,#EXPBITS-3)
+        BTMP = insert(B,#MANTBITS,#EXPBITS-3)
+        PP_ODD_H = ##0x10000000
+        allocframe(#STACKSPACE)
+    }
+    {
+        PP_LL = mpyu(ATMPL,BTMPL)
+        if (!P_TMP) jump .Lfma_abnormal_ab
+        ATMPH = or(ATMPH,PP_ODD_H)
+        BTMPH = or(BTMPH,PP_ODD_H)
+    }
+    {
+        P_TMP = dfclass(C,#2)
+        if (!P_TMP.new) jump:nt .Lfma_abnormal_c
+        CTMP = combine(PP_ODD_H,#0)
+        PP_ODD = combine(#0,PP_LL_H)
+    }
+.Lfma_abnormal_c_restart:
+    {
+        PP_ODD += mpyu(BTMPL,ATMPH)
+        CTMP = insert(C,#MANTBITS,#EXPBITS-3)
+        memd(r29+#0) = PP_HH
+        memd(r29+#8) = EXPBA
+    }
+    {
+        PP_ODD += mpyu(ATMPL,BTMPH)
+        EXPBA = neg(CTMP)
+        P_TMP = cmp.gt(CH,#-1)
+        TMP = xor(AH,BH)
+    }
+    {
+        EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+        EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+        PP_HH = combine(#0,PP_ODD_H)
+        if (!P_TMP) CTMP = EXPBA
+    }
+    {
+        PP_HH += mpyu(ATMPH,BTMPH)
+        PP_LL = combine(PP_ODD_L,PP_LL_L)
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
+#define RIGHTLEFTSHIFT r13:12
+#define RIGHTSHIFT r13
+#define LEFTSHIFT r12
+
+        EXPA = add(EXPA,EXPB)
+#undef EXPB
+#undef EXPBA
+#define EXPC r19
+#define EXPCA r19:18
+        EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
+    }
+    // PP_HH:PP_LL now has product
+    // CTMP is negated
+    // EXPA,B,C are extracted
+    // We need to negate PP
+    // Since we will be adding with carry later, if we need to negate,
+    // just invert all bits now, which we can do conditionally and in parallel
+#define PP_HH_TMP r15:14
+#define PP_LL_TMP r7:6
+    {
+        EXPA = add(EXPA,#-BIAS+(ADJUST))
+        PROD_NEG = !cmp.gt(TMP,#-1)
+        PP_LL_TMP = #0
+        PP_HH_TMP = #0
+    }
+    {
+        PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
+        P_TMP = !cmp.gt(TMP,#-1)
+        SWAP = cmp.gt(EXPC,EXPA)                // If C >> PP
+        if (SWAP.new) EXPCA = combine(EXPA,EXPC)
+    }
+    {
+        PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
+        if (P_TMP) PP_LL = PP_LL_TMP
+#undef PP_LL_TMP
+#define CTMP2 r7:6
+#define CTMP2H r7
+#define CTMP2L r6
+        CTMP2 = #0
+        EXPC = sub(EXPA,EXPC)
+    }
+    {
+        if (P_TMP) PP_HH = PP_HH_TMP
+        P_TMP = cmp.gt(EXPC,#63)
+        if (SWAP) PP_LL = CTMP2
+        if (SWAP) CTMP2 = PP_LL
+    }
+#undef PP_HH_TMP
+//#define ONE r15:14
+//#define S_ONE r14
+#define ZERO r15:14
+#define S_ZERO r15
+#undef PROD_NEG
+#define P_CARRY p3
+    {
+        if (SWAP) PP_HH = CTMP                  // Swap C and PP
+        if (SWAP) CTMP = PP_HH
+        if (P_TMP) EXPC = add(EXPC,#-64)
+        TMP = #63
+    }
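
The mpyu packets above build the 128-bit product of the two unpacked mantissas out of four 32x32 partial products. A C sketch of the same scheme follows (u128_t and mul64x64 are illustrative names, and it assumes what the header comment states: the top two bits of each high half are clear, so the middle accumulation cannot overflow 64 bits):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128_t;

    /* Mirrors how PP_LL, PP_ODD and PP_HH are assembled above. */
    static u128_t mul64x64(uint64_t a, uint64_t b)
    {
        uint64_t al = (uint32_t)a, ah = a >> 32;   /* ah < 2^30 by assumption */
        uint64_t bl = (uint32_t)b, bh = b >> 32;   /* bh < 2^30 by assumption */

        uint64_t pp_ll  = al * bl;                 /* PP_LL  = mpyu(ATMPL,BTMPL)    */
        uint64_t pp_odd = (pp_ll >> 32)            /* PP_ODD = combine(#0,PP_LL_H)  */
                        + bl * ah                  /*        += mpyu(BTMPL,ATMPH)   */
                        + al * bh;                 /*        += mpyu(ATMPL,BTMPH)   */
        uint64_t pp_hh  = (pp_odd >> 32)           /* PP_HH  = combine(#0,PP_ODD_H) */
                        + ah * bh;                 /*        += mpyu(ATMPH,BTMPH)   */

        u128_t r;
        r.lo = (pp_odd << 32) | (uint32_t)pp_ll;   /* PP_LL = combine(PP_ODD_L,PP_LL_L) */
        r.hi = pp_hh;
        return r;
    }

The sub(...):carry packets that follow then conditionally negate this 128-bit product when A and B have opposite signs, so the later addition can treat everything as signed 128-bit arithmetic.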
+    {
+        // If diff > 63, pre-shift-right by 64...
+        if (P_TMP) CTMP2 = CTMP
+        TMP = asr(CTMPH,#31)
+        RIGHTSHIFT = min(EXPC,TMP)
+        LEFTSHIFT = #0
+    }
+#undef C
+#undef CH
+#undef CL
+#define STICKIES r5:4
+#define STICKIESH r5
+#define STICKIESL r4
+    {
+        if (P_TMP) CTMP = combine(TMP,TMP)      // sign extension of pre-shift-right-64
+        STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
+        CTMP2 = lsr(CTMP2,RIGHTSHIFT)
+        LEFTSHIFT = sub(#64,RIGHTSHIFT)
+    }
+    {
+        ZERO = #0
+        TMP = #-2
+        CTMP2 |= lsl(CTMP,LEFTSHIFT)
+        CTMP = asr(CTMP,RIGHTSHIFT)
+    }
+    {
+        P_CARRY = cmp.gtu(STICKIES,ZERO)        // If we have sticky bits from C shift
+        if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP)   // make sure adding 1 == OR
+#undef ZERO
+#define ONE r15:14
+#define S_ONE r14
+        ONE = #1
+        STICKIES = #0
+    }
+    {
+        PP_LL = add(CTMP2,PP_LL,P_CARRY):carry  // use the carry to add the sticky
+    }
+    {
+        PP_HH = add(CTMP,PP_HH,P_CARRY):carry
+        TMP = #62
+    }
+    // PP_HH:PP_LL now holds the sum
+    // We may need to normalize left, up to ??? bits.
+    //
+    // I think that if we have massive cancellation, the range we normalize by
+    // is still limited
+    {
+        LEFTSHIFT = add(clb(PP_HH),#-2)
+        if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f   // all sign bits?
+    }
+    // We had all sign bits, shift left by 62.
+    {
+        CTMP = extractu(PP_LL,#62,#2)
+        PP_LL = asl(PP_LL,#62)
+        EXPA = add(EXPA,#-62)                   // And adjust exponent of result
+    }
+    {
+        PP_HH = insert(CTMP,#62,#0)             // Then shift 63
+    }
+    {
+        LEFTSHIFT = add(clb(PP_HH),#-2)
+    }
+    .falign
+1:
+    {
+        CTMP = asl(PP_HH,LEFTSHIFT)
+        STICKIES |= asl(PP_LL,LEFTSHIFT)
+        RIGHTSHIFT = sub(#64,LEFTSHIFT)
+        EXPA = sub(EXPA,LEFTSHIFT)
+    }
+    {
+        CTMP |= lsr(PP_LL,RIGHTSHIFT)
+        EXACT = cmp.gtu(ONE,STICKIES)
+        TMP = #BIAS+BIAS-2
+    }
+    {
+        if (!EXACT) CTMPL = or(CTMPL,S_ONE)
+        // If EXPA is overflow/underflow, jump to ovf_unf
+        P_TMP = !cmp.gt(EXPA,TMP)
+        P_TMP = cmp.gt(EXPA,#1)
+        if (!P_TMP.new) jump:nt .Lfma_ovf_unf
+    }
+    {
+        // XXX: FIXME: should PP_HH for check of zero be CTMP?
+        P_TMP = cmp.gtu(ONE,CTMP)               // is result true zero?
+        A = convert_d2df(CTMP)
+        EXPA = add(EXPA,#-BIAS-60)
+        PP_HH = memd(r29+#0)
+    }
+    {
+        AH += asl(EXPA,#HI_MANTBITS)
+        EXPCA = memd(r29+#8)
+        if (!P_TMP) dealloc_return              // not zero, return
+    }
+.Ladd_yields_zero:
+    // We had full cancellation.  Return +/- zero (-0 when round-down)
+    {
+        TMP = USR
+        A = #0
+    }
+    {
+        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+        PP_HH = memd(r29+#0)
+        EXPCA = memd(r29+#8)
+    }
+    {
+        p0 = cmp.eq(TMP,#2)
+        if (p0.new) AH = ##0x80000000
+        dealloc_return
+    }
+
+#undef RIGHTLEFTSHIFT
+#undef RIGHTSHIFT
+#undef LEFTSHIFT
+#undef CTMP2
+#undef CTMP2H
+#undef CTMP2L
+
+.Lfma_ovf_unf:
+    {
+        p0 = cmp.gtu(ONE,CTMP)
+        if (p0.new) jump:nt .Ladd_yields_zero
+    }
+    {
+        A = convert_d2df(CTMP)
+        EXPA = add(EXPA,#-BIAS-60)
+        TMP = EXPA
+    }
+#define NEW_EXPB r7
+#define NEW_EXPA r6
+    {
+        AH += asl(EXPA,#HI_MANTBITS)
+        NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
+    }
+    {
+        NEW_EXPA = add(EXPA,NEW_EXPB)
+        PP_HH = memd(r29+#0)
+        EXPCA = memd(r29+#8)
+#undef PP_HH
+#undef PP_HH_H
+#undef PP_HH_L
+#undef EXPCA
+#undef EXPC
+#undef EXPA
+#undef PP_LL
+#undef PP_LL_H
+#undef PP_LL_L
+#define EXPA r6
+#define EXPB r7
+#define EXPBA r7:6
+#define ATMP r9:8
+#define ATMPH r9
+#define ATMPL r8
+#undef NEW_EXPB
+#undef NEW_EXPA
+        ATMP = abs(CTMP)
+    }
+    {
+        p0 = cmp.gt(EXPA,##BIAS+BIAS)
+        if (p0.new) jump:nt .Lfma_ovf
+    }
+    {
+        p0 = cmp.gt(EXPA,#0)
+        if (p0.new) jump:nt .Lpossible_unf
+    }
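
The alignment code above collapses every bit shifted out of the addend into STICKIES, and the `if (!EXACT) CTMPL = or(CTMPL,S_ONE)` packet later folds the same information into the low bit before convert_d2df rounds. A minimal C sketch of that sticky-preserving right shift (illustrative only, not code from this file):

    #include <stdint.h>

    /* Shift x right by n bits, remembering in *sticky whether any nonzero
     * bit was discarded.  That single flag is all the final rounding step
     * needs to know about the lost bits. */
    static uint64_t shift_right_sticky(uint64_t x, unsigned n, unsigned *sticky)
    {
        if (n == 0)
            return x;
        if (n >= 64) {                       /* everything shifts out */
            *sticky |= (x != 0);
            return 0;
        }
        *sticky |= ((x << (64 - n)) != 0);   /* any lost bit sets sticky */
        return x >> n;
    }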
+    {
+        // TMP has original EXPA.
+        // ATMP is corresponding value
+        // Normalize ATMP and shift right to correct location
+        EXPB = add(clb(ATMP),#-2)               // Amount to left shift to normalize
+        EXPA = sub(#1+5,TMP)                    // Amount to right shift to denormalize
+        p3 = cmp.gt(CTMPH,#-1)
+    }
+    // Underflow
+    // We know that the infinite range exponent should be EXPA
+    // CTMP is 2's complement, ATMP is abs(CTMP)
+    {
+        EXPA = add(EXPA,EXPB)                   // how much to shift back right
+        ATMP = asl(ATMP,EXPB)                   // shift left
+        AH = USR
+        TMP = #63
+    }
+    {
+        EXPB = min(EXPA,TMP)
+        EXPA = #0
+        AL = #0x0030
+    }
+    {
+        B = extractu(ATMP,EXPBA)
+        ATMP = asr(ATMP,EXPB)
+    }
+    {
+        p0 = cmp.gtu(ONE,B)
+        if (!p0.new) ATMPL = or(ATMPL,S_ONE)
+        ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
+    }
+    {
+        CTMP = neg(ATMP)
+        p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
+        if (!p1.new) AH = or(AH,AL)
+        B = #0
+    }
+    {
+        if (p3) CTMP = ATMP
+        USR = AH
+        TMP = #-BIAS-(MANTBITS+FUDGE2)
+    }
+    {
+        A = convert_d2df(CTMP)
+    }
+    {
+        AH += asl(TMP,#HI_MANTBITS)
+        dealloc_return
+    }
+.Lpossible_unf:
+    {
+        TMP = ##0x7fefffff
+        ATMP = abs(CTMP)
+    }
+    {
+        p0 = cmp.eq(AL,#0)
+        p0 = bitsclr(AH,TMP)
+        if (!p0.new) dealloc_return:t
+        TMP = #0x7fff
+    }
+    {
+        p0 = bitsset(ATMPH,TMP)
+        BH = USR
+        BL = #0x0030
+    }
+    {
+        if (p0) BH = or(BH,BL)
+    }
+    {
+        USR = BH
+    }
+    {
+        p0 = dfcmp.eq(A,A)
+        dealloc_return
+    }
+.Lfma_ovf:
+    {
+        TMP = USR
+        CTMP = combine(##0x7fefffff,#-1)
+        A = CTMP
+    }
+    {
+        ATMP = combine(##0x7ff00000,#0)
+        BH = extractu(TMP,#2,#SR_ROUND_OFF)
+        TMP = or(TMP,#0x28)
+    }
+    {
+        USR = TMP
+        BH ^= lsr(AH,#31)
+        BL = BH
+    }
+    {
+        p0 = !cmp.eq(BL,#1)
+        p0 = !cmp.eq(BH,#2)
+    }
+    {
+        p0 = dfcmp.eq(ATMP,ATMP)
+        if (p0.new) CTMP = ATMP
+    }
+    {
+        A = insert(CTMP,#63,#0)
+        dealloc_return
+    }
+#undef CTMP
+#undef CTMPH
+#undef CTMPL
+#define BTMP r11:10
+#define BTMPH r11
+#define BTMPL r10
+
+#undef STICKIES
+#undef STICKIESH
+#undef STICKIESL
+#define C r5:4
+#define CH r5
+#define CL r4
+
+.Lfma_abnormal_ab:
+    {
+        ATMP = extractu(A,#63,#0)
+        BTMP = extractu(B,#63,#0)
+        deallocframe
+    }
+    {
+        p3 = cmp.gtu(ATMP,BTMP)
+        if (!p3.new) A = B                      // sort values
+        if (!p3.new) B = A
+    }
+    {
+        p0 = dfclass(A,#0x0f)                   // A NaN?
+        if (!p0.new) jump:nt .Lnan
+        if (!p3) ATMP = BTMP
+        if (!p3) BTMP = ATMP
+    }
+    {
+        p1 = dfclass(A,#0x08)                   // A is infinity
+        p1 = dfclass(B,#0x0e)                   // B is nonzero
+    }
+    {
+        p0 = dfclass(A,#0x08)                   // a is inf
+        p0 = dfclass(B,#0x01)                   // b is zero
+    }
+    {
+        if (p1) jump .Lab_inf
+        p2 = dfclass(B,#0x01)
+    }
+    {
+        if (p0) jump .Linvalid
+        if (p2) jump .Lab_true_zero
+        TMP = ##0x7c000000
+    }
+    // We are left with a normal or subnormal times a subnormal, A > B
+    // If A and B are both very small, we will go to a single sticky bit; replace
+    // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
+    // if A and B might multiply to something bigger, decrease A exp and increase B exp
+    // and start over
+    {
+        p0 = bitsclr(AH,TMP)
+        if (p0.new) jump:nt .Lfma_ab_tiny
+    }
+    {
+        TMP = add(clb(BTMP),#-EXPBITS)
+    }
+    {
+        BTMP = asl(BTMP,TMP)
+    }
+    {
+        B = insert(BTMP,#63,#0)
+        AH -= asl(TMP,#HI_MANTBITS)
+    }
+    jump .Lfma_begin
+
+.Lfma_ab_tiny:
+    ATMP = combine(##0x00100000,#0)
+    {
+        A = insert(ATMP,#63,#0)
+        B = insert(ATMP,#63,#0)
+    }
+    jump .Lfma_begin
+
+.Lab_inf:
+    {
+        B = lsr(B,#63)
+        p0 = dfclass(C,#0x10)
+    }
+    {
+        A ^= asl(B,#63)
+        if (p0) jump .Lnan
+    }
+    {
+        p1 = dfclass(C,#0x08)
+        if (p1.new) jump:nt .Lfma_inf_plus_inf
+    }
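
For .Lfma_abnormal_ab above, the comments describe rescaling: when the smaller operand of the product is subnormal, normalize it with a left shift and take the same amount out of the other operand's exponent, then restart. A hedged C sketch of that idea using libm scaling (rescale_pair is an illustrative name; the case where both operands are tiny is handled separately by .Lfma_ab_tiny and is not covered here):

    #include <float.h>
    #include <math.h>

    /* Scale a subnormal *b up to a normal value and compensate in *a,
     * leaving the product a*b (and therefore fma(a,b,c)) unchanged as
     * long as the scaled-down *a stays normal. */
    static void rescale_pair(double *a, double *b)
    {
        if (*b != 0.0 && fabs(*b) < DBL_MIN) {        /* *b is subnormal */
            int k = (DBL_MIN_EXP - 1) - ilogb(*b);    /* left shift that normalizes *b */
            *b = ldexp(*b, k);
            *a = ldexp(*a, -k);                       /* take k out of *a's exponent */
        }
    }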
+    // A*B is +/- inf, C is finite.  Return A
+    {
+        jumpr r31
+    }
+    .falign
+.Lfma_inf_plus_inf:
+    {   // adding infinities of different signs is invalid
+        p0 = dfcmp.eq(A,C)
+        if (!p0.new) jump:nt .Linvalid
+    }
+    {
+        jumpr r31
+    }
+
+.Lnan:
+    {
+        p0 = dfclass(B,#0x10)
+        p1 = dfclass(C,#0x10)
+        if (!p0.new) B = A
+        if (!p1.new) C = A
+    }
+    {   // find sNaNs
+        BH = convert_df2sf(B)
+        BL = convert_df2sf(C)
+    }
+    {
+        BH = convert_df2sf(A)
+        A = #-1
+        jumpr r31
+    }
+
+.Linvalid:
+    {
+        TMP = ##0x7f800001                      // sp snan
+    }
+    {
+        A = convert_sf2df(TMP)
+        jumpr r31
+    }
+
+.Lab_true_zero:
+    // B is zero, A is finite number
+    {
+        p0 = dfclass(C,#0x10)
+        if (p0.new) jump:nt .Lnan
+        if (p0.new) A = C
+    }
+    {
+        p0 = dfcmp.eq(B,C)                      // is C also zero?
+        AH = lsr(AH,#31)                        // get sign
+    }
+    {
+        BH ^= asl(AH,#31)                       // form correctly signed zero in B
+        if (!p0) A = C                          // If C is not zero, return C
+        if (!p0) jumpr r31
+    }
+    // B has correctly signed zero, C is also zero
+.Lzero_plus_zero:
+    {
+        p0 = cmp.eq(B,C)                        // yes, scalar equals.  +0++0 or -0+-0
+        if (p0.new) jumpr:t r31
+        A = B
+    }
+    {
+        TMP = USR
+    }
+    {
+        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+        A = #0
+    }
+    {
+        p0 = cmp.eq(TMP,#2)
+        if (p0.new) AH = ##0x80000000
+        jumpr r31
+    }
+#undef BTMP
+#undef BTMPH
+#undef BTMPL
+#define CTMP r11:10
+    .falign
+.Lfma_abnormal_c:
+    // We know that AB is normal * normal
+    // C is not normal: zero, subnormal, inf, or NaN.
+    {
+        p0 = dfclass(C,#0x10)                   // is C NaN?
+        if (p0.new) jump:nt .Lnan
+        if (p0.new) A = C                       // move NaN to A
+        deallocframe
+    }
+    {
+        p0 = dfclass(C,#0x08)                   // is C inf?
+        if (p0.new) A = C                       // return C
+        if (p0.new) jumpr:nt r31
+    }
+    // zero or subnormal
+    // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
+    {
+        p0 = dfclass(C,#0x01)                   // is C zero?
+        if (p0.new) jump:nt __hexagon_muldf3
+        TMP = #1
+    }
+    // Left with: subnormal
+    // Adjust C and jump back to restart
+    {
+        allocframe(#STACKSPACE)                 // oops, deallocated above, re-allocate frame
+        CTMP = #0
+        CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
+        jump .Lfma_abnormal_c_restart
+    }
+END(fma)
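
Finally, a hypothetical smoke test for the routine in this diff. __hexagon_fmadf4 is the double-precision FMA entry point defined above; on a Hexagon target it should agree with libm's fma() on cases where an unfused a*b+c would lose the benefit of fusing (the specific test values are illustrative, not part of any test suite):

    #include <math.h>
    #include <stdio.h>

    double __hexagon_fmadf4(double a, double b, double c);

    int main(void)
    {
        double cases[][3] = {
            { 1.0e308, 2.0, -1.0e308 },  /* a*b alone overflows, the fused result does not */
            { 3.0, 1.0 / 3.0, -1.0 },    /* exposes the rounding error of 1.0/3.0 */
            { 5.0e-324, 0.5, 0.0 },      /* subnormal operand */
        };
        for (unsigned i = 0; i < sizeof cases / sizeof cases[0]; i++) {
            double a = cases[i][0], b = cases[i][1], c = cases[i][2];
            printf("fma(%g, %g, %g) = %g (libm: %g)\n",
                   a, b, c, __hexagon_fmadf4(a, b, c), fma(a, b, c));
        }
        return 0;
    }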
