1 files changed, 80 insertions, 8 deletions
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index d969a9e1ab3a..bccbbd4591ed 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -17,6 +17,10 @@ class CCIfAlign<string Align, CCAction A> :
 class CCIfBigEndian<CCAction A> :
   CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
 
+class CCIfILP32<CCAction A> :
+  CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>;
+
+
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
 //===----------------------------------------------------------------------===//
@@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[
 
   CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
 
+  CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+            nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+           CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+  CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+            nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+           CCPassIndirect<i64>>,
+
+  CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+           CCAssignToReg<[P0, P1, P2, P3]>>,
+  CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+           CCPassIndirect<i64>>,
+
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
 
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
   CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
@@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[
       CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                               [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
-      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+            nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+           CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+
+  CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+           CCAssignToReg<[P0, P1, P2, P3]>>
 ]>;
 
 // Vararg functions on windows pass floats in integer registers
@@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
   CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Re-demote pointers to 32-bits so we don't end up storing 64-bit
+  // values and clobbering neighbouring stack locations. Not very pretty.
+  CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+  CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
+
   CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
@@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
            CCAssignToStack<16, 16>>
 ]>;
 
+// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the
+// same as the normal Darwin VarArgs handling.
+let Entry = 1 in
+def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  // Handle all scalar types as either i32 or f32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f16],     CCPromoteToType<f32>>,
+
+  // Everything is on the stack.
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
+]>;
+
+
 // The WebKit_JS calling convention only passes the first argument (the callee)
 // in register and the remaining arguments on stack. We allow 32bit stack slots,
 // so that WebKit can write partial values in the stack and define the other
@@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[
   CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
 ]>;
 
+// The order of the callee-saves in this file is important, because the
+// FrameLowering code will use this order to determine the layout the
+// callee-save area in the stack frame. As can be observed below, Darwin
+// requires the frame-record (LR, FP) to be at the top the callee-save area,
+// whereas for other platforms they are at the bottom.
+
 // FIXME: LR is only callee-saved in the sense that *we* preserve it and are
 // presumably a callee to someone. External functions may not do so, but this
 // is currently safe since BL has LR as an implicit-def and what happens after a
@@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[
 // It would be better to model its preservation semantics properly (create a
 // vreg on entry, use it in RET & tail call generation; make that vreg def if we
 // end up saving LR as part of a call frame). Watch this space...
-def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+                                           X25, X26, X27, X28, LR, FP,
+                                           D8,  D9,  D10, D11,
+                                           D12, D13, D14, D15)>;
+
+// Darwin puts the frame-record at the top of the callee-save area.
+def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
                                            X23, X24, X25, X26, X27, X28,
                                            D8,  D9,  D10, D11,
                                            D12, D13, D14, D15)>;
@@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
 // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
 // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
 // and not (LR,FP) pairs.
-def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22,
-                                               X23, X24, X25, X26, X27, X28,
+def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+                                               X25, X26, X27, X28, FP, LR,
                                                D8, D9, D10, D11,
                                                D12, D13, D14, D15)>;
 
 // AArch64 PCS for vector functions (VPCS)
 // must (additionally) preserve full Q8-Q23 registers
-def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
-                                          X23, X24, X25, X26, X27, X28,
+def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+                                          X25, X26, X27, X28, LR, FP,
                                           (sequence "Q%u", 8, 23))>;
 
+// Functions taking SVE arguments or returning an SVE type
+// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+                                               X25, X26, X27, X28, LR, FP,
+                                               (sequence "Z%u", 8, 23),
+                                               (sequence "P%u", 4, 15))>;
+
 // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
 // 'this' and the pointer return value are both passed in X0 in these cases,
 // this can be partially modelled by treating X0 as a callee-saved register;
@@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
 def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
 
 def CSR_AArch64_AAPCS_SwiftError
-    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
+    : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
 
 // The function used by Darwin to obtain the address of a thread-local variable
 // guarantees more than a normal AAPCS function. x16 and x17 are used on the
@@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin
 // fast path calls a function that follows CSR_AArch64_TLS_Darwin,
 // CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
 def CSR_AArch64_CXX_TLS_Darwin
-    : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+    : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
                            (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
                            (sequence "D%u", 0, 31))>;