diff options
Diffstat (limited to 'lib/Target/NVPTX/NVPTXInstrInfo.td')
-rw-r--r-- | lib/Target/NVPTX/NVPTXInstrInfo.td | 13 |
1 file changed, 9 insertions, 4 deletions
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 62da3c79f465..fe7a84f9a361 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; @@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // ctz instruction always returns a 32-bit value. For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. -def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the @@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 
-def : Pat<(ctlz Int16Regs:$a), +def : Pat<(i16 (ctlz Int16Regs:$a)), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; // Population count @@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; // If we know that we're storing into an i32, we can avoid the final trunc. def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 |