diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-06-01 20:58:36 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-06-01 20:58:36 +0000 |
commit | f382538d471e38a9b98f016c4caebd24c8d60b62 (patch) | |
tree | d30f3d58b1044b5355d50c17a6a96c6a0b35703a /lib/Target | |
parent | ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff) | |
download | src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.tar.gz src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.zip |
Notes
Diffstat (limited to 'lib/Target')
74 files changed, 1829 insertions, 683 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 4af5fef4287cc..abe28460c83ab 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -190,6 +190,7 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -226,6 +227,7 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td index 3fbbc0be682d7..3b71cf8399a0d 100644 --- a/lib/Target/AArch64/AArch64SchedM1.td +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel { let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. - let CompleteModel = 0; // Use the default model otherwise. + let CompleteModel = 1; // Use the default model otherwise. } //===----------------------------------------------------------------------===// @@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, +def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteL5]>]>; def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; } def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, +def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteS1]>]>; @@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; } // Load instructions. def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; } -def : SchedAlias<WriteLDIdx, M1WriteLA>; +def : SchedAlias<WriteLDIdx, M1WriteLX>; // Store instructions. def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; } -def : SchedAlias<WriteSTIdx, M1WriteSA>; +def : SchedAlias<WriteSTIdx, M1WriteSX>; // FP data instructions. def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } @@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]> { let Latency = 2; } +def M1WriteVLDA : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 6; } +def M1WriteVLDB : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 7; } +def M1WriteVLDC : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 8; } +def M1WriteVLDD : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDE : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDF : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 10; + let ResourceCycles = [5]; } +def M1WriteVLDG : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDH : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDI : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 12; + let ResourceCycles = [6]; } +def M1WriteVLDJ : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDK : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDL : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDM : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDN : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 14; + let ResourceCycles = [7]; } + +def M1WriteVSTA : WriteSequence<[WriteVST], 2>; +def M1WriteVSTB : WriteSequence<[WriteVST], 3>; +def M1WriteVSTC : WriteSequence<[WriteVST], 4>; +def M1WriteVSTD : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 7; + let ResourceCycles = [7]; } +def M1WriteVSTE : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 8; + let ResourceCycles = [8]; } +def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 15; + let ResourceCycles = [15]; } +def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 16; + let ResourceCycles = [16]; } +def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 14; + let ResourceCycles = [14]; } +def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 17; + let ResourceCycles = [17]; } // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; @@ -360,8 +465,233 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64 def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. +def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[M1WriteVLDD, + WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; +def : InstRW<[M1WriteVLDE, + WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>; +def : InstRW<[M1WriteVLDH, + WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>; +def : InstRW<[M1WriteVLDL, + WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>; +def : InstRW<[M1WriteVLDM, + WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; // ASIMD store instructions. +def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index cb3f72a524f51..d4a8cecdb29f1 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -256,9 +256,9 @@ namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: - AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -317,7 +317,7 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new AArch64PassConfig(this, PM); + return new AArch64PassConfig(*this, PM); } void AArch64PassConfig::addIRPasses() { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f473944cd5283..0959014812d8b 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -503,40 +503,37 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (!FrameInfo.hasCalls()) { - Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || - MRI.isPhysRegUsed(AMDGPU::VCC_HI); - - // If there are no calls, MachineRegisterInfo can tell us the used register - // count easily. - - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } - } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. - return Info; + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } } - llvm_unreachable("calls not implemented"); + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 48827f4639974..596f02ae4a649 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -456,7 +456,7 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // Exceptions and StackMaps are not supported, so these passes will never do // anything. @@ -487,7 +487,7 @@ public: class R600PassConfig final : public AMDGPUPassConfig { public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} ScheduleDAGInstrs *createMachineScheduler( @@ -503,7 +503,7 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} GCNTargetMachine &getGCNTargetMachine() const { @@ -682,7 +682,7 @@ void R600PassConfig::addPreEmitPass() { } TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); + return new R600PassConfig(*this, PM); } //===----------------------------------------------------------------------===// @@ -844,6 +844,6 @@ void GCNPassConfig::addPreEmitPass() { } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); + return new GCNPassConfig(*this, PM); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 934bf7f31bab4..a3c7c1982d0a6 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -69,7 +69,6 @@ public: return -1; return 0; } - }; //===----------------------------------------------------------------------===// @@ -89,6 +88,10 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; const R600Subtarget *getSubtargetImpl(const Function &) const override; + + bool isMachineVerifierClean() const override { + return false; + } }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f5541e08e1b72..cc68c971b2490 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -161,7 +161,8 @@ public: ImmTyOpSel, ImmTyOpSelHi, ImmTyNegLo, - ImmTyNegHi + ImmTyNegHi, + ImmTySwizzle }; struct TokOp { @@ -474,6 +475,7 @@ public: bool isSWaitCnt() const; bool isHwreg() const; bool isSendMsg() const; + bool isSwizzle() const; bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; @@ -659,6 +661,7 @@ public: case ImmTyOpSelHi: OS << "OpSelHi"; break; case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; + case ImmTySwizzle: OS << "Swizzle"; break; } } @@ -994,6 +997,12 @@ private: bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool trySkipId(const StringRef Id); + bool trySkipToken(const AsmToken::TokenKind Kind); + bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); + bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseExpr(int64_t &Imm); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1003,6 +1012,19 @@ public: OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg); + OperandMatchResultTy parseSwizzleOp(OperandVector &Operands); + bool parseSwizzleOffset(int64_t &Imm); + bool parseSwizzleMacro(int64_t &Imm); + bool parseSwizzleQuadPerm(int64_t &Imm); + bool parseSwizzleBitmaskPerm(int64_t &Imm); + bool parseSwizzleBroadcast(int64_t &Imm); + bool parseSwizzleSwap(int64_t &Imm); + bool parseSwizzleReverse(int64_t &Imm); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } @@ -2785,7 +2807,13 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, OptionalIdx[Op.getImmTy()] = i; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + AMDGPUOperand::ImmTy OffsetType = + (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle : + AMDGPUOperand::ImmTyOffset; + + addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); + if (!IsGdsHardcoded) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } @@ -3384,6 +3412,298 @@ bool AMDGPUOperand::isSendMsg() const { } //===----------------------------------------------------------------------===// +// parser helpers +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id) { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == Id) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { + if (getLexer().getKind() == Kind) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, + const StringRef ErrMsg) { + if (!trySkipToken(Kind)) { + Error(Parser.getTok().getLoc(), ErrMsg); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseExpr(int64_t &Imm) { + return !getParser().parseAbsoluteExpression(Imm); +} + +bool +AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { + SMLoc S = Parser.getTok().getLoc(); + if (getLexer().getKind() == AsmToken::String) { + Val = Parser.getTok().getStringContents(); + Parser.Lex(); + return true; + } else { + Error(S, ErrMsg); + return false; + } +} + +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +LLVM_READNONE +static unsigned +encodeBitmaskPerm(const unsigned AndMask, + const unsigned OrMask, + const unsigned XorMask) { + using namespace llvm::AMDGPU::Swizzle; + + return BITMASK_PERM_ENC | + (AndMask << BITMASK_AND_SHIFT) | + (OrMask << BITMASK_OR_SHIFT) | + (XorMask << BITMASK_XOR_SHIFT); +} + +bool +AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg) { + for (unsigned i = 0; i < OpNum; ++i) { + if (!skipToken(AsmToken::Comma, "expected a comma")){ + return false; + } + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (!parseExpr(Op[i])) { + return false; + } + if (Op[i] < MinVal || Op[i] > MaxVal) { + Error(ExprLoc, ErrMsg); + return false; + } + } + + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + int64_t Lane[LANE_NUM]; + if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, + "expected a 2-bit lane id")) { + Imm = QUAD_PERM_ENC; + for (auto i = 0; i < LANE_NUM; ++i) { + Imm |= Lane[i] << (LANE_SHIFT * i); + } + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + int64_t LaneIdx; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, + "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + if (parseSwizzleOperands(1, &LaneIdx, + 0, GroupSize - 1, + "lane id must be in the interval [0,group size - 1]")) { + Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 1, 16, "group size must be in the interval [1,16]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!skipToken(AsmToken::Comma, "expected a comma")) { + return false; + } + + StringRef Ctl; + SMLoc StrLoc = Parser.getTok().getLoc(); + if (!parseString(Ctl)) { + return false; + } + if (Ctl.size() != BITMASK_WIDTH) { + Error(StrLoc, "expected a 5-character mask"); + return false; + } + + unsigned AndMask = 0; + unsigned OrMask = 0; + unsigned XorMask = 0; + + for (size_t i = 0; i < Ctl.size(); ++i) { + unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i); + switch(Ctl[i]) { + default: + Error(StrLoc, "invalid mask"); + return false; + case '0': + break; + case '1': + OrMask |= Mask; + break; + case 'p': + AndMask |= Mask; + break; + case 'i': + AndMask |= Mask; + XorMask |= Mask; + break; + } + } + + Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) { + + SMLoc OffsetLoc = Parser.getTok().getLoc(); + + if (!parseExpr(Imm)) { + return false; + } + if (!isUInt<16>(Imm)) { + Error(OffsetLoc, "expected a 16-bit offset"); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (skipToken(AsmToken::LParen, "expected a left parentheses")) { + + SMLoc ModeLoc = Parser.getTok().getLoc(); + bool Ok = false; + + if (trySkipId(IdSymbolic[ID_QUAD_PERM])) { + Ok = parseSwizzleQuadPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) { + Ok = parseSwizzleBitmaskPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BROADCAST])) { + Ok = parseSwizzleBroadcast(Imm); + } else if (trySkipId(IdSymbolic[ID_SWAP])) { + Ok = parseSwizzleSwap(Imm); + } else if (trySkipId(IdSymbolic[ID_REVERSE])) { + Ok = parseSwizzleReverse(Imm); + } else { + Error(ModeLoc, "expected a swizzle mode"); + } + + return Ok && skipToken(AsmToken::RParen, "expected a closing parentheses"); + } + + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (trySkipId("offset")) { + + bool Ok = false; + if (skipToken(AsmToken::Colon, "expected a colon")) { + if (trySkipId("swizzle")) { + Ok = parseSwizzleMacro(Imm); + } else { + Ok = parseSwizzleOffset(Imm); + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle)); + + return Ok? MatchOperand_Success : MatchOperand_ParseFail; + } else { + return MatchOperand_NoMatch; + } +} + +bool +AMDGPUOperand::isSwizzle() const { + return isImmTy(ImmTySwizzle); +} + +//===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index 357e18108e7e8..fc516c3b39c28 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -145,10 +145,10 @@ class DS_1A2D_Off8_RET<string opName, let hasPostISelHook = 1; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32> +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, offset:$offset, gds:$gds), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds), "$vdst, $addr$offset$gds"> { let has_data0 = 0; @@ -440,7 +440,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; } let mayStore = 0 in { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index a817ff3cbaf09..523eea41897ea 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -1160,6 +1160,112 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, O << SImm16; // Unknown simm16 code. } +static void printSwizzleBitmask(const uint16_t AndMask, + const uint16_t OrMask, + const uint16_t XorMask, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask; + uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask; + + O << "\""; + + for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) { + uint16_t p0 = Probe0 & Mask; + uint16_t p1 = Probe1 & Mask; + + if (p0 == p1) { + if (p0 == 0) { + O << "0"; + } else { + O << "1"; + } + } else { + if (p0 == 0) { + O << "p"; + } else { + O << "i"; + } + } + } + + O << "\""; +} + +void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == 0) { + return; + } + + O << " offset:"; + + if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { + + O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; + for (auto i = 0; i < LANE_NUM; ++i) { + O << ","; + O << formatDec(Imm & LANE_MASK); + Imm >>= LANE_SHIFT; + } + O << ")"; + + } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) { + + uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK; + uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK; + uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK; + + if (AndMask == BITMASK_MAX && + OrMask == 0 && + countPopulation(XorMask) == 1) { + + O << "swizzle(" << IdSymbolic[ID_SWAP]; + O << ","; + O << formatDec(XorMask); + O << ")"; + + } else if (AndMask == BITMASK_MAX && + OrMask == 0 && XorMask > 0 && + isPowerOf2_64(XorMask + 1)) { + + O << "swizzle(" << IdSymbolic[ID_REVERSE]; + O << ","; + O << formatDec(XorMask + 1); + O << ")"; + + } else { + + uint16_t GroupSize = BITMASK_MAX - AndMask + 1; + if (GroupSize > 1 && + isPowerOf2_64(GroupSize) && + OrMask < GroupSize && + XorMask == 0) { + + O << "swizzle(" << IdSymbolic[ID_BROADCAST]; + O << ","; + O << formatDec(GroupSize); + O << ","; + O << formatDec(OrMask); + O << ")"; + + } else { + O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM]; + O << ","; + printSwizzleBitmask(AndMask, OrMask, XorMask, O); + O << ")"; + } + } + } else { + printU16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c0b8e5c510893..c8094c4b840a1 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -193,6 +193,8 @@ private: raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 80967edee0ab1..5cd90323ff67b 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -281,6 +281,46 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] } // namespace Hwreg +namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. + +enum Id { // id of symbolic names + ID_QUAD_PERM = 0, + ID_BITMASK_PERM, + ID_SWAP, + ID_REVERSE, + ID_BROADCAST +}; + +enum EncBits { + + // swizzle mode encodings + + QUAD_PERM_ENC = 0x8000, + QUAD_PERM_ENC_MASK = 0xFF00, + + BITMASK_PERM_ENC = 0x0000, + BITMASK_PERM_ENC_MASK = 0x8000, + + // QUAD_PERM encodings + + LANE_MASK = 0x3, + LANE_MAX = LANE_MASK, + LANE_SHIFT = 2, + LANE_NUM = 4, + + // BITMASK_PERM encodings + + BITMASK_MASK = 0x1F, + BITMASK_MAX = BITMASK_MASK, + BITMASK_WIDTH = 5, + + BITMASK_AND_SHIFT = 0, + BITMASK_OR_SHIFT = 5, + BITMASK_XOR_SHIFT = 10 +}; + +} // namespace Swizzle + namespace SDWA { enum SdwaSel { diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b5e3ce3dfe3ed..e22166d03e9ae 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -826,7 +826,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { @@ -1149,8 +1150,10 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. - if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) { - if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + uint64_t TSFlags = Inst.getDesc().TSFlags; + if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && + TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { @@ -1183,7 +1186,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && - (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) { + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { @@ -1715,6 +1718,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); MLI = &getAnalysis<MachineLoopInfo>(); IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); AMDGPUASI = ST->getAMDGPUAS(); HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1859,5 +1863,19 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Modified = true; + } + return Modified; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c5287c7f64ba4..445bf79a7814e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -383,6 +383,14 @@ def SendMsgMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def SwizzleMatchClass : AsmOperandClass { + let Name = "Swizzle"; + let PredicateMethod = "isSwizzle"; + let ParserMethod = "parseSwizzleOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -395,6 +403,11 @@ def SendMsgImm : Operand<i32> { let ParserMatchClass = SendMsgMatchClass; } +def SwizzleImm : Operand<i16> { + let PrintMethod = "printSwizzle"; + let ParserMatchClass = SwizzleMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index b91cdddc5520f..a648c178101a1 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -66,6 +66,12 @@ public: const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + // Stack access is very expensive. CSRs are also the high registers, and we + // want to minimize the number of used registers. + unsigned getCSRFirstUseCost() const override { + return 100; + } + unsigned getFrameRegister(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index b6868de6a74e3..03b11ae80500e 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -65,5 +65,18 @@ const char* const IdSymbolic[] = { }; } // namespace Hwreg + +namespace Swizzle { + +// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + "QUAD_PERM", + "BITMASK_PERM", + "SWAP", + "REVERSE", + "BROADCAST", +}; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index b2dc2c0e364cd..ebb2be22b4879 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax. extern const char* const IdSymbolic[]; } // namespace Hwreg + +namespace Swizzle { // Symbolic names for the swizzle(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 90baabcdb6520..ec49f0d37af44 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -757,14 +757,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MI.eraseFromParent(); } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as -/// possible. This only gets used at -O0 so we don't care about efficiency of the -/// generated code. +/// possible. This only gets used at -O0 so we don't care about efficiency of +/// the generated code. bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, @@ -773,16 +768,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, bool IsThumb = STI->isThumb(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -795,25 +789,35 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, if (UxtOp) { MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg()) - .addReg(Desired.getReg(), RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg) + .addReg(DesiredReg, RegState::Kill); if (!IsThumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); } // .Lloadcmp: + // mov wStatus, #0 // ldrex rDest, [rAddr] // cmp rDest, rDesired // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); + if (!StatusDead) { + if (IsThumb) { + BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg) + .addDef(ARM::CPSR, RegState::Dead) + .addImm(0) + .add(predOps(ARMCC::AL)); + } else { + BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } + } MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg()); - MIB.addReg(Addr.getReg()); + MIB.addReg(AddrReg); if (LdrexOp == ARM::t2LDREX) MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); @@ -821,7 +825,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .add(predOps(ARMCC::AL)); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; BuildMI(LoadCmpBB, DL, TII->get(Bcc)) @@ -835,21 +839,16 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, // strex rStatus, rNew, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - - MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg); - MIB.add(New); - MIB.add(Addr); + MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); if (StrexOp == ARM::t2STREX) MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -861,12 +860,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -894,19 +905,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + MachineOperand New = MI.getOperand(4); + New.setIsKill(false); unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -922,26 +933,21 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // cmp rDestLo, rDesiredLo // sbcs rStatus<dead>, rDestHi, rDesiredHi // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD; MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD)); addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI); - MIB.addReg(Addr.getReg()).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestLo, getKillRegState(Dest.isDead())) - .addReg(DesiredLo, getKillRegState(Desired.isDead())) + .addReg(DesiredLo) .add(predOps(ARMCC::AL)); BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestHi, getKillRegState(Dest.isDead())) - .addReg(DesiredHi, getKillRegState(Desired.isDead())) + .addReg(DesiredHi) .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; @@ -956,18 +962,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // strexd rStatus, rNewLo, rNewHi, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD; MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg); addExclusiveRegPair(MIB, New, 0, IsThumb, TRI); - MIB.add(Addr).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -979,12 +981,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 4f7a0ab4e2203..c2b2502843c0a 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -968,8 +968,9 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) continue; - bool isLiveIn = MF.getRegInfo().isLiveIn(Reg); - if (!isLiveIn) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool isLiveIn = MRI.isLiveIn(Reg); + if (!isLiveIn && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); // If NoGap is true, push consecutive registers and then leave the rest // for other instructions. e.g. diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index bee83dfb6f636..423f97ccacd64 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1413,7 +1413,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them // and make use of the same compressed jump table format as Thumb-2. -let Size = 2 in { +let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, + isIndirectBranch = 1 in { def tTBB_JT : tPseudoInst<(outs), (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index bf3d820e7b7d0..45471a4e95b39 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3494,7 +3494,8 @@ def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br, let AsmMatchConverter = "cvtThumbBranches"; } -let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { +let Size = 4, isNotDuplicable = 1, isBranch = 1, isTerminator = 1, + isBarrier = 1, isIndirectBranch = 1 in { // available in both v8-M.Baseline and Thumb2 targets def t2BR_JT : t2basePseudoInst<(outs), diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c4f23c66e4eab..f5e4043882ff7 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -382,7 +382,7 @@ namespace { /// ARM Code Generator Pass Configuration Options. class ARMPassConfig : public TargetPassConfig { public: - ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM) + ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} ARMBaseTargetMachine &getARMTargetMachine() const { @@ -419,7 +419,7 @@ INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix", "ARM Execution Dependency Fix", false, false) TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARMPassConfig(this, PM); + return new ARMPassConfig(*this, PM); } void ARMPassConfig::addIRPasses() { diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index e5eb27114c726..2fcee73228fe7 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -60,6 +60,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// ARM/Thumb little endian target machine. diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 40bf545e83224..b0d1d3fb9ef0b 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -729,6 +729,15 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // linker can handle it. GNU AS produces an error in this case. if (Sym->isExternal() || Value >= 0x400004) IsResolved = false; + // When an ARM function is called from a Thumb function, produce a + // relocation so the linker will use the correct branch instruction for ELF + // binaries. + if (Sym->isELF()) { + unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType(); + if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) && + !Asm.isThumbFunc(Sym)) + IsResolved = false; + } } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f917c35b9cebb..f10427e2ed575 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -698,13 +698,14 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, CopyRegs.insert(ArgReg); // Push the low registers and lr + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!LoRegsToSave.empty()) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) { if (LoRegsToSave.count(Reg)) { - bool isKill = !MF.getRegInfo().isLiveIn(Reg); - if (isKill) + bool isKill = !MRI.isLiveIn(Reg); + if (isKill && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); MIB.addReg(Reg, getKillRegState(isKill)); @@ -746,8 +747,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, SmallVector<unsigned, 4> RegsToPush; while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { if (HiRegsToSave.count(*HiRegToSave)) { - bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave); - if (isKill) + bool isKill = !MRI.isLiveIn(*HiRegToSave); + if (isKill && !MRI.isReserved(*HiRegToSave)) MBB.addLiveIn(*HiRegToSave); // Emit a MOV from the high reg to the low reg. diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index ef9c00e4b784e..7d3faac1dcc20 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -1500,9 +1500,9 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); // BB: - // cp 0, N + // cpi N, 0 // breq RemBB - BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0); + BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0); BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB); // LoopBB: diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index f10ca394f36ce..5dd8b2c27b212 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -904,7 +904,7 @@ let Defs = [SREG] in // Compares a register with an 8 bit immediate. def CPIRdK : FRdK<0b0011, (outs), - (ins GPR8:$rd, imm_ldi8:$k), + (ins LD8:$rd, imm_ldi8:$k), "cpi\t$rd, $k", [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>; } diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index fb3262916b4fd..2ab0b1080c6af 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -57,7 +57,7 @@ namespace { /// AVR Code Generator Pass Configuration Options. class AVRPassConfig : public TargetPassConfig { public: - AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM) + AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} AVRTargetMachine &getAVRTargetMachine() const { @@ -71,7 +71,7 @@ public: } // namespace TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) { - return new AVRPassConfig(this, PM); + return new AVRPassConfig(*this, PM); } extern "C" void LLVMInitializeAVRTarget() { diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h index 10345193d14af..795e94e6af03a 100644 --- a/lib/Target/AVR/AVRTargetMachine.h +++ b/lib/Target/AVR/AVRTargetMachine.h @@ -41,6 +41,10 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool isMachineVerifierClean() const override { + return false; + } + private: std::unique_ptr<TargetLoweringObjectFile> TLOF; AVRSubtarget SubTarget; diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 897695633e46b..cf8e735409042 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -58,7 +58,7 @@ namespace { // BPF Code Generator Pass Configuration Options. class BPFPassConfig : public TargetPassConfig { public: - BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM) + BPFPassConfig(BPFTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} BPFTargetMachine &getBPFTargetMachine() const { @@ -70,7 +70,7 @@ public: } TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { - return new BPFPassConfig(this, PM); + return new BPFPassConfig(*this, PM); } // Install an instruction selector pass using diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt index e2654b0465df1..4918653ff19da 100644 --- a/lib/Target/BPF/CMakeLists.txt +++ b/lib/Target/BPF/CMakeLists.txt @@ -4,7 +4,7 @@ tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM BPFGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv) diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 8e10c521a77d3..e4434136bf866 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -71,6 +71,9 @@ public: return true; } + bool ComplexPatternFuncMutatesDAG() const override { + return true; + } void PreprocessISelDAG() override; void EmitFunctionEntryCode() override; @@ -81,6 +84,7 @@ public: inline bool SelectAddrGP(SDValue &N, SDValue &R); bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP); bool SelectAddrFI(SDValue &N, SDValue &R); + bool DetectUseSxtw(SDValue &N, SDValue &R); StringRef getPassName() const override { return "Hexagon DAG->DAG Pattern Instruction Selection"; @@ -106,7 +110,6 @@ public: void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl); void SelectStore(SDNode *N); void SelectSHL(SDNode *N); - void SelectMul(SDNode *N); void SelectZeroExtend(SDNode *N); void SelectIntrinsicWChain(SDNode *N); void SelectIntrinsicWOChain(SDNode *N); @@ -118,7 +121,7 @@ public: #include "HexagonGenDAGISel.inc" private: - bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src); + bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src); bool isOrEquivalentToAdd(const SDNode *N) const; bool isAlignedMemNode(const MemSDNode *N) const; bool isPositiveHalfWord(const SDNode *N) const; @@ -597,90 +600,6 @@ void HexagonDAGToDAGISel::SelectStore(SDNode *N) { SelectCode(ST); } -void HexagonDAGToDAGISel::SelectMul(SDNode *N) { - SDLoc dl(N); - - // %conv.i = sext i32 %tmp1 to i64 - // %conv2.i = sext i32 %add to i64 - // %mul.i = mul nsw i64 %conv2.i, %conv.i - // - // --- match with the following --- - // - // %mul.i = mpy (%tmp1, %add) - // - - if (N->getValueType(0) == MVT::i64) { - // Shifting a i64 signed multiply. - SDValue MulOp0 = N->getOperand(0); - SDValue MulOp1 = N->getOperand(1); - - SDValue OP0; - SDValue OP1; - - // Handle sign_extend and sextload. - if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext0 = MulOp0.getOperand(0); - if (Sext0.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP0 = Sext0; - } else if (MulOp0.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Same goes for the second operand. - if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext1 = MulOp1.getOperand(0); - if (Sext1.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP1 = Sext1; - } else if (MulOp1.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Generate a mpy instruction. - SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl, - MVT::i64, OP0, OP1); - ReplaceNode(N, Result); - return; - } - - SelectCode(N); -} - void HexagonDAGToDAGISel::SelectSHL(SDNode *N) { SDLoc dl(N); SDValue Shl_0 = N->getOperand(0); @@ -843,7 +762,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { SDValue V = N->getOperand(1); SDValue U; - if (isValueExtension(V, Bits, U)) { + if (keepsLowBits(V, Bits, U)) { SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N->getOperand(0), U); ReplaceNode(N, R.getNode()); @@ -949,7 +868,6 @@ void HexagonDAGToDAGISel::Select(SDNode *N) { case ISD::SHL: return SelectSHL(N); case ISD::LOAD: return SelectLoad(N); case ISD::STORE: return SelectStore(N); - case ISD::MUL: return SelectMul(N); case ISD::ZERO_EXTEND: return SelectZeroExtend(N); case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N); case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N); @@ -1327,7 +1245,7 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { } // Match a frame index that can be used in an addressing mode. -bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) { +bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) { if (N.getOpcode() != ISD::FrameIndex) return false; auto &HFI = *HST->getFrameLowering(); @@ -1388,16 +1306,83 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R, return false; } -bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, - unsigned FromBits, SDValue &Src) { +bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { + // This (complex pattern) function is meant to detect a sign-extension + // i32->i64 on a per-operand basis. This would allow writing single + // patterns that would cover a number of combinations of different ways + // a sign-extensions could be written. For example: + // (mul (DetectUseSxtw x) (DetectUseSxtw y)) -> (M2_dpmpyss_s0 x y) + // could match either one of these: + // (mul (sext x) (sext_inreg y)) + // (mul (sext-load *p) (sext_inreg y)) + // (mul (sext_inreg x) (sext y)) + // etc. + // + // The returned value will have type i64 and its low word will + // contain the value being extended. The high bits are not specified. + // The returned type is i64 because the original type of N was i64, + // but the users of this function should only use the low-word of the + // result, e.g. + // (mul sxtw:x, sxtw:y) -> (M2_dpmpyss_s0 (LoReg sxtw:x), (LoReg sxtw:y)) + + if (N.getValueType() != MVT::i64) + return false; + EVT SrcVT; + unsigned Opc = N.getOpcode(); + switch (Opc) { + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + // sext_inreg has the source type as a separate operand. + EVT T = Opc == ISD::SIGN_EXTEND + ? N.getOperand(0).getValueType() + : cast<VTSDNode>(N.getOperand(1))->getVT(); + if (T.getSizeInBits() != 32) + return false; + R = N.getOperand(0); + break; + } + case ISD::LOAD: { + LoadSDNode *L = cast<LoadSDNode>(N); + if (L->getExtensionType() != ISD::SEXTLOAD) + return false; + // All extending loads extend to i32, so even if the value in + // memory is shorter than 32 bits, it will be i32 after the load. + if (L->getMemoryVT().getSizeInBits() > 32) + return false; + R = N; + break; + } + default: + return false; + } + EVT RT = R.getValueType(); + if (RT == MVT::i64) + return true; + assert(RT == MVT::i32); + // This is only to produce a value of type i64. Do not rely on the + // high bits produced by this. + const SDLoc &dl(N); + SDValue Ops[] = { + CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32) + }; + SDNode *T = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, + MVT::i64, Ops); + R = SDValue(T, 0); + return true; +} + +bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, + SDValue &Src) { unsigned Opc = Val.getOpcode(); switch (Opc) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: { - SDValue const &Op0 = Val.getOperand(0); + const SDValue &Op0 = Val.getOperand(0); EVT T = Op0.getValueType(); - if (T.isInteger() && T.getSizeInBits() == FromBits) { + if (T.isInteger() && T.getSizeInBits() == NumBits) { Src = Op0; return true; } @@ -1408,23 +1393,23 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, case ISD::AssertZext: if (Val.getOperand(0).getValueType().isInteger()) { VTSDNode *T = cast<VTSDNode>(Val.getOperand(1)); - if (T->getVT().getSizeInBits() == FromBits) { + if (T->getVT().getSizeInBits() == NumBits) { Src = Val.getOperand(0); return true; } } break; case ISD::AND: { - // Check if this is an AND with "FromBits" of lower bits set to 1. - uint64_t FromMask = (1 << FromBits) - 1; + // Check if this is an AND with NumBits of lower bits set to 1. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(0); return true; } @@ -1433,16 +1418,16 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, } case ISD::OR: case ISD::XOR: { - // OR/XOR with the lower "FromBits" bits set to 0. - uint64_t FromMask = (1 << FromBits) - 1; + // OR/XOR with the lower NumBits bits set to 0. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(0); return true; } diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 5ecf9320d5c26..4c6c6eeafbe0e 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1928,11 +1928,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BSWAP, MVT::i64, Legal); - - // We custom lower i64 to i64 mul, so that it is not considered as a legal - // operation. There is a pattern that will match i64 mul and transform it - // to a series of instructions. - setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MUL, MVT::i64, Legal); for (unsigned IntExpOp : { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 66e07c67958e9..0fef91ec4d3e0 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1769,161 +1769,6 @@ bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const { return getType(MI) == HexagonII::TypeCJ && MI.isBranch(); } -bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const { - return (MI.isBranch() && isPredicated(MI)) || - isConditionalTransfer(MI) || - isConditionalALU32(MI) || - isConditionalLoad(MI) || - // Predicated stores which don't have a .new on any operands. - (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) && - !isPredicatedNew(MI)); -} - -bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddif: - case Hexagon::A2_paddifnew: - case Hexagon::A2_paddit: - case Hexagon::A2_padditnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::C2_ccombinewf: - case Hexagon::C2_ccombinewt: - return true; - } - return false; -} - -// FIXME - Function name and it's functionality don't match. -// It should be renamed to hasPredNewOpcode() -bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const { - if (!MI.getDesc().mayLoad() || !isPredicated(MI)) - return false; - - int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode()); - // Instruction with valid predicated-new opcode can be promoted to .new. - return PNewOpcode >= 0; -} - -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. -bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. - case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - } -} - -bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - case Hexagon::A2_tfrpt: - case Hexagon::A2_tfrpf: - return true; - - default: - return false; - } - return false; -} - // TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle // isFPImm and later getFPImm as well. bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const { @@ -3474,6 +3319,8 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const { // Returns the opcode to use when converting MI, which is a conditional jump, // into a conditional instruction which uses the .new value of the predicate. // We also use branch probabilities to add a hint to the jump. +// If MBPI is null, all edges will be treated as equally likely for the +// purposes of establishing a predication hint. int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, const MachineBranchProbabilityInfo *MBPI) const { // We assume that block can have at most two successors. @@ -3482,9 +3329,16 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, bool Taken = false; const BranchProbability OneHalf(1, 2); + auto getEdgeProbability = [MBPI] (const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) { + if (MBPI) + return MBPI->getEdgeProbability(Src, Dst); + return BranchProbability(1, Src->succ_size()); + }; + if (BrTarget.isMBB()) { const MachineBasicBlock *Dst = BrTarget.getMBB(); - Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf; + Taken = getEdgeProbability(Src, Dst) >= OneHalf; } else { // The branch target is not a basic block (most likely a function). // Since BPI only gives probabilities for targets that are basic blocks, @@ -3521,7 +3375,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, for (const MachineBasicBlock *SB : B.successors()) { if (!B.isLayoutSuccessor(SB)) continue; - Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf; + Taken = getEdgeProbability(Src, SB) < OneHalf; break; } } else { @@ -3534,7 +3388,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, BT = Op.getMBB(); break; } - Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf; + Taken = BT && getEdgeProbability(Src, BT) < OneHalf; } } // if (!Bad) } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 97b9bc9546885..944d0161a7c8e 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -314,11 +314,6 @@ public: bool isAccumulator(const MachineInstr &MI) const; bool isComplex(const MachineInstr &MI) const; bool isCompoundBranchInstr(const MachineInstr &MI) const; - bool isCondInst(const MachineInstr &MI) const; - bool isConditionalALU32 (const MachineInstr &MI) const; - bool isConditionalLoad(const MachineInstr &MI) const; - bool isConditionalStore(const MachineInstr &MI) const; - bool isConditionalTransfer(const MachineInstr &MI) const; bool isConstExtended(const MachineInstr &MI) const; bool isDeallocRet(const MachineInstr &MI) const; bool isDependent(const MachineInstr &ProdMI, diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index e6ea67d55b43c..9aa185fc85a6a 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -59,6 +59,9 @@ cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); +static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000), + cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); + static const char *HexagonVolatileMemcpyName = "hexagon_memcpy_forward_vp4cp4n2"; @@ -477,7 +480,7 @@ Value *Simplifier::simplify(Context &C) { WorkListType Q; Q.push_back(C.Root); unsigned Count = 0; - const unsigned Limit = 100000; + const unsigned Limit = SimplifyLimit; while (!Q.empty()) { if (Count++ >= Limit) @@ -501,8 +504,7 @@ Value *Simplifier::simplify(Context &C) { Q.push_back(Op); } } - assert(Count < Limit && "Infinite loop in HLIR/simplify?"); - return C.Root; + return Count < Limit ? C.Root : nullptr; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 81b5e10c11731..70ed123bc8981 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -382,48 +382,42 @@ def: T_MType_acc_pat3 <M4_or_andn, and, or>; def: T_MType_acc_pat3 <M4_and_andn, and, and>; def: T_MType_acc_pat3 <M4_xor_andn, and, xor>; +// This complex pattern is really only to detect various forms of +// sign-extension i32->i64. The selected value will be of type i64 +// whose low word is the value being extended. The high word is +// unspecified. +def Usxtw : ComplexPattern<i64, 1, "DetectUseSxtw", [], []>; + def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>; -def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>; def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>; +def Sext64: PatLeaf<(i64 Usxtw:$Rs)>; -// Return true if for a 32 to 64-bit sign-extended load. -def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{ - LoadSDNode *LD = dyn_cast<LoadSDNode>(N); - if (!LD) - return false; - return LD->getExtensionType() == ISD::SEXTLOAD && - LD->getMemoryVT().getScalarType() == MVT::i32; -}]>; - -def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)), - (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>; - -def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)), - (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>; +def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)), + (M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>; -def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2), - (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>; +def: Pat<(mul Sext64:$Rs, Sext64:$Rt), + (M2_dpmpyss_s0 (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; // Multiply and accumulate, use full result. // Rxx[+-]=mpy(Rs,Rt) -def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_acc_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_nac_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset, InstHexagon MI> @@ -545,7 +539,8 @@ def: Storexm_simple_pat<truncstorei8, I64, LoReg, S2_storerb_io>; def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>; def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>; -def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext I32:$src)), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext_inreg I64:$src, i32)), (A2_sxtw (LoReg I64:$src))>; def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src), (A2_abs IntRegs:$src)>; @@ -1159,8 +1154,8 @@ multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> { defm: T_MinMax_pats<Op, I64, Inst, SwapInst>; } -def: Pat<(add (Sext64 I32:$Rs), I64:$Rt), - (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>; +def: Pat<(add Sext64:$Rs, I64:$Rt), + (A2_addsp (LoReg Sext64:$Rs), DoubleRegs:$Rt)>; let AddedComplexity = 200 in { defm: MinMax_pats_p<setge, A2_maxp, A2_minp>; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 8e93df6201aea..14ecf297d351c 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -223,7 +223,7 @@ namespace { /// Hexagon Code Generator Pass Configuration Options. class HexagonPassConfig : public TargetPassConfig { public: - HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM) + HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} HexagonTargetMachine &getHexagonTargetMachine() const { @@ -245,7 +245,7 @@ public: } // namespace TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { - return new HexagonPassConfig(this, PM); + return new HexagonPassConfig(*this, PM); } void HexagonPassConfig::addIRPasses() { diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cd474921d4bc8..fa08afe4019d7 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -273,25 +273,17 @@ bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI, if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - // Check if this is a predicate dependence. - const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) - return true; - - // Assumes that the first operand of the CALLr is the function address. - if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { - const MachineOperand MO = MI.getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) - return true; + // Call-like instructions can be packetized with preceding instructions + // that define registers implicitly used or modified by the call. Explicit + // uses are still prohibited, as in the case of indirect calls: + // r0 = ... + // J2_jumpr r0 + if (DepType == SDep::Data) { + for (const MachineOperand MO : MI.operands()) + if (MO.isReg() && MO.getReg() == DepReg && !MO.isImplicit()) + return true; } - if (HII->isJumpR(MI)) { - const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1) - : MI.getOperand(0); - assert(MO.isReg() && MO.isUse()); - if (MO.getReg() == DepReg) - return true; - } return false; } @@ -333,11 +325,13 @@ bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC) { // Vector stores can be predicated, and can be new-value stores, but // they cannot be predicated on a .new predicate value. - if (NewRC == &Hexagon::PredRegsRegClass) + if (NewRC == &Hexagon::PredRegsRegClass) { if (HII->isHVXVec(MI) && MI.mayStore()) return false; - return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() || - HII->mayBeNewStore(MI); + return HII->isPredicated(MI) && HII->getDotNewPredOp(MI, nullptr) > 0; + } + // If the class is not PredRegs, it could only apply to new-value stores. + return HII->mayBeNewStore(MI); } // Promote an instructiont to its .cur form. @@ -760,11 +754,14 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI, return false; } -static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) { +static bool isImplicitDependency(const MachineInstr &I, bool CheckDef, + unsigned DepReg) { for (auto &MO : I.operands()) { - if (MO.isRegMask() && MO.clobbersPhysReg(DepReg)) + if (CheckDef && MO.isRegMask() && MO.clobbersPhysReg(DepReg)) return true; - if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + if (!MO.isReg() || MO.getReg() != DepReg || !MO.isImplicit()) + continue; + if (CheckDef == MO.isDef()) return true; } return false; @@ -798,7 +795,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // If dependency is trough an implicitly defined register, we should not // newify the use. - if (isImplicitDependency(PI, DepReg)) + if (isImplicitDependency(PI, true, DepReg) || + isImplicitDependency(MI, false, DepReg)) return false; const MCInstrDesc& MCID = PI.getDesc(); @@ -808,8 +806,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // predicate .new if (RC == &Hexagon::PredRegsRegClass) - if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn()) - return HII->predCanBeUsedAsDotNew(PI, DepReg); + return HII->predCanBeUsedAsDotNew(PI, DepReg); if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) return false; diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 2a9bc25d7fadb..a2f005ce445a8 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -76,7 +76,7 @@ namespace { // Lanai Code Generator Pass Configuration Options. class LanaiPassConfig : public TargetPassConfig { public: - LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager) + LanaiPassConfig(LanaiTargetMachine &TM, PassManagerBase *PassManager) : TargetPassConfig(TM, *PassManager) {} LanaiTargetMachine &getLanaiTargetMachine() const { @@ -91,7 +91,7 @@ public: TargetPassConfig * LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) { - return new LanaiPassConfig(this, &PassManager); + return new LanaiPassConfig(*this, &PassManager); } // Install an instruction selector pass. diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 5278c70d909da..083ba6fdf8416 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -49,6 +49,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index bebe5fa35ad42..d8fdc8ba674e6 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -52,7 +52,7 @@ namespace { /// MSP430 Code Generator Pass Configuration Options. class MSP430PassConfig : public TargetPassConfig { public: - MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM) + MSP430PassConfig(MSP430TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} MSP430TargetMachine &getMSP430TargetMachine() const { @@ -65,7 +65,7 @@ public: } // namespace TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) { - return new MSP430PassConfig(this, PM); + return new MSP430PassConfig(*this, PM); } bool MSP430PassConfig::addInstSelector() { diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index e7ceca9612a92..a222080f6b814 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -1,4 +1,4 @@ -//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===// +//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,20 +11,29 @@ // //===----------------------------------------------------------------------===// -#include "Mips16FrameLowering.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "Mips16FrameLowering.h" #include "Mips16InstrInfo.h" #include "MipsInstrInfo.h" #include "MipsRegisterInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetFrameLowering.h" +#include <cassert> +#include <cstdint> +#include <vector> using namespace llvm; @@ -63,7 +72,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.size()) { + if (!CSI.empty()) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), @@ -80,7 +89,6 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, if (hasFP(MF)) BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0) .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup); - } void Mips16FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 092de216e9b89..a9d6ab0558927 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -201,7 +201,7 @@ namespace { /// Mips Code Generator Pass Configuration Options. class MipsPassConfig : public TargetPassConfig { public: - MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM) + MipsPassConfig(MipsTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // The current implementation of long branch pass requires a scratch // register ($at) to be available before branch instructions. Tail merging @@ -227,7 +227,7 @@ public: } // end anonymous namespace TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { - return new MipsPassConfig(this, PM); + return new MipsPassConfig(*this, PM); } void MipsPassConfig::addIRPasses() { diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 140d7133f879b..a3462868cb111 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -66,6 +66,10 @@ public: bool isLittleEndian() const { return isLittle; } const MipsABIInfo &getABI() const { return ABI; } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Mips32/64 big endian target machine. diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index ab5298d0dcfd6..8dfbfece9b8eb 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -132,7 +132,7 @@ namespace { class NVPTXPassConfig : public TargetPassConfig { public: - NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) + NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} NVPTXTargetMachine &getNVPTXTargetMachine() const { @@ -163,7 +163,7 @@ private: } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { - return new NVPTXPassConfig(this, PM); + return new NVPTXPassConfig(*this, PM); } void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 1ed8e3b1e9357..2f3981be22f83 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -65,6 +65,9 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; + bool isMachineVerifierClean() const override { + return false; + } }; // NVPTXTargetMachine. class NVPTXTargetMachine32 : public NVPTXTargetMachine { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5fa7b2c6bfb1b..54414457388d3 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -77,6 +77,11 @@ STATISTIC(SignExtensionsAdded, "Number of sign extensions for compare inputs added."); STATISTIC(ZeroExtensionsAdded, "Number of zero extensions for compare inputs added."); +STATISTIC(NumLogicOpsOnComparison, + "Number of logical ops on i1 values calculated in GPR."); +STATISTIC(OmittedForNonExtendUses, + "Number of compares not eliminated as they have non-extending uses."); + // FIXME: Remove this once the bug has been fixed! cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -275,6 +280,8 @@ private: bool trySETCC(SDNode *N); bool tryEXTEND(SDNode *N); + bool tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); SDValue signExtendInputIfNeeded(SDValue Input); SDValue zeroExtendInputIfNeeded(SDValue Input); SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); @@ -282,6 +289,10 @@ private: int64_t RHSValue, SDLoc dl); SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); @@ -2501,6 +2512,11 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { return true; } +// Is this opcode a bitwise logical operation? +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +} + /// If this node is a sign/zero extension of an integer comparison, /// it can usually be computed in GPR's rather than using comparison /// instructions and ISEL. We only do this on 64-bit targets for now @@ -2513,13 +2529,20 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { N->getOpcode() == ISD::SIGN_EXTEND) && "Expecting a zero/sign extend node!"); - if (N->getOperand(0).getOpcode() != ISD::SETCC) + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. + if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) return false; - - SDValue WideRes = - getSETCCInGPR(N->getOperand(0), - N->getOpcode() == ISD::SIGN_EXTEND ? - SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); if (!WideRes) return false; @@ -2540,6 +2563,159 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { return true; } +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here."); + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here."); + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation. + auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return + SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + getI64Imm(0, dl), getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. + if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; + } + + if (IsBitwiseNegation) { + RHS = getI64Imm(1, dl); + NewOpc = PPC::XORI8; + } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + +} + +/// Try performing logical operations on results of comparisons in GPRs. +/// It is typically preferred from a performance perspective over performing +/// the operations on individual bits in the CR. We only do this on 64-bit +/// targets for now as the code is specialized for 64-bit (it uses 64-bit +/// instructions and assumes 64-bit registers). +bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + if (N->getValueType(0) != MVT::i1) + return false; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return false; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode + // that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert it + // to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. + uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. + std::vector<SDValue> Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); + } else { + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); + } + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). + SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + ReplaceNode(N, CRBit.getNode()); + return true; +} + /// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. /// Useful when emitting comparison code for 32-bit values without using /// the compare instruction (which only considers the lower 32-bits). @@ -2677,6 +2853,77 @@ SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, } } +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + getI64Imm(58, dl), getI64Imm(63, dl)), + 0); + } + } +} + +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + } +} + +/// Does this SDValue have any uses for which keeping the value in a GPR is +/// appropriate. This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types can be kept in +/// GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. + for (auto CompareUse : Compare.getNode()->uses()) + if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && + CompareUse->getOpcode() != ISD::ZERO_EXTEND && + CompareUse->getOpcode() != ISD::SELECT && + !isLogicOp(CompareUse->getOpcode())) { + OmittedForNonExtendUses++; + return false; + } + return true; +} + /// Returns an equivalent of a SETCC node but with the result the same width as /// the inputs. This can nalso be used for SELECT_CC if either the true or false /// values is a power of two while the other is zero. @@ -2686,6 +2933,11 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, Compare.getOpcode() == ISD::SELECT_CC) && "An ISD::SETCC node required here."); + // Don't convert this comparison to a GPR sequence because there are uses + // of the i1 result (i.e. uses that require the result in the CR). + if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) + return SDValue(); + SDValue LHS = Compare.getOperand(0); SDValue RHS = Compare.getOperand(1); @@ -2694,30 +2946,35 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, ISD::CondCode CC = cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get(); EVT InputVT = LHS.getValueType(); - if (InputVT != MVT::i32) + if (InputVT != MVT::i32 && InputVT != MVT::i64) return SDValue(); - SDLoc dl(Compare); - ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); - int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; - if (ConvOpts == SetccInGPROpts::ZExtInvert || ConvOpts == SetccInGPROpts::SExtInvert) CC = ISD::getSetCCInverse(CC, true); - if (ISD::isSignedIntSetCC(CC)) { + bool Inputs32Bit = InputVT == MVT::i32; + if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) { LHS = signExtendInputIfNeeded(LHS); RHS = signExtendInputIfNeeded(RHS); - } else if (ISD::isUnsignedIntSetCC(CC)) { + } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) { LHS = zeroExtendInputIfNeeded(LHS); RHS = zeroExtendInputIfNeeded(RHS); } + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); + int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || ConvOpts == SetccInGPROpts::SExtInvert; - if (IsSext) + + if (IsSext && Inputs32Bit) return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { @@ -2906,6 +3163,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { + if (tryLogicOpOfCompares(N)) + return; + unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -3025,6 +3285,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; + if (tryLogicOpOfCompares(N)) + return; + short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { @@ -3042,6 +3305,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::XOR: { + if (tryLogicOpOfCompares(N)) + return; + break; + } case ISD::ADD: { short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 216efcc4a1ee8..41ff9d903aa07 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } @@ -1112,6 +1116,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; @@ -1593,17 +1598,25 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } - // Check that the mask is shuffling words -static bool isWordShuffleMask(ShuffleVectorSDNode *N) { - for (unsigned i = 0; i < 4; ++i) { - unsigned B0 = N->getMaskElt(i*4); - unsigned B1 = N->getMaskElt(i*4+1); - unsigned B2 = N->getMaskElt(i*4+2); - unsigned B3 = N->getMaskElt(i*4+3); - if (B0 % 4) - return false; - if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) +// Check that the mask is shuffling N byte elements. +static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) { + assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && + "Unexpected element width."); + + unsigned NumOfElem = 16 / Width; + unsigned MaskVal[16]; // Width is never greater than 16 + for (unsigned i = 0; i < NumOfElem; ++i) { + MaskVal[0] = N->getMaskElt(i * Width); + if (MaskVal[0] % Width) { return false; + } + + for (unsigned int j = 1; j < Width; ++j) { + MaskVal[j] = N->getMaskElt(i * Width + j); + if (MaskVal[j] != MaskVal[j-1] + 1) { + return false; + } + } } return true; @@ -1611,7 +1624,7 @@ static bool isWordShuffleMask(ShuffleVectorSDNode *N) { bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12 @@ -1688,7 +1701,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the word is consecutive. - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12, which are the beginning of words. @@ -1746,6 +1759,66 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, } } +/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap +/// if the inputs to the instruction should be swapped and set \p DM to the +/// value for the immediate. +/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI +/// AND element 0 of the result comes from the first input (LE) or second input +/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. +/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle +/// mask. +bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + // Ensure each byte index of the double word is consecutive. + if (!isNByteElemShuffleMask(N, 8)) + return false; + + unsigned M0 = N->getMaskElt(0) / 8; + unsigned M1 = N->getMaskElt(8) / 8; + assert(((M0 | M1) < 4) && "A mask element out of bounds?"); + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + if ((M0 | M1) < 2) { + DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); + Swap = false; + return true; + } else + return false; + } + + if (IsLE) { + if (M0 > 1 && M1 < 2) { + Swap = false; + } else if (M0 < 2 && M1 > 1) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (((~M1) & 1) << 1) + ((~M0) & 1); + return true; + } else { // BE + if (M0 < 2 && M1 > 1) { + Swap = false; + } else if (M0 > 1 && M1 < 2) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (M0 << 1) + (M1 & 1); + return true; + } +} + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. @@ -7760,6 +7833,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); } + if (Subtarget.hasVSX() && + PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); + + SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 2f9eb95f6de68..7982a4a9e9fb7 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -90,6 +90,10 @@ namespace llvm { /// VECSHL, + /// XXPERMDI - The PPC XXPERMDI instruction + /// + XXPERMDI, + /// The CMPB instruction (takes two operands of i32 or i64). CMPB, @@ -454,6 +458,10 @@ namespace llvm { /// for a XXSLDWI instruction. bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE); + /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXPERMDI instruction. + bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 165970f9678c3..295590b2acf62 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -735,12 +735,12 @@ def RLDICL_32_64 : MDForm_1<30, 0, "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; // End fast-isel. -let isCodeGenOnly = 1 in -def RLDICL_32 : MDForm_1<30, 0, - (outs gprc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLDICL_32 : MDForm_1r<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fd6785e963a64..f3c68c443b1bc 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1983,3 +1983,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { return &PPC::VSRCRegClass; return RC; } + +int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { + return PPC::getRecordFormOpcode(Opcode); +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index b30d09e03ec47..8dd4dbb608794 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -290,6 +290,7 @@ public: return Reg >= PPC::V0 && Reg <= PPC::V31; } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; + static int getRecordFormOpcode(unsigned Opcode); }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 26b99eced23cb..8223aa655e38b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; +def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>, + SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> +]>; + def SDT_PPCvcmp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> ]>; @@ -170,6 +174,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 1589ab03e5075..c4139ca8b7bdb 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -843,7 +843,9 @@ let Uses = [RM] in { def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), - "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, + [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB, + imm32SExt16:$DM))]>; let isCodeGenOnly = 1 in def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM), "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ddae5befee3e4..b9004cc8a9f54 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,7 +296,7 @@ namespace { /// PPC Code Generator Pass Configuration Options. class PPCPassConfig : public TargetPassConfig { public: - PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} PPCTargetMachine &getPPCTargetMachine() const { @@ -316,7 +316,7 @@ public: } // end anonymous namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - return new PPCPassConfig(this, PM); + return new PPCPassConfig(*this, PM); } void PPCPassConfig::addIRPasses() { diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index f2838351cee56..b8f5a2083d808 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -55,6 +55,10 @@ public: const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; + + bool isMachineVerifierClean() const override { + return false; + } }; /// PowerPC 32-bit target machine. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 7ee1317bf72f2..5559cdc5fe467 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } @@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { } unsigned PPCTTIImpl::getCacheLineSize() { - // This is currently only used for the data prefetch pass which is only - // enabled for BG/Q by default. - return CacheLineSize; + // Check first if the user specified a custom line size. + if (CacheLineSize.getNumOccurrences() > 0) + return CacheLineSize; + + // On P7, P8 or P9 we have a cache line size of 128. + unsigned Directive = ST->getDarwinDirective(); + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || + Directive == PPC::DIR_PWR9) + return 128; + + // On other processors return a default of 64 bytes. + return 64; } unsigned PPCTTIImpl::getPrefetchDistance() { diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 6ce70fbd8778e..2e0116fee04c6 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index a20331cd0a3ed..efdde04c582d0 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -56,5 +56,5 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, } TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { - return new TargetPassConfig(this, PM); + return new TargetPassConfig(*this, PM); } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 1da4d36043045..49c67e0819f72 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -114,7 +114,7 @@ namespace { /// Sparc Code Generator Pass Configuration Options. class SparcPassConfig : public TargetPassConfig { public: - SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM) + SparcPassConfig(SparcTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SparcTargetMachine &getSparcTargetMachine() const { @@ -128,7 +128,7 @@ public: } // namespace TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SparcPassConfig(this, PM); + return new SparcPassConfig(*this, PM); } void SparcPassConfig::addIRPasses() { diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 48193fe095bed..faf714cbe2c98 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -40,6 +40,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Sparc 32-bit target machine diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index ede5005fa4916..f30d52f859d75 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -119,7 +119,7 @@ namespace { /// SystemZ Code Generator Pass Configuration Options. class SystemZPassConfig : public TargetPassConfig { public: - SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM) + SystemZPassConfig(SystemZTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SystemZTargetMachine &getSystemZTargetMachine() const { @@ -212,7 +212,7 @@ void SystemZPassConfig::addPreEmitPass() { } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SystemZPassConfig(this, PM); + return new SystemZPassConfig(*this, PM); } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index a10ca64fa6329..eb2f17a2091c3 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,6 +51,8 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; + + bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b974681fb6afa..d9b2b8743649e 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -129,7 +129,7 @@ namespace { /// WebAssembly Code Generator Pass Configuration Options. class WebAssemblyPassConfig final : public TargetPassConfig { public: - WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM) + WebAssemblyPassConfig(WebAssemblyTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const { @@ -154,7 +154,7 @@ TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { TargetPassConfig * WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) { - return new WebAssemblyPassConfig(this, PM); + return new WebAssemblyPassConfig(*this, PM); } FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 6e062ec59347b..b5a926f915afa 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -587,6 +587,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSLLDQZ256rr: case X86::VPSLLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSLLDQZ128rm: case X86::VPSLLDQZ256rm: case X86::VPSLLDQZ512rm: @@ -604,6 +605,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSRLDQZ256rr: case X86::VPSRLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSRLDQZ128rm: case X86::VPSRLDQZ256rm: case X86::VPSRLDQZ512rm: @@ -1091,6 +1093,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask); @@ -1099,6 +1102,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 313920e02c3ed..5582526541bae 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -123,18 +123,26 @@ namespace { EdgeBundles *Bundles; // Return a bitmask of FP registers in block's live-in list. - static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { unsigned Mask = 0; - for (const auto &LI : MBB->liveins()) { - if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) - continue; - Mask |= 1 << (LI.PhysReg - X86::FP0); + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(); + I != MBB->livein_end(); ) { + MCPhysReg Reg = I->PhysReg; + static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums"); + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + Mask |= 1 << (Reg - X86::FP0); + if (RemoveFPs) { + I = MBB->removeLiveIn(I); + continue; + } + } + ++I; } return Mask; } // Partition all the CFG edges into LiveBundles. - void bundleCFG(MachineFunction &MF); + void bundleCFGRecomputeKillFlags(MachineFunction &MF); MachineBasicBlock *MBB; // Current basic block @@ -327,7 +335,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. - bundleCFG(MF); + bundleCFGRecomputeKillFlags(MF); StackTop = 0; @@ -375,13 +383,15 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { /// registers live-out from a block is identical to the live-in set of all /// successors. This is not enforced by the normal live-in lists since /// registers may be implicitly defined, or not used by all successors. -void FPS::bundleCFG(MachineFunction &MF) { +void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) { assert(LiveBundles.empty() && "Stale data in LiveBundles"); LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. for (MachineBasicBlock &MBB : MF) { - const unsigned Mask = calcLiveInMask(&MBB); + setKillFlags(MBB); + + const unsigned Mask = calcLiveInMask(&MBB, false); if (!Mask) continue; // Update MBB ingoing bundle mask. @@ -396,7 +406,6 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; - setKillFlags(BB); setupBlockStack(); for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { @@ -453,6 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { unsigned Reg = DeadRegs[i]; // Check if Reg is live on the stack. An inline-asm register operand that // is in the clobber list and marked dead might not be live on the stack. + static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers"); if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); freeStackSlotAfter(I, Reg-X86::FP0); @@ -506,7 +516,6 @@ void FPS::setupBlockStack() { // Push the fixed live-in registers. for (unsigned i = Bundle.FixCount; i > 0; --i) { - MBB->addLiveIn(X86::ST0+i-1); DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" << unsigned(Bundle.FixStack[i-1]) << '\n'); pushReg(Bundle.FixStack[i-1]); @@ -515,7 +524,8 @@ void FPS::setupBlockStack() { // Kill off unwanted live-ins. This can happen with a critical edge. // FIXME: We could keep these live registers around as zombies. They may need // to be revived at the end of a short block. It might save a few instrs. - adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true); + adjustLiveRegs(Mask, MBB->begin()); DEBUG(MBB->dump()); } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 331e56976db7c..328a80304602f 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1062,6 +1062,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { + assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved"); + // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. @@ -1124,13 +1126,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, nullptr, DwarfFramePtr)); } } - - // Mark the FramePtr as live-in in every block. Don't do this again for - // funclet prologues. - if (!IsFunclet) { - for (MachineBasicBlock &EveryMBB : MF) - EveryMBB.addLiveIn(MachineFramePtr); - } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index c899f0fd5100e..2a1633de0a239 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -418,8 +418,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: - case ISD::ADDC: - case ISD::ADDE: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8d78308afe9df..0a41f35f93208 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1,3 +1,4 @@ + //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure @@ -80,12 +81,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -static cl::opt<bool> MulConstantOptimization( - "mul-constant-optimization", cl::init(true), - cl::desc("Replace 'mul x, Const' with more effective instructions like " - "SHIFT, LEA, etc."), - cl::Hidden); - /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without @@ -317,16 +312,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UREM, VT, Expand); } - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { - if (VT == MVT::i64 && !Subtarget.is64Bit()) - continue; - // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. - setOperationAction(ISD::ADDC, VT, Custom); - setOperationAction(ISD::ADDE, VT, Custom); - setOperationAction(ISD::SUBC, VT, Custom); - setOperationAction(ISD::SUBE, VT, Custom); - } - setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, @@ -428,7 +413,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SETCCE, VT, Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -1583,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); + setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { @@ -16304,6 +16289,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; + LLVM_FALLTHROUGH; default: NeedOF = true; break; @@ -17167,17 +17153,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; + case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; + case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; - case ISD::SETUGE: Swap = true; + case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } @@ -17398,19 +17384,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } -SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); - assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); - assert(Carry.getOpcode() != ISD::CARRY_FALSE); + // Recreate the carry if needed. + EVT CarryVT = Carry.getValueType(); + APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), + Carry, DAG.getConstant(NegOne, DL, CarryVT)); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); - SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); if (Op.getSimpleValueType() == MVT::i1) return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -23269,32 +23260,6 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getNode()->getSimpleValueType(0); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid code"); - case ISD::ADDC: Opc = X86ISD::ADD; break; - case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; - case ISD::SUBC: Opc = X86ISD::SUB; break; - case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1), Op.getOperand(2)); -} - static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); @@ -23785,7 +23750,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -23830,10 +23795,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: @@ -28946,12 +28907,118 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Try to match patterns such as +// (i16 bitcast (v16i1 x)) +// -> +// (i16 movmsk (16i8 sext (v16i1 x))) +// before the illegal vector is scalarized on subtargets that don't have legal +// vxi1 types. +static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, + const X86Subtarget &Subtarget) { + EVT VT = BitCast.getValueType(); + SDValue N0 = BitCast.getOperand(0); + EVT VecVT = N0->getValueType(0); + + if (!VT.isScalarInteger() || !VecVT.isSimple()) + return SDValue(); + + // With AVX512 vxi1 types are legal and we prefer using k-regs. + // MOVMSK is supported in SSE2 or later. + if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) + return SDValue(); + + // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and + // v8f64. So all legal 128-bit and 256-bit vectors are covered except for + // v8i16 and v16i16. + // For these two cases, we can shuffle the upper element bytes to a + // consecutive sequence at the start of the vector and treat the results as + // v16i8 or v32i8, and for v61i8 this is the prefferable solution. However, + // for v16i16 this is not the case, because the shuffle is expensive, so we + // avoid sign-exteding to this type entirely. + // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: + // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) + MVT SExtVT; + MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; + switch (VecVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i1: + SExtVT = MVT::v2i64; + FPCastVT = MVT::v2f64; + break; + case MVT::v4i1: + SExtVT = MVT::v4i32; + FPCastVT = MVT::v4f32; + // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) + // sign-extend to a 256-bit operation to avoid truncation. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v4i64; + FPCastVT = MVT::v4f64; + } + break; + case MVT::v8i1: + SExtVT = MVT::v8i16; + // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), + // sign-extend to a 256-bit operation to match the compare. + // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over + // 256-bit because the shuffle is cheaper than sign extending the result of + // the compare. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v8i32; + FPCastVT = MVT::v8f32; + } + break; + case MVT::v16i1: + SExtVT = MVT::v16i8; + // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), + // it is not profitable to sign-extend to 256-bit because this will + // require an extra cross-lane shuffle which is more exprensive than + // truncating the result of the compare to 128-bits. + break; + case MVT::v32i1: + // TODO: Handle pre-AVX2 cases by splitting to two v16i1's. + if (!Subtarget.hasInt256()) + return SDValue(); + SExtVT = MVT::v32i8; + break; + }; + + SDLoc DL(BitCast); + SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); + if (SExtVT == MVT::v8i16) { + V = DAG.getBitcast(MVT::v16i8, V); + V = DAG.getVectorShuffle( + MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), + {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + } else + assert(SExtVT.getScalarType() != MVT::i16 && + "Vectors of i16 must be shuffled"); + if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + V = DAG.getBitcast(FPCastVT, V); + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); + return DAG.getZExtOrTrunc(V, DL, VT); +} + static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); + // Try to match patterns such as + // (i16 bitcast (v16i1 x)) + // -> + // (i16 movmsk (16i8 sext (v16i1 x))) + // before the setcc result is scalarized on subtargets that don't have legal + // vxi1 types. + if (DCI.isBeforeLegalize()) + if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) + return V; // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -29944,6 +30011,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -29974,6 +30042,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30008,6 +30077,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30036,6 +30106,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -30933,75 +31004,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, } } -static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, - EVT VT, SDLoc DL) { - - auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(Mult, DL, VT)); - Result = DAG.getNode(ISD::SHL, DL, VT, Result, - DAG.getConstant(Shift, DL, MVT::i8)); - Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - auto combineMulMulAddOrSub = [&](bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(9, DL, VT)); - Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); - Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - switch (MulAmt) { - default: - break; - case 11: - // mul x, 11 => add ((shl (mul x, 5), 1), x) - return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); - case 21: - // mul x, 21 => add ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); - case 22: - // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); - case 19: - // mul x, 19 => sub ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); - case 13: - // mul x, 13 => add ((shl (mul x, 3), 2), x) - return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); - case 23: - // mul x, 13 => sub ((shl (mul x, 3), 3), x) - return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); - case 14: - // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); - case 26: - // mul x, 26 => sub ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ false); - case 28: - // mul x, 28 => add ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ true); - case 29: - // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulMulAddOrSub(/*isAdd*/ true)); - case 30: - // mul x, 30 => sub (sub ((shl x, 5), x), x) - return DAG.getNode( - ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(5, DL, MVT::i8)))); - } - return SDValue(); -} - /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, @@ -31011,8 +31013,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); - if (!MulConstantOptimization) - return SDValue(); // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -31068,8 +31068,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); - } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); + } if (!NewMul) { assert(MulAmt != 0 && @@ -34558,8 +34557,7 @@ static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG, isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); - if (Carry.getOpcode() == ISD::SETCC || - Carry.getOpcode() == X86ISD::SETCC || + if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1)); @@ -35126,7 +35124,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); @@ -35510,6 +35508,7 @@ TargetLowering::ConstraintWeight switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': @@ -35861,6 +35860,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR64RegClass); break; } + LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 18106c2eb3941..f51b6641db2fb 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1163,7 +1163,7 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 33fbd41bb6318..0aee30081a35c 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -195,6 +195,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; + LLVM_FALLTHROUGH; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 53a8e83b36fca..cb21f1bd7706f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -323,7 +323,7 @@ namespace { /// X86 Code Generator Pass Configuration Options. class X86PassConfig : public TargetPassConfig { public: - X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} X86TargetMachine &getX86TargetMachine() const { @@ -369,7 +369,7 @@ INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", "X86 Execution Dependency Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - return new X86PassConfig(this, PM); + return new X86PassConfig(*this, PM); } void X86PassConfig::addIRPasses() { @@ -433,6 +433,7 @@ bool X86PassConfig::addPreISel() { void X86PassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { + addPass(&LiveRangeShrinkID); addPass(createX86FixupSetCC()); addPass(createX86OptimizeLEAs()); addPass(createX86CallFrameOptimization()); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index cf933f52604ef..1bf267d34ec2c 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -49,6 +49,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // end namespace llvm diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 2950e2efbea3e..1a1cbd4748887 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -54,7 +54,7 @@ namespace { /// XCore Code Generator Pass Configuration Options. class XCorePassConfig : public TargetPassConfig { public: - XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM) + XCorePassConfig(XCoreTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} XCoreTargetMachine &getXCoreTargetMachine() const { @@ -70,7 +70,7 @@ public: } // end anonymous namespace TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { - return new XCorePassConfig(this, PM); + return new XCorePassConfig(*this, PM); } void XCorePassConfig::addIRPasses() { |