vendor/llvm/llvm-trunk-r304460

author: Dimitry Andric <dim@FreeBSD.org> 2017-06-01 20:58:36 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2017-06-01 20:58:36 +0000
commit: f382538d471e38a9b98f016c4caebd24c8d60b62 (patch)
tree: d30f3d58b1044b5355d50c17a6a96c6a0b35703a /lib/Target
parent: ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff)
download: src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.tar.gz
src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.zip
74 files changed, 1829 insertions, 683 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 4af5fef4287cc..abe28460c83ab 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -190,6 +190,7 @@ def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    FeatureCrypto,
                                    FeatureCustomCheapAsMoveHandling,
                                    FeatureFPARMv8,
+                                   FeatureFuseAES,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
@@ -226,6 +227,7 @@ def ProcA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
                                    FeatureCRC,
                                    FeatureCrypto,
                                    FeatureFPARMv8,
+                                   FeatureFuseAES,
                                    FeatureNEON,
                                    FeaturePerfMon
                                    ]>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 3fbbc0be682d7..3b71cf8399a0d 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel {
   let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
   let LoadLatency           =  4; // Optimistic load cases.
   let MispredictPenalty     = 14; // Minimum branch misprediction penalty.
-  let CompleteModel         =  0; // Use the default model otherwise.
+  let CompleteModel         =  1; // Use the default model otherwise.
 }
 
 //===----------------------------------------------------------------------===//
@@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]>   { let Latency = 2; }
 def M1WriteB1 : SchedWriteRes<[M1UnitB]>      { let Latency = 1; }
 
 def M1WriteL5 : SchedWriteRes<[M1UnitL]>   { let Latency = 5; }
-def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
+def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
                                                             M1WriteA1]>,
                                    SchedVar<NoSchedPred,   [M1WriteL5]>]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
 def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
 def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
-def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
+def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
                                                             M1WriteA1]>,
                                    SchedVar<NoSchedPred,   [M1WriteS1]>]>;
 
@@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; }
 // Load instructions.
 def : WriteRes<WriteLD,    [M1UnitL]>   { let Latency = 4; }
 def : WriteRes<WriteLDHi,  [M1UnitALU]> { let Latency = 4; }
-def : SchedAlias<WriteLDIdx, M1WriteLA>;
+def : SchedAlias<WriteLDIdx, M1WriteLX>;
 
 // Store instructions.
 def : WriteRes<WriteST,    [M1UnitS]> { let Latency = 1; }
 def : WriteRes<WriteSTP,   [M1UnitS]> { let Latency = 1; }
 def : WriteRes<WriteSTX,   [M1UnitS]> { let Latency = 1; }
-def : SchedAlias<WriteSTIdx, M1WriteSA>;
+def : SchedAlias<WriteSTIdx, M1WriteSX>;
 
 // FP data instructions.
 def : WriteRes<WriteF,    [M1UnitFADD]>  { let Latency = 3; }
@@ -231,6 +231,111 @@ def M1WriteNMISC3  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 3; }
 def M1WriteNMISC4  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 4; }
 def M1WriteTB      : SchedWriteRes<[M1UnitC,
                                     M1UnitALU]>    { let Latency = 2; }
+def M1WriteVLDA    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL]>      { let Latency = 6; }
+def M1WriteVLDB    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL]>      { let Latency = 7; }
+def M1WriteVLDC    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL]>      { let Latency = 8; }
+def M1WriteVLDD    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU]>   { let Latency = 7;
+                                                     let ResourceCycles = [2]; }
+def M1WriteVLDE    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU]>   { let Latency = 6; }
+def M1WriteVLDF    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL]>      { let Latency = 10;
+                                                     let ResourceCycles = [5]; }
+def M1WriteVLDG    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 7;
+                                                     let ResourceCycles = [2]; }
+def M1WriteVLDH    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 6; }
+def M1WriteVLDI    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL]>      { let Latency = 12;
+                                                     let ResourceCycles = [6]; }
+def M1WriteVLDJ    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 9;
+                                                     let ResourceCycles = [4]; }
+def M1WriteVLDK    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 9;
+                                                     let ResourceCycles = [4]; }
+def M1WriteVLDL    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 7;
+                                                     let ResourceCycles = [2]; }
+def M1WriteVLDM    : SchedWriteRes<[M1UnitL,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitNALU]>   { let Latency = 7;
+                                                     let ResourceCycles = [2]; }
+def M1WriteVLDN    : SchedWriteRes<[M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL,
+                                    M1UnitL]>      { let Latency = 14;
+                                                     let ResourceCycles = [7]; }
+
+def M1WriteVSTA    : WriteSequence<[WriteVST], 2>;
+def M1WriteVSTB    : WriteSequence<[WriteVST], 3>;
+def M1WriteVSTC    : WriteSequence<[WriteVST], 4>;
+def M1WriteVSTD    : SchedWriteRes<[M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST]>    { let Latency = 7;
+                                                     let ResourceCycles = [7]; }
+def M1WriteVSTE    : SchedWriteRes<[M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST]>    { let Latency = 8;
+                                                     let ResourceCycles = [8]; }
+def M1WriteVSTF    : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST,
+                                    M1UnitFST]>     { let Latency = 15;
+                                                      let ResourceCycles = [15]; }
+def M1WriteVSTG    : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST,
+                                    M1UnitFST]>     { let Latency = 16;
+                                                      let ResourceCycles = [16]; }
+def M1WriteVSTH    : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST,
+                                    M1UnitFST]>      { let Latency = 14;
+                                                       let ResourceCycles = [14]; }
+def M1WriteVSTI    : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitS,
+                                    M1UnitFST,
+                                    M1UnitFST,
+                                    M1UnitFST]>      { let Latency = 17;
+                                                       let ResourceCycles = [17]; }
 
 // Branch instructions
 def : InstRW<[M1WriteB1], (instrs Bcc)>;
@@ -360,8 +465,233 @@ def : InstRW<[M1WriteNALU2],  (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64
 def : InstRW<[M1WriteNALU1],  (instregex "^ZIP[12]v")>;
 
 // ASIMD load instructions.
+def : InstRW<[M1WriteVLDD],   (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[M1WriteVLDD,
+              WriteAdr],      (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVLDE],   (instregex "LD1i(64)$")>;
+def : InstRW<[M1WriteVLDE,
+              WriteAdr],      (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[M1WriteL5],     (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteL5,
+              WriteAdr],      (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteL5],     (instregex "LD1Rv(1d)$")>;
+def : InstRW<[M1WriteL5,
+              WriteAdr],      (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[M1WriteL5],     (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+              WriteAdr],      (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteL5],     (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteL5,
+              WriteAdr],      (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteL5],     (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+              WriteAdr],      (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDA],   (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDA,
+              WriteAdr],      (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDA],   (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+              WriteAdr],      (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDB],   (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDB],   (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDC],   (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDC],   (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDG],   (instregex "LD2i(8|16)$")>;
+def : InstRW<[M1WriteVLDG,
+              WriteAdr],      (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDG],   (instregex "LD2i(32)$")>;
+def : InstRW<[M1WriteVLDG,
+              WriteAdr],      (instregex "LD2i(32)_POST$")>;
+def : InstRW<[M1WriteVLDH],   (instregex "LD2i(64)$")>;
+def : InstRW<[M1WriteVLDH,
+              WriteAdr],      (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDA],   (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDA,
+              WriteAdr],      (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDA],   (instregex "LD2Rv(1d)$")>;
+def : InstRW<[M1WriteVLDA,
+              WriteAdr],      (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDA],   (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+              WriteAdr],      (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDF],   (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDF,
+              WriteAdr],      (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDF],   (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDF,
+              WriteAdr],      (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDF],   (instregex "LD2Twov(2d)$")>;
+def : InstRW<[M1WriteVLDF,
+              WriteAdr],      (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDJ],   (instregex "LD3i(8|16)$")>;
+def : InstRW<[M1WriteVLDJ,
+              WriteAdr],      (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDJ],   (instregex "LD3i(32)$")>;
+def : InstRW<[M1WriteVLDJ,
+              WriteAdr],      (instregex "LD3i(32)_POST$")>;
+def : InstRW<[M1WriteVLDL],   (instregex "LD3i(64)$")>;
+def : InstRW<[M1WriteVLDL,
+              WriteAdr],      (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDB],   (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDB],   (instregex "LD3Rv(1d)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDB],   (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDB],   (instregex "LD3Rv(2d)$")>;
+def : InstRW<[M1WriteVLDB,
+              WriteAdr],      (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDI],   (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDI,
+              WriteAdr],      (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDI],   (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDI,
+              WriteAdr],      (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDI],   (instregex "LD3Threev(2d)$")>;
+def : InstRW<[M1WriteVLDI,
+              WriteAdr],      (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDK],   (instregex "LD4i(8|16)$")>;
+def : InstRW<[M1WriteVLDK,
+              WriteAdr],      (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDK],   (instregex "LD4i(32)$")>;
+def : InstRW<[M1WriteVLDK,
+              WriteAdr],      (instregex "LD4i(32)_POST$")>;
+def : InstRW<[M1WriteVLDM],   (instregex "LD4i(64)$")>;
+def : InstRW<[M1WriteVLDM,
+              WriteAdr],      (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDC],   (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDC],   (instregex "LD4Rv(1d)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDC],   (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDC],   (instregex "LD4Rv(2d)$")>;
+def : InstRW<[M1WriteVLDC,
+              WriteAdr],      (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDN],   (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDN,
+              WriteAdr],      (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDN],   (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDN,
+              WriteAdr],      (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDN],   (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[M1WriteVLDN,
+              WriteAdr],      (instregex "LD4Fourv(2d)_POST$")>;
 
 // ASIMD store instructions.
+def : InstRW<[M1WriteVSTD],   (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+              WriteAdr],      (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD],   (instregex "ST1i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+              WriteAdr],      (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[WriteVST],      (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST,
+              WriteAdr],      (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVST],      (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST,
+              WriteAdr],      (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTA],   (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTA,
+              WriteAdr],      (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTA],   (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTA,
+              WriteAdr],      (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTB],   (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTB,
+              WriteAdr],      (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTB],   (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTB,
+              WriteAdr],      (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTC],   (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTC,
+              WriteAdr],      (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTC],   (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTC,
+              WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTD],   (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+              WriteAdr],      (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD],   (instregex "ST2i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+              WriteAdr],      (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTD],   (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTD,
+              WriteAdr],      (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTE],   (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTE,
+              WriteAdr],      (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTE],   (instregex "ST2Twov(2d)$")>;
+def : InstRW<[M1WriteVSTE,
+              WriteAdr],      (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH],   (instregex "ST3i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+              WriteAdr],      (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH],   (instregex "ST3i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+              WriteAdr],      (instregex "ST3i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF],   (instregex "ST3i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+              WriteAdr],      (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF],   (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+              WriteAdr],      (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTG],   (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTG,
+              WriteAdr],      (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTG],   (instregex "ST3Threev(2d)$")>;
+def : InstRW<[M1WriteVSTG,
+              WriteAdr],      (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH],   (instregex "ST4i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+              WriteAdr],      (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH],   (instregex "ST4i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+              WriteAdr],      (instregex "ST4i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF],   (instregex "ST4i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+              WriteAdr],      (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF],   (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+              WriteAdr],      (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTI],   (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTI,
+              WriteAdr],      (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTI],   (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[M1WriteVSTI,
+              WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
 
 // Cryptography instructions.
 def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index cb3f72a524f51..d4a8cecdb29f1 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -256,9 +256,9 @@ namespace {
 /// AArch64 Code Generator Pass Configuration Options.
 class AArch64PassConfig : public TargetPassConfig {
 public:
-  AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+  AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM)
       : TargetPassConfig(TM, PM) {
-    if (TM->getOptLevel() != CodeGenOpt::None)
+    if (TM.getOptLevel() != CodeGenOpt::None)
       substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
   }
 
@@ -317,7 +317,7 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
 }
 
 TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new AArch64PassConfig(this, PM);
+  return new AArch64PassConfig(*this, PM);
 }
 
 void AArch64PassConfig::addIRPasses() {
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index f473944cd5283..0959014812d8b 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -503,40 +503,37 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
 
-  if (!FrameInfo.hasCalls()) {
-    Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
-                   MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-    // If there are no calls, MachineRegisterInfo can tell us the used register
-    // count easily.
-
-    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestVGPRReg = Reg;
-        break;
-      }
-    }
 
-    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestSGPRReg = Reg;
-        break;
-      }
-    }
+  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);
 
-    // We found the maximum register index. They start at 0, so add one to get the
-    // number of registers.
-    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestVGPRReg) + 1;
-    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestSGPRReg) + 1;
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
 
-    return Info;
+  MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      HighestVGPRReg = Reg;
+      break;
+    }
   }
 
-  llvm_unreachable("calls not implemented");
+  MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      HighestSGPRReg = Reg;
+      break;
+    }
+  }
+
+  // We found the maximum register index. They start at 0, so add one to get the
+  // number of registers.
+  Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+    TRI.getHWRegIndex(HighestVGPRReg) + 1;
+  Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+    TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+  return Info;
 }
 
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 48827f4639974..596f02ae4a649 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -456,7 +456,7 @@ namespace {
 
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
-  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
+  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {
     // Exceptions and StackMaps are not supported, so these passes will never do
     // anything.
@@ -487,7 +487,7 @@ public:
 
 class R600PassConfig final : public AMDGPUPassConfig {
 public:
-  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) {}
 
   ScheduleDAGInstrs *createMachineScheduler(
@@ -503,7 +503,7 @@ public:
 
 class GCNPassConfig final : public AMDGPUPassConfig {
 public:
-  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) {}
 
   GCNTargetMachine &getGCNTargetMachine() const {
@@ -682,7 +682,7 @@ void R600PassConfig::addPreEmitPass() {
 }
 
 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new R600PassConfig(this, PM);
+  return new R600PassConfig(*this, PM);
 }
 
 //===----------------------------------------------------------------------===//
@@ -844,6 +844,6 @@ void GCNPassConfig::addPreEmitPass() {
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new GCNPassConfig(this, PM);
+  return new GCNPassConfig(*this, PM);
 }
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 934bf7f31bab4..a3c7c1982d0a6 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -69,7 +69,6 @@ public:
       return -1;
     return 0;
   }
-
 };
 
 //===----------------------------------------------------------------------===//
@@ -89,6 +88,10 @@ public:
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   const R600Subtarget *getSubtargetImpl(const Function &) const override;
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f5541e08e1b72..cc68c971b2490 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -161,7 +161,8 @@ public:
     ImmTyOpSel,
     ImmTyOpSelHi,
     ImmTyNegLo,
-    ImmTyNegHi
+    ImmTyNegHi,
+    ImmTySwizzle
   };
 
   struct TokOp {
@@ -474,6 +475,7 @@ public:
   bool isSWaitCnt() const;
   bool isHwreg() const;
   bool isSendMsg() const;
+  bool isSwizzle() const;
   bool isSMRDOffset8() const;
   bool isSMRDOffset20() const;
   bool isSMRDLiteralOffset() const;
@@ -659,6 +661,7 @@ public:
     case ImmTyOpSelHi: OS << "OpSelHi"; break;
     case ImmTyNegLo: OS << "NegLo"; break;
     case ImmTyNegHi: OS << "NegHi"; break;
+    case ImmTySwizzle: OS << "Swizzle"; break;
     }
   }
 
@@ -994,6 +997,12 @@ private:
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
   unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
 
+  bool trySkipId(const StringRef Id);
+  bool trySkipToken(const AsmToken::TokenKind Kind);
+  bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
+  bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+  bool parseExpr(int64_t &Imm);
+
 public:
   OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
 
@@ -1003,6 +1012,19 @@ public:
   OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
   OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
 
+  bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+                            const unsigned MinVal,
+                            const unsigned MaxVal,
+                            const StringRef ErrMsg);
+  OperandMatchResultTy parseSwizzleOp(OperandVector &Operands);
+  bool parseSwizzleOffset(int64_t &Imm);
+  bool parseSwizzleMacro(int64_t &Imm);
+  bool parseSwizzleQuadPerm(int64_t &Imm);
+  bool parseSwizzleBitmaskPerm(int64_t &Imm);
+  bool parseSwizzleBroadcast(int64_t &Imm);
+  bool parseSwizzleSwap(int64_t &Imm);
+  bool parseSwizzleReverse(int64_t &Imm);
+
   void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
   void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
   void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
@@ -2785,7 +2807,13 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
     OptionalIdx[Op.getImmTy()] = i;
   }
 
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+  AMDGPUOperand::ImmTy OffsetType =
+    (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+     Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
+                                                      AMDGPUOperand::ImmTyOffset;
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType);
+
   if (!IsGdsHardcoded) {
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
   }
@@ -3384,6 +3412,298 @@ bool AMDGPUOperand::isSendMsg() const {
 }
 
 //===----------------------------------------------------------------------===//
+// parser helpers
+//===----------------------------------------------------------------------===//
+
+bool
+AMDGPUAsmParser::trySkipId(const StringRef Id) {
+  if (getLexer().getKind() == AsmToken::Identifier &&
+      Parser.getTok().getString() == Id) {
+    Parser.Lex();
+    return true;
+  }
+  return false;
+}
+
+bool
+AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) {
+  if (getLexer().getKind() == Kind) {
+    Parser.Lex();
+    return true;
+  }
+  return false;
+}
+
+bool
+AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
+                           const StringRef ErrMsg) {
+  if (!trySkipToken(Kind)) {
+    Error(Parser.getTok().getLoc(), ErrMsg);
+    return false;
+  }
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseExpr(int64_t &Imm) {
+  return !getParser().parseAbsoluteExpression(Imm);
+}
+
+bool
+AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
+  SMLoc S = Parser.getTok().getLoc();
+  if (getLexer().getKind() == AsmToken::String) {
+    Val = Parser.getTok().getStringContents();
+    Parser.Lex();
+    return true;
+  } else {
+    Error(S, ErrMsg);
+    return false;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+LLVM_READNONE
+static unsigned
+encodeBitmaskPerm(const unsigned AndMask,
+                  const unsigned OrMask,
+                  const unsigned XorMask) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  return BITMASK_PERM_ENC |
+         (AndMask << BITMASK_AND_SHIFT) |
+         (OrMask  << BITMASK_OR_SHIFT)  |
+         (XorMask << BITMASK_XOR_SHIFT);
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+                                      const unsigned MinVal,
+                                      const unsigned MaxVal,
+                                      const StringRef ErrMsg) {
+  for (unsigned i = 0; i < OpNum; ++i) {
+    if (!skipToken(AsmToken::Comma, "expected a comma")){
+      return false;
+    }
+    SMLoc ExprLoc = Parser.getTok().getLoc();
+    if (!parseExpr(Op[i])) {
+      return false;
+    }
+    if (Op[i] < MinVal || Op[i] > MaxVal) {
+      Error(ExprLoc, ErrMsg);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  int64_t Lane[LANE_NUM];
+  if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX,
+                           "expected a 2-bit lane id")) {
+    Imm = QUAD_PERM_ENC;
+    for (auto i = 0; i < LANE_NUM; ++i) {
+      Imm |= Lane[i] << (LANE_SHIFT * i);
+    }
+    return true;
+  }
+  return false;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t GroupSize;
+  int64_t LaneIdx;
+
+  if (!parseSwizzleOperands(1, &GroupSize,
+                            2, 32,
+                            "group size must be in the interval [2,32]")) {
+    return false;
+  }
+  if (!isPowerOf2_64(GroupSize)) {
+    Error(S, "group size must be a power of two");
+    return false;
+  }
+  if (parseSwizzleOperands(1, &LaneIdx,
+                           0, GroupSize - 1,
+                           "lane id must be in the interval [0,group size - 1]")) {
+    Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0);
+    return true;
+  }
+  return false;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t GroupSize;
+
+  if (!parseSwizzleOperands(1, &GroupSize,
+      2, 32, "group size must be in the interval [2,32]")) {
+    return false;
+  }
+  if (!isPowerOf2_64(GroupSize)) {
+    Error(S, "group size must be a power of two");
+    return false;
+  }
+
+  Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1);
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t GroupSize;
+
+  if (!parseSwizzleOperands(1, &GroupSize,
+      1, 16, "group size must be in the interval [1,16]")) {
+    return false;
+  }
+  if (!isPowerOf2_64(GroupSize)) {
+    Error(S, "group size must be a power of two");
+    return false;
+  }
+
+  Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize);
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  if (!skipToken(AsmToken::Comma, "expected a comma")) {
+    return false;
+  }
+
+  StringRef Ctl;
+  SMLoc StrLoc = Parser.getTok().getLoc();
+  if (!parseString(Ctl)) {
+    return false;
+  }
+  if (Ctl.size() != BITMASK_WIDTH) {
+    Error(StrLoc, "expected a 5-character mask");
+    return false;
+  }
+
+  unsigned AndMask = 0;
+  unsigned OrMask = 0;
+  unsigned XorMask = 0;
+
+  for (size_t i = 0; i < Ctl.size(); ++i) {
+    unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i);
+    switch(Ctl[i]) {
+    default:
+      Error(StrLoc, "invalid mask");
+      return false;
+    case '0':
+      break;
+    case '1':
+      OrMask |= Mask;
+      break;
+    case 'p':
+      AndMask |= Mask;
+      break;
+    case 'i':
+      AndMask |= Mask;
+      XorMask |= Mask;
+      break;
+    }
+  }
+
+  Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask);
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) {
+
+  SMLoc OffsetLoc = Parser.getTok().getLoc();
+
+  if (!parseExpr(Imm)) {
+    return false;
+  }
+  if (!isUInt<16>(Imm)) {
+    Error(OffsetLoc, "expected a 16-bit offset");
+    return false;
+  }
+  return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  if (skipToken(AsmToken::LParen, "expected a left parentheses")) {
+
+    SMLoc ModeLoc = Parser.getTok().getLoc();
+    bool Ok = false;
+
+    if (trySkipId(IdSymbolic[ID_QUAD_PERM])) {
+      Ok = parseSwizzleQuadPerm(Imm);
+    } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) {
+      Ok = parseSwizzleBitmaskPerm(Imm);
+    } else if (trySkipId(IdSymbolic[ID_BROADCAST])) {
+      Ok = parseSwizzleBroadcast(Imm);
+    } else if (trySkipId(IdSymbolic[ID_SWAP])) {
+      Ok = parseSwizzleSwap(Imm);
+    } else if (trySkipId(IdSymbolic[ID_REVERSE])) {
+      Ok = parseSwizzleReverse(Imm);
+    } else {
+      Error(ModeLoc, "expected a swizzle mode");
+    }
+
+    return Ok && skipToken(AsmToken::RParen, "expected a closing parentheses");
+  }
+
+  return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t Imm = 0;
+
+  if (trySkipId("offset")) {
+
+    bool Ok = false;
+    if (skipToken(AsmToken::Colon, "expected a colon")) {
+      if (trySkipId("swizzle")) {
+        Ok = parseSwizzleMacro(Imm);
+      } else {
+        Ok = parseSwizzleOffset(Imm);
+      }
+    }
+
+    Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle));
+
+    return Ok? MatchOperand_Success : MatchOperand_ParseFail;
+  } else {
+    return MatchOperand_NoMatch;
+  }
+}
+
+bool
+AMDGPUOperand::isSwizzle() const {
+  return isImmTy(ImmTySwizzle);
+}
+
+//===----------------------------------------------------------------------===//
 // sopp branch targets
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 357e18108e7e8..fc516c3b39c28 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -145,10 +145,10 @@ class DS_1A2D_Off8_RET<string opName,
   let hasPostISelHook = 1;
 }
 
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
 : DS_Pseudo<opName,
   (outs rc:$vdst),
-  (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+  (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
   "$vdst, $addr$offset$gds"> {
 
   let has_data0 = 0;
@@ -440,7 +440,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
 def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
 
 let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
 }
 
 let mayStore = 0 in {
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a817ff3cbaf09..523eea41897ea 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -1160,6 +1160,112 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
   O << SImm16; // Unknown simm16 code.
 }
 
+static void printSwizzleBitmask(const uint16_t AndMask,
+                                const uint16_t OrMask,
+                                const uint16_t XorMask,
+                                raw_ostream &O) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  uint16_t Probe0 = ((0            & AndMask) | OrMask) ^ XorMask;
+  uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask;
+
+  O << "\"";
+
+  for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) {
+    uint16_t p0 = Probe0 & Mask;
+    uint16_t p1 = Probe1 & Mask;
+
+    if (p0 == p1) {
+      if (p0 == 0) {
+        O << "0";
+      } else {
+        O << "1";
+      }
+    } else {
+      if (p0 == 0) {
+        O << "p";
+      } else {
+        O << "i";
+      }
+    }
+  }
+
+  O << "\"";
+}
+
+void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == 0) {
+    return;
+  }
+
+  O << " offset:";
+
+  if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
+
+    O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
+    for (auto i = 0; i < LANE_NUM; ++i) {
+      O << ",";
+      O << formatDec(Imm & LANE_MASK);
+      Imm >>= LANE_SHIFT;
+    }
+    O << ")";
+
+  } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) {
+
+    uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK;
+    uint16_t OrMask  = (Imm >> BITMASK_OR_SHIFT)  & BITMASK_MASK;
+    uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK;
+
+    if (AndMask == BITMASK_MAX &&
+        OrMask == 0 &&
+        countPopulation(XorMask) == 1) {
+
+      O << "swizzle(" << IdSymbolic[ID_SWAP];
+      O << ",";
+      O << formatDec(XorMask);
+      O << ")";
+
+    } else if (AndMask == BITMASK_MAX &&
+               OrMask == 0 && XorMask > 0 &&
+               isPowerOf2_64(XorMask + 1)) {
+
+      O << "swizzle(" << IdSymbolic[ID_REVERSE];
+      O << ",";
+      O << formatDec(XorMask + 1);
+      O << ")";
+
+    } else {
+
+      uint16_t GroupSize = BITMASK_MAX - AndMask + 1;
+      if (GroupSize > 1 &&
+          isPowerOf2_64(GroupSize) &&
+          OrMask < GroupSize &&
+          XorMask == 0) {
+
+        O << "swizzle(" << IdSymbolic[ID_BROADCAST];
+        O << ",";
+        O << formatDec(GroupSize);
+        O << ",";
+        O << formatDec(OrMask);
+        O << ")";
+
+      } else {
+        O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM];
+        O << ",";
+        printSwizzleBitmask(AndMask, OrMask, XorMask, O);
+        O << ")";
+      }
+    }
+  } else {
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
 void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
                                       const MCSubtargetInfo &STI,
                                       raw_ostream &O) {
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index c0b8e5c510893..c8094c4b840a1 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -193,6 +193,8 @@ private:
                    raw_ostream &O);
   void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
+  void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
   void printWaitFlag(const MCInst *MI, unsigned OpNo,
                      const MCSubtargetInfo &STI, raw_ostream &O);
   void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 80967edee0ab1..5cd90323ff67b 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -281,6 +281,46 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
 
 } // namespace Hwreg
 
+namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
+
+enum Id { // id of symbolic names
+  ID_QUAD_PERM = 0,
+  ID_BITMASK_PERM,
+  ID_SWAP,
+  ID_REVERSE,
+  ID_BROADCAST
+};
+
+enum EncBits {
+
+  // swizzle mode encodings
+
+  QUAD_PERM_ENC         = 0x8000,
+  QUAD_PERM_ENC_MASK    = 0xFF00,
+
+  BITMASK_PERM_ENC      = 0x0000,
+  BITMASK_PERM_ENC_MASK = 0x8000,
+
+  // QUAD_PERM encodings
+
+  LANE_MASK             = 0x3,
+  LANE_MAX              = LANE_MASK,
+  LANE_SHIFT            = 2,
+  LANE_NUM              = 4,
+
+  // BITMASK_PERM encodings
+
+  BITMASK_MASK          = 0x1F,
+  BITMASK_MAX           = BITMASK_MASK,
+  BITMASK_WIDTH         = 5,
+
+  BITMASK_AND_SHIFT     = 0,
+  BITMASK_OR_SHIFT      = 5,
+  BITMASK_XOR_SHIFT     = 10
+};
+
+} // namespace Swizzle
+
 namespace SDWA {
 
 enum SdwaSel {
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b5e3ce3dfe3ed..e22166d03e9ae 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -826,7 +826,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::RETURN ||
-      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
@@ -1149,8 +1150,10 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
   // instruction, update the upper-bound of the appropriate counter's
   // bracket and the destination operand scores.
   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
-  if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) {
-    if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+  uint64_t TSFlags = Inst.getDesc().TSFlags;
+  if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
+    if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
+	TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
     } else {
@@ -1183,7 +1186,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
     if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
-        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) {
+        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
@@ -1715,6 +1718,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   MLI = &getAnalysis<MachineLoopInfo>();
   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   AMDGPUASI = ST->getAMDGPUAS();
 
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1859,5 +1863,19 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  if (!MFI->isEntryFunction()) {
+    // Wait for any outstanding memory operations that the input registers may
+    // depend on. We can't track them and it's better to to the wait after the
+    // costly call sequence.
+
+    // TODO: Could insert earlier and schedule more liberally with operations
+    // that only use caller preserved registers.
+    MachineBasicBlock &EntryBB = MF.front();
+    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      .addImm(0);
+
+    Modified = true;
+  }
+
   return Modified;
 }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index c5287c7f64ba4..445bf79a7814e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -383,6 +383,14 @@ def SendMsgMatchClass : AsmOperandClass {
   let RenderMethod = "addImmOperands";
 }
 
+def SwizzleMatchClass : AsmOperandClass {
+  let Name = "Swizzle";
+  let PredicateMethod = "isSwizzle";
+  let ParserMethod = "parseSwizzleOp";
+  let RenderMethod = "addImmOperands";
+  let IsOptional = 1;
+}
+
 def ExpTgtMatchClass : AsmOperandClass {
   let Name = "ExpTgt";
   let PredicateMethod = "isExpTgt";
@@ -395,6 +403,11 @@ def SendMsgImm : Operand<i32> {
   let ParserMatchClass = SendMsgMatchClass;
 }
 
+def SwizzleImm : Operand<i16> {
+  let PrintMethod = "printSwizzle";
+  let ParserMatchClass = SwizzleMatchClass;
+}
+
 def SWaitMatchClass : AsmOperandClass {
   let Name = "SWaitCnt";
   let RenderMethod = "addImmOperands";
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index b91cdddc5520f..a648c178101a1 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -66,6 +66,12 @@ public:
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
+  // Stack access is very expensive. CSRs are also the high registers, and we
+  // want to minimize the number of used registers.
+  unsigned getCSRFirstUseCost() const override {
+    return 100;
+  }
+
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index b6868de6a74e3..03b11ae80500e 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -65,5 +65,18 @@ const char* const IdSymbolic[] = {
 };
 
 } // namespace Hwreg
+
+namespace Swizzle {
+
+// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+  "QUAD_PERM",
+  "BITMASK_PERM",
+  "SWAP",
+  "REVERSE",
+  "BROADCAST",
+};
+
+} // namespace Swizzle
 } // namespace AMDGPU
 } // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index b2dc2c0e364cd..ebb2be22b4879 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
 extern const char* const IdSymbolic[];
 
 } // namespace Hwreg
+
+namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Swizzle
 } // namespace AMDGPU
 } // namespace llvm
 
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 90baabcdb6520..ec49f0d37af44 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -757,14 +757,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
   MI.eraseFromParent();
 }
 
-static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
-  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
-    MBB->addLiveIn(*I);
-}
-
 /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
-/// possible. This only gets used at -O0 so we don't care about efficiency of the
-/// generated code.
+/// possible. This only gets used at -O0 so we don't care about efficiency of
+/// the generated code.
 bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      unsigned LdrexOp, unsigned StrexOp,
@@ -773,16 +768,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   bool IsThumb = STI->isThumb();
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
-  MachineOperand &Dest = MI.getOperand(0);
+  const MachineOperand &Dest = MI.getOperand(0);
   unsigned StatusReg = MI.getOperand(1).getReg();
-  MachineOperand &Addr = MI.getOperand(2);
-  MachineOperand &Desired = MI.getOperand(3);
-  MachineOperand &New = MI.getOperand(4);
-
-  LivePhysRegs LiveRegs(TII->getRegisterInfo());
-  LiveRegs.addLiveOuts(MBB);
-  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
-    LiveRegs.stepBackward(*I);
+  bool StatusDead = MI.getOperand(1).isDead();
+  // Duplicating undef operands into 2 instructions does not guarantee the same
+  // value on both; However undef should be replaced by xzr anyway.
+  assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+  unsigned AddrReg = MI.getOperand(2).getReg();
+  unsigned DesiredReg = MI.getOperand(3).getReg();
+  unsigned NewReg = MI.getOperand(4).getReg();
 
   MachineFunction *MF = MBB.getParent();
   auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -795,25 +789,35 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
 
   if (UxtOp) {
     MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
-            .addReg(Desired.getReg(), RegState::Kill);
+        BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg)
+            .addReg(DesiredReg, RegState::Kill);
     if (!IsThumb)
       MIB.addImm(0);
     MIB.add(predOps(ARMCC::AL));
   }
 
   // .Lloadcmp:
+  //     mov wStatus, #0
   //     ldrex rDest, [rAddr]
   //     cmp rDest, rDesired
   //     bne .Ldone
-  LoadCmpBB->addLiveIn(Addr.getReg());
-  LoadCmpBB->addLiveIn(Dest.getReg());
-  LoadCmpBB->addLiveIn(Desired.getReg());
-  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+  if (!StatusDead) {
+    if (IsThumb) {
+      BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg)
+        .addDef(ARM::CPSR, RegState::Dead)
+        .addImm(0)
+        .add(predOps(ARMCC::AL));
+    } else {
+      BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg)
+        .addImm(0)
+        .add(predOps(ARMCC::AL))
+        .add(condCodeOp());
+    }
+  }
 
   MachineInstrBuilder MIB;
   MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
-  MIB.addReg(Addr.getReg());
+  MIB.addReg(AddrReg);
   if (LdrexOp == ARM::t2LDREX)
     MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
   MIB.add(predOps(ARMCC::AL));
@@ -821,7 +825,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
   BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
       .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
-      .add(Desired)
+      .addReg(DesiredReg)
       .add(predOps(ARMCC::AL));
   unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
   BuildMI(LoadCmpBB, DL, TII->get(Bcc))
@@ -835,21 +839,16 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   //     strex rStatus, rNew, [rAddr]
   //     cmp rStatus, #0
   //     bne .Lloadcmp
-  StoreBB->addLiveIn(Addr.getReg());
-  StoreBB->addLiveIn(New.getReg());
-  addPostLoopLiveIns(StoreBB, LiveRegs);
-
-
-  MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
-  MIB.add(New);
-  MIB.add(Addr);
+  MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg)
+    .addReg(NewReg)
+    .addReg(AddrReg);
   if (StrexOp == ARM::t2STREX)
     MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
   MIB.add(predOps(ARMCC::AL));
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
   BuildMI(StoreBB, DL, TII->get(CMPri))
-      .addReg(StatusReg, RegState::Kill)
+      .addReg(StatusReg, getKillRegState(StatusDead))
       .addImm(0)
       .add(predOps(ARMCC::AL));
   BuildMI(StoreBB, DL, TII->get(Bcc))
@@ -861,12 +860,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
 
   DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
   DoneBB->transferSuccessors(&MBB);
-  addPostLoopLiveIns(DoneBB, LiveRegs);
 
   MBB.addSuccessor(LoadCmpBB);
 
   NextMBBI = MBB.end();
   MI.eraseFromParent();
+
+  // Recompute livein lists.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  LivePhysRegs LiveRegs;
+  computeLiveIns(LiveRegs, MRI, *DoneBB);
+  computeLiveIns(LiveRegs, MRI, *StoreBB);
+  computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+  // Do an extra pass around the loop to get loop carried registers right.
+  StoreBB->clearLiveIns();
+  computeLiveIns(LiveRegs, MRI, *StoreBB);
+  LoadCmpBB->clearLiveIns();
+  computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
   return true;
 }
 
@@ -894,19 +905,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   DebugLoc DL = MI.getDebugLoc();
   MachineOperand &Dest = MI.getOperand(0);
   unsigned StatusReg = MI.getOperand(1).getReg();
-  MachineOperand &Addr = MI.getOperand(2);
-  MachineOperand &Desired = MI.getOperand(3);
-  MachineOperand &New = MI.getOperand(4);
+  bool StatusDead = MI.getOperand(1).isDead();
+  // Duplicating undef operands into 2 instructions does not guarantee the same
+  // value on both; However undef should be replaced by xzr anyway.
+  assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+  unsigned AddrReg = MI.getOperand(2).getReg();
+  unsigned DesiredReg = MI.getOperand(3).getReg();
+  MachineOperand New = MI.getOperand(4);
+  New.setIsKill(false);
 
   unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
   unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
-  unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
-  unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
-
-  LivePhysRegs LiveRegs(TII->getRegisterInfo());
-  LiveRegs.addLiveOuts(MBB);
-  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
-    LiveRegs.stepBackward(*I);
+  unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
+  unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
 
   MachineFunction *MF = MBB.getParent();
   auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -922,26 +933,21 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   //     cmp rDestLo, rDesiredLo
   //     sbcs rStatus<dead>, rDestHi, rDesiredHi
   //     bne .Ldone
-  LoadCmpBB->addLiveIn(Addr.getReg());
-  LoadCmpBB->addLiveIn(Dest.getReg());
-  LoadCmpBB->addLiveIn(Desired.getReg());
-  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
   unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
   MachineInstrBuilder MIB;
   MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
   addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
-  MIB.addReg(Addr.getReg()).add(predOps(ARMCC::AL));
+  MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
 
   unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
   BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
       .addReg(DestLo, getKillRegState(Dest.isDead()))
-      .addReg(DesiredLo, getKillRegState(Desired.isDead()))
+      .addReg(DesiredLo)
       .add(predOps(ARMCC::AL));
 
   BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
       .addReg(DestHi, getKillRegState(Dest.isDead()))
-      .addReg(DesiredHi, getKillRegState(Desired.isDead()))
+      .addReg(DesiredHi)
       .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill);
 
   unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
@@ -956,18 +962,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   //     strexd rStatus, rNewLo, rNewHi, [rAddr]
   //     cmp rStatus, #0
   //     bne .Lloadcmp
-  StoreBB->addLiveIn(Addr.getReg());
-  StoreBB->addLiveIn(New.getReg());
-  addPostLoopLiveIns(StoreBB, LiveRegs);
-
   unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
   MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
   addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
-  MIB.add(Addr).add(predOps(ARMCC::AL));
+  MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
   BuildMI(StoreBB, DL, TII->get(CMPri))
-      .addReg(StatusReg, RegState::Kill)
+      .addReg(StatusReg, getKillRegState(StatusDead))
       .addImm(0)
       .add(predOps(ARMCC::AL));
   BuildMI(StoreBB, DL, TII->get(Bcc))
@@ -979,12 +981,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
 
   DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
   DoneBB->transferSuccessors(&MBB);
-  addPostLoopLiveIns(DoneBB, LiveRegs);
 
   MBB.addSuccessor(LoadCmpBB);
 
   NextMBBI = MBB.end();
   MI.eraseFromParent();
+
+  // Recompute livein lists.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  LivePhysRegs LiveRegs;
+  computeLiveIns(LiveRegs, MRI, *DoneBB);
+  computeLiveIns(LiveRegs, MRI, *StoreBB);
+  computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+  // Do an extra pass around the loop to get loop carried registers right.
+  StoreBB->clearLiveIns();
+  computeLiveIns(LiveRegs, MRI, *StoreBB);
+  LoadCmpBB->clearLiveIns();
+  computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
   return true;
 }
 
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 4f7a0ab4e2203..c2b2502843c0a 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -968,8 +968,9 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
       if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
         continue;
 
-      bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
-      if (!isLiveIn)
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      bool isLiveIn = MRI.isLiveIn(Reg);
+      if (!isLiveIn && !MRI.isReserved(Reg))
         MBB.addLiveIn(Reg);
       // If NoGap is true, push consecutive registers and then leave the rest
       // for other instructions. e.g.
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index bee83dfb6f636..423f97ccacd64 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1413,7 +1413,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 
 // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
 // and make use of the same compressed jump table format as Thumb-2.
-let Size = 2 in {
+let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
+    isIndirectBranch = 1 in {
 def tTBB_JT : tPseudoInst<(outs),
         (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
         Sched<[WriteBr]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index bf3d820e7b7d0..45471a4e95b39 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3494,7 +3494,8 @@ def t2B   : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
   let AsmMatchConverter = "cvtThumbBranches";
 }
 
-let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+let Size = 4, isNotDuplicable = 1, isBranch = 1, isTerminator = 1,
+    isBarrier = 1, isIndirectBranch = 1 in {
 
 // available in both v8-M.Baseline and Thumb2 targets
 def t2BR_JT : t2basePseudoInst<(outs),
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index c4f23c66e4eab..f5e4043882ff7 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -382,7 +382,7 @@ namespace {
 /// ARM Code Generator Pass Configuration Options.
 class ARMPassConfig : public TargetPassConfig {
 public:
-  ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+  ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   ARMBaseTargetMachine &getARMTargetMachine() const {
@@ -419,7 +419,7 @@ INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
                 "ARM Execution Dependency Fix", false, false)
 
 TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new ARMPassConfig(this, PM);
+  return new ARMPassConfig(*this, PM);
 }
 
 void ARMPassConfig::addIRPasses() {
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index e5eb27114c726..2fcee73228fe7 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -60,6 +60,10 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 /// ARM/Thumb little endian target machine.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 40bf545e83224..b0d1d3fb9ef0b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -729,6 +729,15 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
     // linker can handle it. GNU AS produces an error in this case.
     if (Sym->isExternal() || Value >= 0x400004)
       IsResolved = false;
+    // When an ARM function is called from a Thumb function, produce a
+    // relocation so the linker will use the correct branch instruction for ELF
+    // binaries.
+    if (Sym->isELF()) {
+      unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType();
+      if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) &&
+          !Asm.isThumbFunc(Sym))
+        IsResolved = false;
+    }
   }
   // We must always generate a relocation for BL/BLX instructions if we have
   // a symbol to reference, as the linker relies on knowing the destination
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index f917c35b9cebb..f10427e2ed575 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -698,13 +698,14 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
       CopyRegs.insert(ArgReg);
 
   // Push the low registers and lr
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   if (!LoRegsToSave.empty()) {
     MachineInstrBuilder MIB =
         BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
     for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
       if (LoRegsToSave.count(Reg)) {
-        bool isKill = !MF.getRegInfo().isLiveIn(Reg);
-        if (isKill)
+        bool isKill = !MRI.isLiveIn(Reg);
+        if (isKill && !MRI.isReserved(Reg))
           MBB.addLiveIn(Reg);
 
         MIB.addReg(Reg, getKillRegState(isKill));
@@ -746,8 +747,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
     SmallVector<unsigned, 4> RegsToPush;
     while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
       if (HiRegsToSave.count(*HiRegToSave)) {
-        bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave);
-        if (isKill)
+        bool isKill = !MRI.isLiveIn(*HiRegToSave);
+        if (isKill && !MRI.isReserved(*HiRegToSave))
           MBB.addLiveIn(*HiRegToSave);
 
         // Emit a MOV from the high reg to the low reg.
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index ef9c00e4b784e..7d3faac1dcc20 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -1500,9 +1500,9 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
   unsigned DstReg = MI.getOperand(0).getReg();
 
   // BB:
-  // cp 0, N
+  // cpi N, 0
   // breq RemBB
-  BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0);
+  BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
   BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
 
   // LoopBB:
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index f10ca394f36ce..5dd8b2c27b212 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -904,7 +904,7 @@ let Defs = [SREG] in
   // Compares a register with an 8 bit immediate.
   def CPIRdK : FRdK<0b0011,
                     (outs),
-                    (ins GPR8:$rd, imm_ldi8:$k),
+                    (ins LD8:$rd, imm_ldi8:$k),
                     "cpi\t$rd, $k",
                     [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
 }
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index fb3262916b4fd..2ab0b1080c6af 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -57,7 +57,7 @@ namespace {
 /// AVR Code Generator Pass Configuration Options.
 class AVRPassConfig : public TargetPassConfig {
 public:
-  AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM)
+  AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM)
       : TargetPassConfig(TM, PM) {}
 
   AVRTargetMachine &getAVRTargetMachine() const {
@@ -71,7 +71,7 @@ public:
 } // namespace
 
 TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new AVRPassConfig(this, PM);
+  return new AVRPassConfig(*this, PM);
 }
 
 extern "C" void LLVMInitializeAVRTarget() {
diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h
index 10345193d14af..795e94e6af03a 100644
--- a/lib/Target/AVR/AVRTargetMachine.h
+++ b/lib/Target/AVR/AVRTargetMachine.h
@@ -41,6 +41,10 @@ public:
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
+
 private:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   AVRSubtarget SubTarget;
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 897695633e46b..cf8e735409042 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -58,7 +58,7 @@ namespace {
 // BPF Code Generator Pass Configuration Options.
 class BPFPassConfig : public TargetPassConfig {
 public:
-  BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM)
+  BPFPassConfig(BPFTargetMachine &TM, PassManagerBase &PM)
       : TargetPassConfig(TM, PM) {}
 
   BPFTargetMachine &getBPFTargetMachine() const {
@@ -70,7 +70,7 @@ public:
 }
 
 TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new BPFPassConfig(this, PM);
+  return new BPFPassConfig(*this, PM);
 }
 
 // Install an instruction selector pass using
diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt
index e2654b0465df1..4918653ff19da 100644
--- a/lib/Target/BPF/CMakeLists.txt
+++ b/lib/Target/BPF/CMakeLists.txt
@@ -4,7 +4,7 @@ tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM BPFGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv)
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 8e10c521a77d3..e4434136bf866 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -71,6 +71,9 @@ public:
     return true;
   }
 
+  bool ComplexPatternFuncMutatesDAG() const override {
+    return true;
+  }
   void PreprocessISelDAG() override;
   void EmitFunctionEntryCode() override;
 
@@ -81,6 +84,7 @@ public:
   inline bool SelectAddrGP(SDValue &N, SDValue &R);
   bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP);
   bool SelectAddrFI(SDValue &N, SDValue &R);
+  bool DetectUseSxtw(SDValue &N, SDValue &R);
 
   StringRef getPassName() const override {
     return "Hexagon DAG->DAG Pattern Instruction Selection";
@@ -106,7 +110,6 @@ public:
   void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
   void SelectStore(SDNode *N);
   void SelectSHL(SDNode *N);
-  void SelectMul(SDNode *N);
   void SelectZeroExtend(SDNode *N);
   void SelectIntrinsicWChain(SDNode *N);
   void SelectIntrinsicWOChain(SDNode *N);
@@ -118,7 +121,7 @@ public:
   #include "HexagonGenDAGISel.inc"
 
 private:
-  bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src);
+  bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
   bool isOrEquivalentToAdd(const SDNode *N) const;
   bool isAlignedMemNode(const MemSDNode *N) const;
   bool isPositiveHalfWord(const SDNode *N) const;
@@ -597,90 +600,6 @@ void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
   SelectCode(ST);
 }
 
-void HexagonDAGToDAGISel::SelectMul(SDNode *N) {
-  SDLoc dl(N);
-
-  // %conv.i = sext i32 %tmp1 to i64
-  // %conv2.i = sext i32 %add to i64
-  // %mul.i = mul nsw i64 %conv2.i, %conv.i
-  //
-  //   --- match with the following ---
-  //
-  // %mul.i = mpy (%tmp1, %add)
-  //
-
-  if (N->getValueType(0) == MVT::i64) {
-    // Shifting a i64 signed multiply.
-    SDValue MulOp0 = N->getOperand(0);
-    SDValue MulOp1 = N->getOperand(1);
-
-    SDValue OP0;
-    SDValue OP1;
-
-    // Handle sign_extend and sextload.
-    if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
-      SDValue Sext0 = MulOp0.getOperand(0);
-      if (Sext0.getNode()->getValueType(0) != MVT::i32) {
-        SelectCode(N);
-        return;
-      }
-      OP0 = Sext0;
-    } else if (MulOp0.getOpcode() == ISD::LOAD) {
-      LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
-      if (LD->getMemoryVT() != MVT::i32 ||
-          LD->getExtensionType() != ISD::SEXTLOAD ||
-          LD->getAddressingMode() != ISD::UNINDEXED) {
-        SelectCode(N);
-        return;
-      }
-      SDValue Chain = LD->getChain();
-      SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
-      OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
-                                            MVT::Other,
-                                            LD->getBasePtr(), TargetConst0,
-                                            Chain), 0);
-    } else {
-      SelectCode(N);
-      return;
-    }
-
-    // Same goes for the second operand.
-    if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
-      SDValue Sext1 = MulOp1.getOperand(0);
-      if (Sext1.getNode()->getValueType(0) != MVT::i32) {
-        SelectCode(N);
-        return;
-      }
-      OP1 = Sext1;
-    } else if (MulOp1.getOpcode() == ISD::LOAD) {
-      LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
-      if (LD->getMemoryVT() != MVT::i32 ||
-          LD->getExtensionType() != ISD::SEXTLOAD ||
-          LD->getAddressingMode() != ISD::UNINDEXED) {
-        SelectCode(N);
-        return;
-      }
-      SDValue Chain = LD->getChain();
-      SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
-      OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
-                                            MVT::Other,
-                                            LD->getBasePtr(), TargetConst0,
-                                            Chain), 0);
-    } else {
-      SelectCode(N);
-      return;
-    }
-
-    // Generate a mpy instruction.
-    SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl,
-                                            MVT::i64, OP0, OP1);
-    ReplaceNode(N, Result);
-    return;
-  }
-
-  SelectCode(N);
-}
-
 void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
   SDLoc dl(N);
   SDValue Shl_0 = N->getOperand(0);
@@ -843,7 +762,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
 
   SDValue V = N->getOperand(1);
   SDValue U;
-  if (isValueExtension(V, Bits, U)) {
+  if (keepsLowBits(V, Bits, U)) {
     SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                                 N->getOperand(0), U);
     ReplaceNode(N, R.getNode());
@@ -949,7 +868,6 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
   case ISD::SHL:                  return SelectSHL(N);
   case ISD::LOAD:                 return SelectLoad(N);
   case ISD::STORE:                return SelectStore(N);
-  case ISD::MUL:                  return SelectMul(N);
   case ISD::ZERO_EXTEND:          return SelectZeroExtend(N);
   case ISD::INTRINSIC_W_CHAIN:    return SelectIntrinsicWChain(N);
   case ISD::INTRINSIC_WO_CHAIN:   return SelectIntrinsicWOChain(N);
@@ -1327,7 +1245,7 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
 }
 
 // Match a frame index that can be used in an addressing mode.
-bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) {
+bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) {
   if (N.getOpcode() != ISD::FrameIndex)
     return false;
   auto &HFI = *HST->getFrameLowering();
@@ -1388,16 +1306,83 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
   return false;
 }
 
-bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
-      unsigned FromBits, SDValue &Src) {
+bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
+  // This (complex pattern) function is meant to detect a sign-extension
+  // i32->i64 on a per-operand basis. This would allow writing single
+  // patterns that would cover a number of combinations of different ways
+  // a sign-extensions could be written. For example:
+  //   (mul (DetectUseSxtw x) (DetectUseSxtw y)) -> (M2_dpmpyss_s0 x y)
+  // could match either one of these:
+  //   (mul (sext x) (sext_inreg y))
+  //   (mul (sext-load *p) (sext_inreg y))
+  //   (mul (sext_inreg x) (sext y))
+  // etc.
+  //
+  // The returned value will have type i64 and its low word will
+  // contain the value being extended. The high bits are not specified.
+  // The returned type is i64 because the original type of N was i64,
+  // but the users of this function should only use the low-word of the
+  // result, e.g.
+  //  (mul sxtw:x, sxtw:y) -> (M2_dpmpyss_s0 (LoReg sxtw:x), (LoReg sxtw:y))
+
+  if (N.getValueType() != MVT::i64)
+    return false;
+  EVT SrcVT;
+  unsigned Opc = N.getOpcode();
+  switch (Opc) {
+    case ISD::SIGN_EXTEND:
+    case ISD::SIGN_EXTEND_INREG: {
+      // sext_inreg has the source type as a separate operand.
+      EVT T = Opc == ISD::SIGN_EXTEND
+                ? N.getOperand(0).getValueType()
+                : cast<VTSDNode>(N.getOperand(1))->getVT();
+      if (T.getSizeInBits() != 32)
+        return false;
+      R = N.getOperand(0);
+      break;
+    }
+    case ISD::LOAD: {
+      LoadSDNode *L = cast<LoadSDNode>(N);
+      if (L->getExtensionType() != ISD::SEXTLOAD)
+        return false;
+      // All extending loads extend to i32, so even if the value in
+      // memory is shorter than 32 bits, it will be i32 after the load.
+      if (L->getMemoryVT().getSizeInBits() > 32)
+        return false;
+      R = N;
+      break;
+    }
+    default:
+      return false;
+  }
+  EVT RT = R.getValueType();
+  if (RT == MVT::i64)
+    return true;
+  assert(RT == MVT::i32);
+  // This is only to produce a value of type i64. Do not rely on the
+  // high bits produced by this.
+  const SDLoc &dl(N);
+  SDValue Ops[] = {
+    CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32),
+    R, CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32),
+    R, CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32)
+  };
+  SDNode *T = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
+                                     MVT::i64, Ops);
+  R = SDValue(T, 0);
+  return true;
+}
+
+bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits,
+      SDValue &Src) {
   unsigned Opc = Val.getOpcode();
   switch (Opc) {
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: {
-    SDValue const &Op0 = Val.getOperand(0);
+    const SDValue &Op0 = Val.getOperand(0);
     EVT T = Op0.getValueType();
-    if (T.isInteger() && T.getSizeInBits() == FromBits) {
+    if (T.isInteger() && T.getSizeInBits() == NumBits) {
       Src = Op0;
       return true;
     }
@@ -1408,23 +1393,23 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
   case ISD::AssertZext:
     if (Val.getOperand(0).getValueType().isInteger()) {
       VTSDNode *T = cast<VTSDNode>(Val.getOperand(1));
-      if (T->getVT().getSizeInBits() == FromBits) {
+      if (T->getVT().getSizeInBits() == NumBits) {
         Src = Val.getOperand(0);
         return true;
       }
     }
     break;
   case ISD::AND: {
-    // Check if this is an AND with "FromBits" of lower bits set to 1.
-    uint64_t FromMask = (1 << FromBits) - 1;
+    // Check if this is an AND with NumBits of lower bits set to 1.
+    uint64_t Mask = (1 << NumBits) - 1;
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
-      if (C->getZExtValue() == FromMask) {
+      if (C->getZExtValue() == Mask) {
         Src = Val.getOperand(1);
         return true;
       }
     }
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
-      if (C->getZExtValue() == FromMask) {
+      if (C->getZExtValue() == Mask) {
         Src = Val.getOperand(0);
         return true;
       }
@@ -1433,16 +1418,16 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
   }
   case ISD::OR:
   case ISD::XOR: {
-    // OR/XOR with the lower "FromBits" bits set to 0.
-    uint64_t FromMask = (1 << FromBits) - 1;
+    // OR/XOR with the lower NumBits bits set to 0.
+    uint64_t Mask = (1 << NumBits) - 1;
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
-      if ((C->getZExtValue() & FromMask) == 0) {
+      if ((C->getZExtValue() & Mask) == 0) {
         Src = Val.getOperand(1);
         return true;
       }
     }
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
-      if ((C->getZExtValue() & FromMask) == 0) {
+      if ((C->getZExtValue() & Mask) == 0) {
         Src = Val.getOperand(0);
         return true;
       }
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 5ecf9320d5c26..4c6c6eeafbe0e 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1928,11 +1928,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
   setOperationAction(ISD::BSWAP, MVT::i64, Legal);
-
-  // We custom lower i64 to i64 mul, so that it is not considered as a legal
-  // operation. There is a pattern that will match i64 mul and transform it
-  // to a series of instructions.
-  setOperationAction(ISD::MUL,   MVT::i64, Expand);
+  setOperationAction(ISD::MUL,   MVT::i64, Legal);
 
   for (unsigned IntExpOp :
        { ISD::SDIV,      ISD::UDIV,      ISD::SREM,      ISD::UREM,
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 66e07c67958e9..0fef91ec4d3e0 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1769,161 +1769,6 @@ bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
   return getType(MI) == HexagonII::TypeCJ && MI.isBranch();
 }
 
-bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
-  return (MI.isBranch() && isPredicated(MI)) ||
-         isConditionalTransfer(MI) ||
-         isConditionalALU32(MI)    ||
-         isConditionalLoad(MI)     ||
-         // Predicated stores which don't have a .new on any operands.
-         (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
-          !isPredicatedNew(MI));
-}
-
-bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
-  switch (MI.getOpcode()) {
-    case Hexagon::A2_paddf:
-    case Hexagon::A2_paddfnew:
-    case Hexagon::A2_paddif:
-    case Hexagon::A2_paddifnew:
-    case Hexagon::A2_paddit:
-    case Hexagon::A2_padditnew:
-    case Hexagon::A2_paddt:
-    case Hexagon::A2_paddtnew:
-    case Hexagon::A2_pandf:
-    case Hexagon::A2_pandfnew:
-    case Hexagon::A2_pandt:
-    case Hexagon::A2_pandtnew:
-    case Hexagon::A2_porf:
-    case Hexagon::A2_porfnew:
-    case Hexagon::A2_port:
-    case Hexagon::A2_portnew:
-    case Hexagon::A2_psubf:
-    case Hexagon::A2_psubfnew:
-    case Hexagon::A2_psubt:
-    case Hexagon::A2_psubtnew:
-    case Hexagon::A2_pxorf:
-    case Hexagon::A2_pxorfnew:
-    case Hexagon::A2_pxort:
-    case Hexagon::A2_pxortnew:
-    case Hexagon::A4_paslhf:
-    case Hexagon::A4_paslhfnew:
-    case Hexagon::A4_paslht:
-    case Hexagon::A4_paslhtnew:
-    case Hexagon::A4_pasrhf:
-    case Hexagon::A4_pasrhfnew:
-    case Hexagon::A4_pasrht:
-    case Hexagon::A4_pasrhtnew:
-    case Hexagon::A4_psxtbf:
-    case Hexagon::A4_psxtbfnew:
-    case Hexagon::A4_psxtbt:
-    case Hexagon::A4_psxtbtnew:
-    case Hexagon::A4_psxthf:
-    case Hexagon::A4_psxthfnew:
-    case Hexagon::A4_psxtht:
-    case Hexagon::A4_psxthtnew:
-    case Hexagon::A4_pzxtbf:
-    case Hexagon::A4_pzxtbfnew:
-    case Hexagon::A4_pzxtbt:
-    case Hexagon::A4_pzxtbtnew:
-    case Hexagon::A4_pzxthf:
-    case Hexagon::A4_pzxthfnew:
-    case Hexagon::A4_pzxtht:
-    case Hexagon::A4_pzxthtnew:
-    case Hexagon::C2_ccombinewf:
-    case Hexagon::C2_ccombinewt:
-      return true;
-  }
-  return false;
-}
-
-// FIXME - Function name and it's functionality don't match.
-// It should be renamed to hasPredNewOpcode()
-bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
-  if (!MI.getDesc().mayLoad() || !isPredicated(MI))
-    return false;
-
-  int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
-  // Instruction with valid predicated-new opcode can be promoted to .new.
-  return PNewOpcode >= 0;
-}
-
-// Returns true if an instruction is a conditional store.
-//
-// Note: It doesn't include conditional new-value stores as they can't be
-// converted to .new predicate.
-bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const {
-  switch (MI.getOpcode()) {
-    default: return false;
-    case Hexagon::S4_storeirbt_io:
-    case Hexagon::S4_storeirbf_io:
-    case Hexagon::S4_pstorerbt_rr:
-    case Hexagon::S4_pstorerbf_rr:
-    case Hexagon::S2_pstorerbt_io:
-    case Hexagon::S2_pstorerbf_io:
-    case Hexagon::S2_pstorerbt_pi:
-    case Hexagon::S2_pstorerbf_pi:
-    case Hexagon::S2_pstorerdt_io:
-    case Hexagon::S2_pstorerdf_io:
-    case Hexagon::S4_pstorerdt_rr:
-    case Hexagon::S4_pstorerdf_rr:
-    case Hexagon::S2_pstorerdt_pi:
-    case Hexagon::S2_pstorerdf_pi:
-    case Hexagon::S2_pstorerht_io:
-    case Hexagon::S2_pstorerhf_io:
-    case Hexagon::S4_storeirht_io:
-    case Hexagon::S4_storeirhf_io:
-    case Hexagon::S4_pstorerht_rr:
-    case Hexagon::S4_pstorerhf_rr:
-    case Hexagon::S2_pstorerht_pi:
-    case Hexagon::S2_pstorerhf_pi:
-    case Hexagon::S2_pstorerit_io:
-    case Hexagon::S2_pstorerif_io:
-    case Hexagon::S4_storeirit_io:
-    case Hexagon::S4_storeirif_io:
-    case Hexagon::S4_pstorerit_rr:
-    case Hexagon::S4_pstorerif_rr:
-    case Hexagon::S2_pstorerit_pi:
-    case Hexagon::S2_pstorerif_pi:
-
-    // V4 global address store before promoting to dot new.
-    case Hexagon::S4_pstorerdt_abs:
-    case Hexagon::S4_pstorerdf_abs:
-    case Hexagon::S4_pstorerbt_abs:
-    case Hexagon::S4_pstorerbf_abs:
-    case Hexagon::S4_pstorerht_abs:
-    case Hexagon::S4_pstorerhf_abs:
-    case Hexagon::S4_pstorerit_abs:
-    case Hexagon::S4_pstorerif_abs:
-      return true;
-
-    // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
-    // from the "Conditional Store" list. Because a predicated new value store
-    // would NOT be promoted to a double dot new store.
-    // This function returns yes for those stores that are predicated but not
-    // yet promoted to predicate dot new instructions.
-  }
-}
-
-bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
-  switch (MI.getOpcode()) {
-    case Hexagon::A2_tfrt:
-    case Hexagon::A2_tfrf:
-    case Hexagon::C2_cmoveit:
-    case Hexagon::C2_cmoveif:
-    case Hexagon::A2_tfrtnew:
-    case Hexagon::A2_tfrfnew:
-    case Hexagon::C2_cmovenewit:
-    case Hexagon::C2_cmovenewif:
-    case Hexagon::A2_tfrpt:
-    case Hexagon::A2_tfrpf:
-      return true;
-
-    default:
-      return false;
-  }
-  return false;
-}
-
 // TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
 // isFPImm and later getFPImm as well.
 bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
@@ -3474,6 +3319,8 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
 // Returns the opcode to use when converting MI, which is a conditional jump,
 // into a conditional instruction which uses the .new value of the predicate.
 // We also use branch probabilities to add a hint to the jump.
+// If MBPI is null, all edges will be treated as equally likely for the
+// purposes of establishing a predication hint.
 int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
       const MachineBranchProbabilityInfo *MBPI) const {
   // We assume that block can have at most two successors.
@@ -3482,9 +3329,16 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
   bool Taken = false;
   const BranchProbability OneHalf(1, 2);
 
+  auto getEdgeProbability = [MBPI] (const MachineBasicBlock *Src,
+                                    const MachineBasicBlock *Dst) {
+    if (MBPI)
+      return MBPI->getEdgeProbability(Src, Dst);
+    return BranchProbability(1, Src->succ_size());
+  };
+
   if (BrTarget.isMBB()) {
     const MachineBasicBlock *Dst = BrTarget.getMBB();
-    Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf;
+    Taken = getEdgeProbability(Src, Dst) >= OneHalf;
   } else {
     // The branch target is not a basic block (most likely a function).
     // Since BPI only gives probabilities for targets that are basic blocks,
@@ -3521,7 +3375,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
         for (const MachineBasicBlock *SB : B.successors()) {
           if (!B.isLayoutSuccessor(SB))
             continue;
-          Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf;
+          Taken = getEdgeProbability(Src, SB) < OneHalf;
           break;
         }
       } else {
@@ -3534,7 +3388,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
           BT = Op.getMBB();
           break;
         }
-        Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf;
+        Taken = BT && getEdgeProbability(Src, BT) < OneHalf;
       }
     } // if (!Bad)
   }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 97b9bc9546885..944d0161a7c8e 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -314,11 +314,6 @@ public:
   bool isAccumulator(const MachineInstr &MI) const;
   bool isComplex(const MachineInstr &MI) const;
   bool isCompoundBranchInstr(const MachineInstr &MI) const;
-  bool isCondInst(const MachineInstr &MI) const;
-  bool isConditionalALU32 (const MachineInstr &MI) const;
-  bool isConditionalLoad(const MachineInstr &MI) const;
-  bool isConditionalStore(const MachineInstr &MI) const;
-  bool isConditionalTransfer(const MachineInstr &MI) const;
   bool isConstExtended(const MachineInstr &MI) const;
   bool isDeallocRet(const MachineInstr &MI) const;
   bool isDependent(const MachineInstr &ProdMI,
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index e6ea67d55b43c..9aa185fc85a6a 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -59,6 +59,9 @@ cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
   cl::Hidden, cl::init(false),
   cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
 
+static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
+  cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
+
 static const char *HexagonVolatileMemcpyName
   = "hexagon_memcpy_forward_vp4cp4n2";
 
@@ -477,7 +480,7 @@ Value *Simplifier::simplify(Context &C) {
   WorkListType Q;
   Q.push_back(C.Root);
   unsigned Count = 0;
-  const unsigned Limit = 100000;
+  const unsigned Limit = SimplifyLimit;
 
   while (!Q.empty()) {
     if (Count++ >= Limit)
@@ -501,8 +504,7 @@ Value *Simplifier::simplify(Context &C) {
         Q.push_back(Op);
     }
   }
-  assert(Count < Limit && "Infinite loop in HLIR/simplify?");
-  return C.Root;
+  return Count < Limit ? C.Root : nullptr;
 }
 
 
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 81b5e10c11731..70ed123bc8981 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -382,48 +382,42 @@ def: T_MType_acc_pat3 <M4_or_andn, and, or>;
 def: T_MType_acc_pat3 <M4_and_andn, and, and>;
 def: T_MType_acc_pat3 <M4_xor_andn, and, xor>;
 
+// This complex pattern is really only to detect various forms of
+// sign-extension i32->i64. The selected value will be of type i64
+// whose low word is the value being extended. The high word is
+// unspecified.
+def Usxtw : ComplexPattern<i64, 1, "DetectUseSxtw", [], []>;
+
 def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>;
-def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>;
 def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
+def Sext64: PatLeaf<(i64 Usxtw:$Rs)>;
 
-// Return true if for a 32 to 64-bit sign-extended load.
-def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{
-  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
-  if (!LD)
-    return false;
-  return LD->getExtensionType() == ISD::SEXTLOAD &&
-         LD->getMemoryVT().getScalarType() == MVT::i32;
-}]>;
-
-def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)),
-         (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>;
-
-def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)),
-         (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)),
+         (M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>;
 
-def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2),
-         (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>;
+def: Pat<(mul Sext64:$Rs, Sext64:$Rt),
+         (M2_dpmpyss_s0 (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
 
 // Multiply and accumulate, use full result.
 // Rxx[+-]=mpy(Rs,Rt)
 
-def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
-         (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)),
+         (M2_dpmpyss_acc_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
 
-def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
-         (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)),
+         (M2_dpmpyss_nac_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
 
-def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
-         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))),
+         (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
 
-def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
-         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))),
+         (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
 
-def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
-         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))),
+         (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
 
-def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
-         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))),
+         (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
 
 class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset,
                   InstHexagon MI>
@@ -545,7 +539,8 @@ def: Storexm_simple_pat<truncstorei8,  I64, LoReg, S2_storerb_io>;
 def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>;
 def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>;
 
-def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>;
+def: Pat <(i64 (sext I32:$src)),            (A2_sxtw I32:$src)>;
+def: Pat <(i64 (sext_inreg I64:$src, i32)), (A2_sxtw (LoReg I64:$src))>;
 
 def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src),
          (A2_abs IntRegs:$src)>;
@@ -1159,8 +1154,8 @@ multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
   defm: T_MinMax_pats<Op, I64, Inst, SwapInst>;
 }
 
-def: Pat<(add (Sext64 I32:$Rs), I64:$Rt),
-         (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(add Sext64:$Rs, I64:$Rt),
+         (A2_addsp (LoReg Sext64:$Rs), DoubleRegs:$Rt)>;
 
 let AddedComplexity = 200 in {
   defm: MinMax_pats_p<setge,  A2_maxp,  A2_minp>;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 8e93df6201aea..14ecf297d351c 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -223,7 +223,7 @@ namespace {
 /// Hexagon Code Generator Pass Configuration Options.
 class HexagonPassConfig : public TargetPassConfig {
 public:
-  HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+  HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   HexagonTargetMachine &getHexagonTargetMachine() const {
@@ -245,7 +245,7 @@ public:
 } // namespace
 
 TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new HexagonPassConfig(this, PM);
+  return new HexagonPassConfig(*this, PM);
 }
 
 void HexagonPassConfig::addIRPasses() {
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index cd474921d4bc8..fa08afe4019d7 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -273,25 +273,17 @@ bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI,
     if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister())
       return true;
 
-  // Check if this is a predicate dependence.
-  const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg);
-  if (RC == &Hexagon::PredRegsRegClass)
-    return true;
-
-  // Assumes that the first operand of the CALLr is the function address.
-  if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) {
-    const MachineOperand MO = MI.getOperand(0);
-    if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg))
-      return true;
+  // Call-like instructions can be packetized with preceding instructions
+  // that define registers implicitly used or modified by the call. Explicit
+  // uses are still prohibited, as in the case of indirect calls:
+  //   r0 = ...
+  //   J2_jumpr r0
+  if (DepType == SDep::Data) {
+    for (const MachineOperand MO : MI.operands())
+      if (MO.isReg() && MO.getReg() == DepReg && !MO.isImplicit())
+        return true;
   }
 
-  if (HII->isJumpR(MI)) {
-    const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1)
-                                                     : MI.getOperand(0);
-    assert(MO.isReg() && MO.isUse());
-    if (MO.getReg() == DepReg)
-      return true;
-  }
   return false;
 }
 
@@ -333,11 +325,13 @@ bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI,
       const TargetRegisterClass *NewRC) {
   // Vector stores can be predicated, and can be new-value stores, but
   // they cannot be predicated on a .new predicate value.
-  if (NewRC == &Hexagon::PredRegsRegClass)
+  if (NewRC == &Hexagon::PredRegsRegClass) {
     if (HII->isHVXVec(MI) && MI.mayStore())
       return false;
-  return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() ||
-         HII->mayBeNewStore(MI);
+    return HII->isPredicated(MI) && HII->getDotNewPredOp(MI, nullptr) > 0;
+  }
+  // If the class is not PredRegs, it could only apply to new-value stores.
+  return HII->mayBeNewStore(MI);
 }
 
 // Promote an instructiont to its .cur form.
@@ -760,11 +754,14 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
   return false;
 }
 
-static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
+static bool isImplicitDependency(const MachineInstr &I, bool CheckDef,
+      unsigned DepReg) {
   for (auto &MO : I.operands()) {
-    if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+    if (CheckDef && MO.isRegMask() && MO.clobbersPhysReg(DepReg))
       return true;
-    if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
+    if (!MO.isReg() || MO.getReg() != DepReg || !MO.isImplicit())
+      continue;
+    if (CheckDef == MO.isDef())
       return true;
   }
   return false;
@@ -798,7 +795,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
 
   // If dependency is trough an implicitly defined register, we should not
   // newify the use.
-  if (isImplicitDependency(PI, DepReg))
+  if (isImplicitDependency(PI, true, DepReg) ||
+      isImplicitDependency(MI, false, DepReg))
     return false;
 
   const MCInstrDesc& MCID = PI.getDesc();
@@ -808,8 +806,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
 
   // predicate .new
   if (RC == &Hexagon::PredRegsRegClass)
-    if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn())
-      return HII->predCanBeUsedAsDotNew(PI, DepReg);
+    return HII->predCanBeUsedAsDotNew(PI, DepReg);
 
   if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI))
     return false;
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
index 2a9bc25d7fadb..a2f005ce445a8 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -76,7 +76,7 @@ namespace {
 // Lanai Code Generator Pass Configuration Options.
 class LanaiPassConfig : public TargetPassConfig {
 public:
-  LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager)
+  LanaiPassConfig(LanaiTargetMachine &TM, PassManagerBase *PassManager)
       : TargetPassConfig(TM, *PassManager) {}
 
   LanaiTargetMachine &getLanaiTargetMachine() const {
@@ -91,7 +91,7 @@ public:
 
 TargetPassConfig *
 LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
-  return new LanaiPassConfig(this, &PassManager);
+  return new LanaiPassConfig(*this, &PassManager);
 }
 
 // Install an instruction selector pass.
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index 5278c70d909da..083ba6fdf8416 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -49,6 +49,10 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 } // namespace llvm
 
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index bebe5fa35ad42..d8fdc8ba674e6 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -52,7 +52,7 @@ namespace {
 /// MSP430 Code Generator Pass Configuration Options.
 class MSP430PassConfig : public TargetPassConfig {
 public:
-  MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+  MSP430PassConfig(MSP430TargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   MSP430TargetMachine &getMSP430TargetMachine() const {
@@ -65,7 +65,7 @@ public:
 } // namespace
 
 TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new MSP430PassConfig(this, PM);
+  return new MSP430PassConfig(*this, PM);
 }
 
 bool MSP430PassConfig::addInstSelector() {
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index e7ceca9612a92..a222080f6b814 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -1,4 +1,4 @@
-//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,20 +11,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Mips16FrameLowering.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16FrameLowering.h"
 #include "Mips16InstrInfo.h"
 #include "MipsInstrInfo.h"
 #include "MipsRegisterInfo.h"
 #include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
 
 using namespace llvm;
 
@@ -63,7 +72,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
 
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
 
-  if (CSI.size()) {
+  if (!CSI.empty()) {
     const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
 
     for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
@@ -80,7 +89,6 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
   if (hasFP(MF))
     BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
       .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
-
 }
 
 void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 092de216e9b89..a9d6ab0558927 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -201,7 +201,7 @@ namespace {
 /// Mips Code Generator Pass Configuration Options.
 class MipsPassConfig : public TargetPassConfig {
 public:
-  MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+  MipsPassConfig(MipsTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {
     // The current implementation of long branch pass requires a scratch
     // register ($at) to be available before branch instructions. Tail merging
@@ -227,7 +227,7 @@ public:
 } // end anonymous namespace
 
 TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new MipsPassConfig(this, PM);
+  return new MipsPassConfig(*this, PM);
 }
 
 void MipsPassConfig::addIRPasses() {
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 140d7133f879b..a3462868cb111 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -66,6 +66,10 @@ public:
 
   bool isLittleEndian() const { return isLittle; }
   const MipsABIInfo &getABI() const { return ABI; }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 /// Mips32/64 big endian target machine.
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index ab5298d0dcfd6..8dfbfece9b8eb 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -132,7 +132,7 @@ namespace {
 
 class NVPTXPassConfig : public TargetPassConfig {
 public:
-  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
       : TargetPassConfig(TM, PM) {}
 
   NVPTXTargetMachine &getNVPTXTargetMachine() const {
@@ -163,7 +163,7 @@ private:
 } // end anonymous namespace
 
 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new NVPTXPassConfig(this, PM);
+  return new NVPTXPassConfig(*this, PM);
 }
 
 void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 1ed8e3b1e9357..2f3981be22f83 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -65,6 +65,9 @@ public:
 
   TargetIRAnalysis getTargetIRAnalysis() override;
 
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 }; // NVPTXTargetMachine.
 
 class NVPTXTargetMachine32 : public NVPTXTargetMachine {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 5fa7b2c6bfb1b..54414457388d3 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -77,6 +77,11 @@ STATISTIC(SignExtensionsAdded,
           "Number of sign extensions for compare inputs added.");
 STATISTIC(ZeroExtensionsAdded,
           "Number of zero extensions for compare inputs added.");
+STATISTIC(NumLogicOpsOnComparison,
+          "Number of logical ops on i1 values calculated in GPR.");
+STATISTIC(OmittedForNonExtendUses,
+          "Number of compares not eliminated as they have non-extending uses.");
+
 // FIXME: Remove this once the bug has been fixed!
 cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
 cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -275,6 +280,8 @@ private:
 
     bool trySETCC(SDNode *N);
     bool tryEXTEND(SDNode *N);
+    bool tryLogicOpOfCompares(SDNode *N);
+    SDValue computeLogicOpInGPR(SDValue LogicOp);
     SDValue signExtendInputIfNeeded(SDValue Input);
     SDValue zeroExtendInputIfNeeded(SDValue Input);
     SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
@@ -282,6 +289,10 @@ private:
                                 int64_t RHSValue, SDLoc dl);
     SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                 int64_t RHSValue, SDLoc dl);
+    SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
+    SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
     SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
 
     void PeepholePPC64();
@@ -2501,6 +2512,11 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
   return true;
 }
 
+// Is this opcode a bitwise logical operation?
+static bool isLogicOp(unsigned Opc) {
+  return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR;
+}
+
 /// If this node is a sign/zero extension of an integer comparison,
 /// it can usually be computed in GPR's rather than using comparison
 /// instructions and ISEL. We only do this on 64-bit targets for now
@@ -2513,13 +2529,20 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
           N->getOpcode() == ISD::SIGN_EXTEND) &&
           "Expecting a zero/sign extend node!");
 
-  if (N->getOperand(0).getOpcode() != ISD::SETCC)
+  SDValue WideRes;
+  // If we are zero-extending the result of a logical operation on i1
+  // values, we can keep the values in GPRs.
+  if (isLogicOp(N->getOperand(0).getOpcode()) &&
+      N->getOperand(0).getValueType() == MVT::i1 &&
+      N->getOpcode() == ISD::ZERO_EXTEND)
+    WideRes = computeLogicOpInGPR(N->getOperand(0));
+  else if (N->getOperand(0).getOpcode() != ISD::SETCC)
     return false;
-
-  SDValue WideRes =
-    getSETCCInGPR(N->getOperand(0),
-                  N->getOpcode() == ISD::SIGN_EXTEND ?
-                  SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+  else
+    WideRes =
+      getSETCCInGPR(N->getOperand(0),
+                    N->getOpcode() == ISD::SIGN_EXTEND ?
+                    SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
 
   if (!WideRes)
     return false;
@@ -2540,6 +2563,159 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
   return true;
 }
 
+// Lower a logical operation on i1 values into a GPR sequence if possible.
+// The result can be kept in a GPR if requested.
+// Three types of inputs can be handled:
+// - SETCC
+// - TRUNCATE
+// - Logical operation (AND/OR/XOR)
+// There is also a special case that is handled (namely a complement operation
+// achieved with xor %a, -1).
+SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) {
+  assert(isLogicOp(LogicOp.getOpcode()) &&
+         "Can only handle logic operations here.");
+  assert(LogicOp.getValueType() == MVT::i1 &&
+         "Can only handle logic operations on i1 values here.");
+  SDLoc dl(LogicOp);
+  SDValue LHS, RHS;
+
+  // Special case: xor %a, -1
+  bool IsBitwiseNegation = isBitwiseNot(LogicOp);
+
+  // Produces a GPR sequence for each operand of the binary logic operation.
+  // For SETCC, it produces the respective comparison, for TRUNCATE it truncates
+  // the value in a GPR and for logic operations, it will recursively produce
+  // a GPR sequence for the operation.
+  auto getLogicOperand = [&] (SDValue Operand) -> SDValue {
+    unsigned OperandOpcode = Operand.getOpcode();
+    if (OperandOpcode == ISD::SETCC)
+      return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig);
+    else if (OperandOpcode == ISD::TRUNCATE) {
+      SDValue InputOp = Operand.getOperand(0);
+      EVT InVT = InputOp.getValueType();
+      return
+        SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 :
+                                       PPC::RLDICL, dl, InVT, InputOp,
+                                       getI64Imm(0, dl), getI64Imm(63, dl)), 0);
+    } else if (isLogicOp(OperandOpcode))
+      return computeLogicOpInGPR(Operand);
+    return SDValue();
+  };
+  LHS = getLogicOperand(LogicOp.getOperand(0));
+  RHS = getLogicOperand(LogicOp.getOperand(1));
+
+  // If a GPR sequence can't be produced for the LHS we can't proceed.
+  // Not producing a GPR sequence for the RHS is only a problem if this isn't
+  // a bitwise negation operation.
+  if (!LHS || (!RHS && !IsBitwiseNegation))
+    return SDValue();
+
+  NumLogicOpsOnComparison++;
+
+  // We will use the inputs as 64-bit values.
+  if (LHS.getValueType() == MVT::i32)
+    LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext);
+  if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32)
+    RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext);
+
+  unsigned NewOpc;
+  switch (LogicOp.getOpcode()) {
+  default: llvm_unreachable("Unknown logic operation.");
+  case ISD::AND: NewOpc = PPC::AND8; break;
+  case ISD::OR:  NewOpc = PPC::OR8;  break;
+  case ISD::XOR: NewOpc = PPC::XOR8; break;
+  }
+
+  if (IsBitwiseNegation) {
+    RHS = getI64Imm(1, dl);
+    NewOpc = PPC::XORI8;
+  }
+
+  return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0);
+
+}
+
+/// Try performing logical operations on results of comparisons in GPRs.
+/// It is typically preferred from a performance perspective over performing
+/// the operations on individual bits in the CR. We only do this on 64-bit
+/// targets for now as the code is specialized for 64-bit (it uses 64-bit
+/// instructions and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) {
+  if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+    return false;
+  if (N->getValueType(0) != MVT::i1)
+    return false;
+  assert(isLogicOp(N->getOpcode()) &&
+         "Expected a logic operation on setcc results.");
+  SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0));
+  if (!LoweredLogical)
+    return false;
+
+  SDLoc dl(N);
+  bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8;
+  unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt;
+  SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+  SDValue LHS = LoweredLogical.getOperand(0);
+  SDValue RHS = LoweredLogical.getOperand(1);
+  SDValue WideOp;
+  SDValue OpToConvToRecForm;
+
+  // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode
+  // that is input to the XORI.
+  if (IsBitwiseNegate &&
+      LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG)
+    OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1);
+  else if (IsBitwiseNegate)
+    // If the input to the XORI isn't an extension, that's what we're after.
+    OpToConvToRecForm = LoweredLogical.getOperand(0);
+  else
+    // If this is not an XORI, it is a reg-reg logical op and we can convert it
+    // to record-form.
+    OpToConvToRecForm = LoweredLogical;
+
+  // Get the record-form version of the node we're looking to use to get the
+  // CR result from.
+  uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode();
+  int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc);
+
+  // Convert the right node to record-form. This is either the logical we're
+  // looking at or it is the input node to the negation (if we're looking at
+  // a bitwise negation).
+  if (NewOpc != -1 && IsBitwiseNegate) {
+    // The input to the XORI has a record-form. Use it.
+    assert(LoweredLogical.getConstantOperandVal(1) == 1 &&
+           "Expected a PPC::XORI8 only for bitwise negation.");
+    // Emit the record-form instruction.
+    std::vector<SDValue> Ops;
+    for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++)
+      Ops.push_back(OpToConvToRecForm.getOperand(i));
+
+    WideOp =
+      SDValue(CurDAG->getMachineNode(NewOpc, dl,
+                                     OpToConvToRecForm.getValueType(),
+                                     MVT::Glue, Ops), 0);
+  } else {
+    assert((NewOpc != -1 || !IsBitwiseNegate) &&
+           "No record form available for AND8/OR8/XOR8?");
+    WideOp =
+      SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl,
+                                     MVT::i64, MVT::Glue, LHS, RHS), 0);
+  }
+
+  // Select this node to a single bit from CR0 set by the record-form node
+  // just created. For bitwise negation, use the EQ bit which is the equivalent
+  // of negating the result (i.e. it is a bit set when the result of the
+  // operation is zero).
+  SDValue SRIdxVal =
+    CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32);
+  SDValue CRBit =
+    SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+                                   MVT::i1, CR0Reg, SRIdxVal,
+                                   WideOp.getValue(1)), 0);
+  ReplaceNode(N, CRBit.getNode());
+  return true;
+}
+
 /// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
 /// Useful when emitting comparison code for 32-bit values without using
 /// the compare instruction (which only considers the lower 32-bits).
@@ -2677,6 +2853,77 @@ SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
   }
 }
 
+/// Produces a zero-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6)
+    // (zext (setcc %a, 0, seteq)) ->  (lshr (ctlz %a), 6)
+    SDValue Xor = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+    SDValue Clz =
+      SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0);
+    return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz,
+                                          getI64Imm(58, dl), getI64Imm(63, dl)),
+                   0);
+  }
+  }
+}
+
+/// Produces a sign-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
+    // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA)
+    // {addcz.reg, addcz.CA} = (addcarry %a, -1)
+    // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA)
+    SDValue AddInput = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+    SDValue Addic =
+      SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
+                                     AddInput, getI32Imm(~0U, dl)), 0);
+    return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic,
+                                          Addic, Addic.getValue(1)), 0);
+  }
+  }
+}
+
+/// Does this SDValue have any uses for which keeping the value in a GPR is
+/// appropriate. This is meant to be used on values that have type i1 since
+/// it is somewhat meaningless to ask if values of other types can be kept in
+/// GPR's.
+static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
+  assert(Compare.getOpcode() == ISD::SETCC &&
+         "An ISD::SETCC node required here.");
+
+  // For values that have a single use, the caller should obviously already have
+  // checked if that use is an extending use. We check the other uses here.
+  if (Compare.hasOneUse())
+    return true;
+  // We want the value in a GPR if it is being extended, used for a select, or
+  // used in logical operations.
+  for (auto CompareUse : Compare.getNode()->uses())
+    if (CompareUse->getOpcode() != ISD::SIGN_EXTEND &&
+        CompareUse->getOpcode() != ISD::ZERO_EXTEND &&
+        CompareUse->getOpcode() != ISD::SELECT &&
+        !isLogicOp(CompareUse->getOpcode())) {
+      OmittedForNonExtendUses++;
+      return false;
+    }
+  return true;
+}
+
 /// Returns an equivalent of a SETCC node but with the result the same width as
 /// the inputs. This can nalso be used for SELECT_CC if either the true or false
 /// values is a power of two while the other is zero.
@@ -2686,6 +2933,11 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
           Compare.getOpcode() == ISD::SELECT_CC) &&
          "An ISD::SETCC node required here.");
 
+  // Don't convert this comparison to a GPR sequence because there are uses
+  // of the i1 result (i.e. uses that require the result in the CR).
+  if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG))
+    return SDValue();
+
   SDValue LHS = Compare.getOperand(0);
   SDValue RHS = Compare.getOperand(1);
 
@@ -2694,30 +2946,35 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
   ISD::CondCode CC =
     cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
   EVT InputVT = LHS.getValueType();
-  if (InputVT != MVT::i32)
+  if (InputVT != MVT::i32 && InputVT != MVT::i64)
     return SDValue();
 
-  SDLoc dl(Compare);
-  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
-  int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
-
   if (ConvOpts == SetccInGPROpts::ZExtInvert ||
       ConvOpts == SetccInGPROpts::SExtInvert)
     CC = ISD::getSetCCInverse(CC, true);
 
-  if (ISD::isSignedIntSetCC(CC)) {
+  bool Inputs32Bit = InputVT == MVT::i32;
+  if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) {
     LHS = signExtendInputIfNeeded(LHS);
     RHS = signExtendInputIfNeeded(RHS);
-  } else if (ISD::isUnsignedIntSetCC(CC)) {
+  } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) {
     LHS = zeroExtendInputIfNeeded(LHS);
     RHS = zeroExtendInputIfNeeded(RHS);
   }
 
+  SDLoc dl(Compare);
+  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+  int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
   bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
     ConvOpts == SetccInGPROpts::SExtInvert;
-  if (IsSext)
+
+  if (IsSext && Inputs32Bit)
     return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
-  return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+  else if (Inputs32Bit)
+    return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+  else if (IsSext)
+    return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+  return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
 }
 
 void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
@@ -2906,6 +3163,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
   }
 
   case ISD::AND: {
+    if (tryLogicOpOfCompares(N))
+      return;
+
     unsigned Imm, Imm2, SH, MB, ME;
     uint64_t Imm64;
 
@@ -3025,6 +3285,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       if (tryBitfieldInsert(N))
         return;
 
+    if (tryLogicOpOfCompares(N))
+      return;
+
     short Imm;
     if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
         isIntS16Immediate(N->getOperand(1), Imm)) {
@@ -3042,6 +3305,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
+  case ISD::XOR: {
+    if (tryLogicOpOfCompares(N))
+      return;
+    break;
+  }
   case ISD::ADD: {
     short Imm;
     if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 216efcc4a1ee8..41ff9d903aa07 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadsPerMemcmp = 128;
+  } else {
+    MaxLoadsPerMemcmp = 8;
+    MaxLoadsPerMemcmpOptSize = 4;
   }
 }
 
@@ -1112,6 +1116,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::VPERM:           return "PPCISD::VPERM";
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
   case PPCISD::XXINSERT:        return "PPCISD::XXINSERT";
+  case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
   case PPCISD::CMPB:            return "PPCISD::CMPB";
   case PPCISD::Hi:              return "PPCISD::Hi";
@@ -1593,17 +1598,25 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
   return true;
 }
 
-  // Check that the mask is shuffling words
-static bool isWordShuffleMask(ShuffleVectorSDNode *N) {
-  for (unsigned i = 0; i < 4; ++i) {
-    unsigned B0 = N->getMaskElt(i*4);
-    unsigned B1 = N->getMaskElt(i*4+1);
-    unsigned B2 = N->getMaskElt(i*4+2);
-    unsigned B3 = N->getMaskElt(i*4+3);
-    if (B0 % 4)
-      return false;
-    if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1)
+// Check that the mask is shuffling N byte elements.
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) {
+  assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
+         "Unexpected element width.");
+
+  unsigned NumOfElem = 16 / Width;
+  unsigned MaskVal[16]; //  Width is never greater than 16
+  for (unsigned i = 0; i < NumOfElem; ++i) {
+    MaskVal[0] = N->getMaskElt(i * Width);
+    if (MaskVal[0] % Width) {
       return false;
+    }
+
+    for (unsigned int j = 1; j < Width; ++j) {
+      MaskVal[j] = N->getMaskElt(i * Width + j);
+      if (MaskVal[j] != MaskVal[j-1] + 1) {
+        return false;
+      }
+    }
   }
 
   return true;
@@ -1611,7 +1624,7 @@ static bool isWordShuffleMask(ShuffleVectorSDNode *N) {
 
 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                           unsigned &InsertAtByte, bool &Swap, bool IsLE) {
-  if (!isWordShuffleMask(N))
+  if (!isNByteElemShuffleMask(N, 4))
     return false;
 
   // Now we look at mask elements 0,4,8,12
@@ -1688,7 +1701,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                                bool &Swap, bool IsLE) {
   assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
   // Ensure each byte index of the word is consecutive.
-  if (!isWordShuffleMask(N))
+  if (!isNByteElemShuffleMask(N, 4))
     return false;
 
   // Now we look at mask elements 0,4,8,12, which are the beginning of words.
@@ -1746,6 +1759,66 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
   }
 }
 
+/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
+/// if the inputs to the instruction should be swapped and set \p DM to the
+/// value for the immediate.
+/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
+/// AND element 0 of the result comes from the first input (LE) or second input
+/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
+/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
+/// mask.
+bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
+                               bool &Swap, bool IsLE) {
+  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+
+  // Ensure each byte index of the double word is consecutive.
+  if (!isNByteElemShuffleMask(N, 8))
+    return false;
+
+  unsigned M0 = N->getMaskElt(0) / 8;
+  unsigned M1 = N->getMaskElt(8) / 8;
+  assert(((M0 | M1) < 4) && "A mask element out of bounds?");
+
+  // If both vector operands for the shuffle are the same vector, the mask will
+  // contain only elements from the first one and the second one will be undef.
+  if (N->getOperand(1).isUndef()) {
+    if ((M0 | M1) < 2) {
+      DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
+      Swap = false;
+      return true;
+    } else
+      return false;
+  }
+
+  if (IsLE) {
+    if (M0 > 1 && M1 < 2) {
+      Swap = false;
+    } else if (M0 < 2 && M1 > 1) {
+      M0 = (M0 + 2) % 4;
+      M1 = (M1 + 2) % 4;
+      Swap = true;
+    } else
+      return false;
+
+    // Note: if control flow comes here that means Swap is already set above
+    DM = (((~M1) & 1) << 1) + ((~M0) & 1);
+    return true;
+  } else { // BE
+    if (M0 < 2 && M1 > 1) {
+      Swap = false;
+    } else if (M0 > 1 && M1 < 2) {
+      M0 = (M0 + 2) % 4;
+      M1 = (M1 + 2) % 4;
+      Swap = true;
+    } else
+      return false;
+
+    // Note: if control flow comes here that means Swap is already set above
+    DM = (M0 << 1) + (M1 & 1);
+    return true;
+  }
+}
+
 
 /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
 /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
@@ -7760,6 +7833,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
   }
 
+  if (Subtarget.hasVSX() &&
+    PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
+    if (Swap)
+      std::swap(V1, V2);
+    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
+    SDValue Conv2 =
+        DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
+
+    SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
+                              DAG.getConstant(ShiftElts, dl, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
+  }
+
   if (Subtarget.hasVSX()) {
     if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
       int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 2f9eb95f6de68..7982a4a9e9fb7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -90,6 +90,10 @@ namespace llvm {
       ///
       VECSHL,
 
+      /// XXPERMDI - The PPC XXPERMDI instruction
+      ///
+      XXPERMDI,
+
       /// The CMPB instruction (takes two operands of i32 or i64).
       CMPB,
 
@@ -454,6 +458,10 @@ namespace llvm {
     /// for a XXSLDWI instruction.
     bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                               bool &Swap, bool IsLE);
+    /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
+    /// for a XXPERMDI instruction.
+    bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                              bool &Swap, bool IsLE);
 
     /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
     /// shift amount, otherwise return -1.
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 165970f9678c3..295590b2acf62 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -735,12 +735,12 @@ def RLDICL_32_64 : MDForm_1<30, 0,
                             "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
                             []>, isPPC64;
 // End fast-isel.
-let isCodeGenOnly = 1 in
-def RLDICL_32 : MDForm_1<30, 0,
-                         (outs gprc:$rA),
-                         (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
-                         "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
-                         []>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm RLDICL_32 : MDForm_1r<30, 0,
+                           (outs gprc:$rA),
+                           (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+                           "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                           []>, isPPC64;
 defm RLDICR : MDForm_1r<30, 1,
                         (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index fd6785e963a64..f3c68c443b1bc 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1983,3 +1983,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
     return &PPC::VSRCRegClass;
   return RC;
 }
+
+int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) {
+  return PPC::getRecordFormOpcode(Opcode);
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index b30d09e03ec47..8dd4dbb608794 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -290,6 +290,7 @@ public:
     return Reg >= PPC::V0 && Reg <= PPC::V31;
   }
   const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
+  static int getRecordFormOpcode(unsigned Opcode);
 };
 
 }
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 26b99eced23cb..8223aa655e38b 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
   SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
 ]>;
 
+def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>,
+  SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
 def SDT_PPCvcmp : SDTypeProfile<1, 3, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
 ]>;
@@ -170,6 +174,7 @@ def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
 def PPCvperm     : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
 def PPCxxsplt    : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
 def PPCxxinsert  : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCxxpermdi  : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
 def PPCvecshl    : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
 
 def PPCqvfperm   : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 1589ab03e5075..c4139ca8b7bdb 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -843,7 +843,9 @@ let Uses = [RM] in {
 
   def XXPERMDI : XX3Form_2<60, 10,
                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
-                       "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+                       "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm,
+                       [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB,
+                         imm32SExt16:$DM))]>;
   let isCodeGenOnly = 1 in
   def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
                              "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index ddae5befee3e4..b9004cc8a9f54 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -296,7 +296,7 @@ namespace {
 /// PPC Code Generator Pass Configuration Options.
 class PPCPassConfig : public TargetPassConfig {
 public:
-  PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+  PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   PPCTargetMachine &getPPCTargetMachine() const {
@@ -316,7 +316,7 @@ public:
 } // end anonymous namespace
 
 TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new PPCPassConfig(this, PM);
+  return new PPCPassConfig(*this, PM);
 }
 
 void PPCPassConfig::addIRPasses() {
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index f2838351cee56..b8f5a2083d808 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -55,6 +55,10 @@ public:
     const Triple &TT = getTargetTriple();
     return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
   };
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 /// PowerPC 32-bit target machine.
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 7ee1317bf72f2..5559cdc5fe467 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  MaxLoadSize = 8;
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
@@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
 }
 
 unsigned PPCTTIImpl::getCacheLineSize() {
-  // This is currently only used for the data prefetch pass which is only
-  // enabled for BG/Q by default.
-  return CacheLineSize;
+  // Check first if the user specified a custom line size.
+  if (CacheLineSize.getNumOccurrences() > 0)
+    return CacheLineSize;
+
+  // On P7, P8 or P9 we have a cache line size of 128.
+  unsigned Directive = ST->getDarwinDirective();
+  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
+      Directive == PPC::DIR_PWR9)
+    return 128;
+
+  // On other processors return a default of 64 bytes.
+  return 64;
 }
 
 unsigned PPCTTIImpl::getPrefetchDistance() {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 6ce70fbd8778e..2e0116fee04c6 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,7 @@ public:
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index a20331cd0a3ed..efdde04c582d0 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -56,5 +56,5 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
 }
 
 TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new TargetPassConfig(this, PM);
+  return new TargetPassConfig(*this, PM);
 }
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 1da4d36043045..49c67e0819f72 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -114,7 +114,7 @@ namespace {
 /// Sparc Code Generator Pass Configuration Options.
 class SparcPassConfig : public TargetPassConfig {
 public:
-  SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+  SparcPassConfig(SparcTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   SparcTargetMachine &getSparcTargetMachine() const {
@@ -128,7 +128,7 @@ public:
 } // namespace
 
 TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new SparcPassConfig(this, PM);
+  return new SparcPassConfig(*this, PM);
 }
 
 void SparcPassConfig::addIRPasses() {
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 48193fe095bed..faf714cbe2c98 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -40,6 +40,10 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 /// Sparc 32-bit target machine
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index ede5005fa4916..f30d52f859d75 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -119,7 +119,7 @@ namespace {
 /// SystemZ Code Generator Pass Configuration Options.
 class SystemZPassConfig : public TargetPassConfig {
 public:
-  SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+  SystemZPassConfig(SystemZTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   SystemZTargetMachine &getSystemZTargetMachine() const {
@@ -212,7 +212,7 @@ void SystemZPassConfig::addPreEmitPass() {
 }
 
 TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new SystemZPassConfig(this, PM);
+  return new SystemZPassConfig(*this, PM);
 }
 
 TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index a10ca64fa6329..eb2f17a2091c3 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -51,6 +51,8 @@ public:
   }
 
   bool targetSchedulesPostRAScheduling() const override { return true; };
+
+  bool isMachineVerifierClean() const override { return false; }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b974681fb6afa..d9b2b8743649e 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -129,7 +129,7 @@ namespace {
 /// WebAssembly Code Generator Pass Configuration Options.
 class WebAssemblyPassConfig final : public TargetPassConfig {
 public:
-  WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM)
+  WebAssemblyPassConfig(WebAssemblyTargetMachine &TM, PassManagerBase &PM)
       : TargetPassConfig(TM, PM) {}
 
   WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const {
@@ -154,7 +154,7 @@ TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
 
 TargetPassConfig *
 WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new WebAssemblyPassConfig(this, PM);
+  return new WebAssemblyPassConfig(*this, PM);
 }
 
 FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 6e062ec59347b..b5a926f915afa 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -587,6 +587,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
   case X86::VPSLLDQZ256rr:
   case X86::VPSLLDQZ512rr:
     Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
   case X86::VPSLLDQZ128rm:
   case X86::VPSLLDQZ256rm:
   case X86::VPSLLDQZ512rm:
@@ -604,6 +605,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
   case X86::VPSRLDQZ256rr:
   case X86::VPSRLDQZ512rr:
     Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
   case X86::VPSRLDQZ128rm:
   case X86::VPSRLDQZ256rm:
   case X86::VPSRLDQZ512rm:
@@ -1091,6 +1093,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
   CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
   CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
   CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
   CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
     DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
@@ -1099,6 +1102,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
   CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
   CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
   CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
   CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
     DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 313920e02c3ed..5582526541bae 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -123,18 +123,26 @@ namespace {
     EdgeBundles *Bundles;
 
     // Return a bitmask of FP registers in block's live-in list.
-    static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+    static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
       unsigned Mask = 0;
-      for (const auto &LI : MBB->liveins()) {
-        if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6)
-          continue;
-        Mask |= 1 << (LI.PhysReg - X86::FP0);
+      for (MachineBasicBlock::livein_iterator I = MBB->livein_begin();
+           I != MBB->livein_end(); ) {
+        MCPhysReg Reg = I->PhysReg;
+        static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums");
+        if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+          Mask |= 1 << (Reg - X86::FP0);
+          if (RemoveFPs) {
+            I = MBB->removeLiveIn(I);
+            continue;
+          }
+        }
+        ++I;
       }
       return Mask;
     }
 
     // Partition all the CFG edges into LiveBundles.
-    void bundleCFG(MachineFunction &MF);
+    void bundleCFGRecomputeKillFlags(MachineFunction &MF);
 
     MachineBasicBlock *MBB;     // Current basic block
 
@@ -327,7 +335,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
 
   // Prepare cross-MBB liveness.
-  bundleCFG(MF);
+  bundleCFGRecomputeKillFlags(MF);
 
   StackTop = 0;
 
@@ -375,13 +383,15 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
 /// registers live-out from a block is identical to the live-in set of all
 /// successors. This is not enforced by the normal live-in lists since
 /// registers may be implicitly defined, or not used by all successors.
-void FPS::bundleCFG(MachineFunction &MF) {
+void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) {
   assert(LiveBundles.empty() && "Stale data in LiveBundles");
   LiveBundles.resize(Bundles->getNumBundles());
 
   // Gather the actual live-in masks for all MBBs.
   for (MachineBasicBlock &MBB : MF) {
-    const unsigned Mask = calcLiveInMask(&MBB);
+    setKillFlags(MBB);
+
+    const unsigned Mask = calcLiveInMask(&MBB, false);
     if (!Mask)
       continue;
     // Update MBB ingoing bundle mask.
@@ -396,7 +406,6 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
   bool Changed = false;
   MBB = &BB;
 
-  setKillFlags(BB);
   setupBlockStack();
 
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
@@ -453,6 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
       unsigned Reg = DeadRegs[i];
       // Check if Reg is live on the stack. An inline-asm register operand that
       // is in the clobber list and marked dead might not be live on the stack.
+      static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
       if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
         DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
         freeStackSlotAfter(I, Reg-X86::FP0);
@@ -506,7 +516,6 @@ void FPS::setupBlockStack() {
 
   // Push the fixed live-in registers.
   for (unsigned i = Bundle.FixCount; i > 0; --i) {
-    MBB->addLiveIn(X86::ST0+i-1);
     DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
                  << unsigned(Bundle.FixStack[i-1]) << '\n');
     pushReg(Bundle.FixStack[i-1]);
@@ -515,7 +524,8 @@ void FPS::setupBlockStack() {
   // Kill off unwanted live-ins. This can happen with a critical edge.
   // FIXME: We could keep these live registers around as zombies. They may need
   // to be revived at the end of a short block. It might save a few instrs.
-  adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+  unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true);
+  adjustLiveRegs(Mask, MBB->begin());
   DEBUG(MBB->dump());
 }
 
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 331e56976db7c..328a80304602f 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1062,6 +1062,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (HasFP) {
+    assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");
+
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
     // If required, include space for extra hidden slot for stashing base pointer.
@@ -1124,13 +1126,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
                                     nullptr, DwarfFramePtr));
       }
     }
-
-    // Mark the FramePtr as live-in in every block. Don't do this again for
-    // funclet prologues.
-    if (!IsFunclet) {
-      for (MachineBasicBlock &EveryMBB : MF)
-        EveryMBB.addLiveIn(MachineFramePtr);
-    }
   } else {
     assert(!IsFunclet && "funclets without FPs not yet implemented");
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index c899f0fd5100e..2a1633de0a239 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -418,8 +418,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
     case X86ISD::XOR:
     case X86ISD::OR:
     case ISD::ADD:
-    case ISD::ADDC:
-    case ISD::ADDE:
     case ISD::ADDCARRY:
     case ISD::AND:
     case ISD::OR:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8d78308afe9df..0a41f35f93208 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,3 +1,4 @@
+
 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -80,12 +81,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
              " of the loop header PC will be 0)."),
     cl::Hidden);
 
-static cl::opt<bool> MulConstantOptimization(
-    "mul-constant-optimization", cl::init(true),
-    cl::desc("Replace 'mul x, Const' with more effective instructions like "
-             "SHIFT, LEA, etc."),
-    cl::Hidden);
-
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -317,16 +312,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UREM, VT, Expand);
   }
 
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
-    if (VT == MVT::i64 && !Subtarget.is64Bit())
-      continue;
-    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
-    setOperationAction(ISD::ADDC, VT, Custom);
-    setOperationAction(ISD::ADDE, VT, Custom);
-    setOperationAction(ISD::SUBC, VT, Custom);
-    setOperationAction(ISD::SUBE, VT, Custom);
-  }
-
   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
@@ -428,7 +413,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       continue;
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::SETCC,  VT, Custom);
-    setOperationAction(ISD::SETCCE, VT, Custom);
   }
   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
@@ -1583,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // Support carry in as value rather than glue.
     setOperationAction(ISD::ADDCARRY, VT, Custom);
     setOperationAction(ISD::SUBCARRY, VT, Custom);
+    setOperationAction(ISD::SETCCCARRY, VT, Custom);
   }
 
   if (!Subtarget.is64Bit()) {
@@ -16304,6 +16289,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
     case ISD::SHL:
       if (Op.getNode()->getFlags().hasNoSignedWrap())
         break;
+      LLVM_FALLTHROUGH;
     default:
       NeedOF = true;
       break;
@@ -17167,17 +17153,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
 
   switch (SetCCOpcode) {
   default: llvm_unreachable("Unexpected SETCC condition");
-  case ISD::SETNE:  Invert = true;
+  case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
-  case ISD::SETLT:  Swap = true;
+  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
-  case ISD::SETGE:  Swap = true;
+  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                     Invert = true; break;
-  case ISD::SETULT: Swap = true;
+  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                     FlipSigns = true; break;
-  case ISD::SETUGE: Swap = true;
+  case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
   case ISD::SETULE: Opc = X86ISD::PCMPGT;
                     FlipSigns = true; Invert = true; break;
   }
@@ -17398,19 +17384,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return SetCC;
 }
 
-SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Carry = Op.getOperand(2);
   SDValue Cond = Op.getOperand(3);
   SDLoc DL(Op);
 
-  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
 
-  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+  // Recreate the carry if needed.
+  EVT CarryVT = Carry.getValueType();
+  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
+
   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
-  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
   SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
   if (Op.getSimpleValueType() == MVT::i1)
     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
@@ -23269,32 +23260,6 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   return Op;
 }
 
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getNode()->getSimpleValueType(0);
-
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
-  unsigned Opc;
-  bool ExtraOp = false;
-  switch (Op.getOpcode()) {
-  default: llvm_unreachable("Invalid code");
-  case ISD::ADDC: Opc = X86ISD::ADD; break;
-  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
-  case ISD::SUBC: Opc = X86ISD::SUB; break;
-  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
-  }
-
-  if (!ExtraOp)
-    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
-                       Op.getOperand(1));
-  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
-                     Op.getOperand(1), Op.getOperand(2));
-}
-
 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
   SDNode *N = Op.getNode();
   MVT VT = N->getSimpleValueType(0);
@@ -23785,7 +23750,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
-  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
+  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
@@ -23830,10 +23795,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
-  case ISD::ADDC:
-  case ISD::ADDE:
-  case ISD::SUBC:
-  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
   case ISD::ADD:
@@ -28946,12 +28907,118 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }
 
+// Try to match patterns such as
+// (i16 bitcast (v16i1 x))
+// ->
+// (i16 movmsk (16i8 sext (v16i1 x)))
+// before the illegal vector is scalarized on subtargets that don't have legal
+// vxi1 types.
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+                                  const X86Subtarget &Subtarget) {
+  EVT VT = BitCast.getValueType();
+  SDValue N0 = BitCast.getOperand(0);
+  EVT VecVT = N0->getValueType(0);
+
+  if (!VT.isScalarInteger() || !VecVT.isSimple())
+    return SDValue();
+
+  // With AVX512 vxi1 types are legal and we prefer using k-regs.
+  // MOVMSK is supported in SSE2 or later.
+  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
+    return SDValue();
+
+  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
+  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
+  // v8i16 and v16i16.
+  // For these two cases, we can shuffle the upper element bytes to a
+  // consecutive sequence at the start of the vector and treat the results as
+  // v16i8 or v32i8, and for v61i8 this is the prefferable solution. However,
+  // for v16i16 this is not the case, because the shuffle is expensive, so we
+  // avoid sign-exteding to this type entirely.
+  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
+  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
+  MVT SExtVT;
+  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+  switch (VecVT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v2i1:
+    SExtVT = MVT::v2i64;
+    FPCastVT = MVT::v2f64;
+    break;
+  case MVT::v4i1:
+    SExtVT = MVT::v4i32;
+    FPCastVT = MVT::v4f32;
+    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
+    // sign-extend to a 256-bit operation to avoid truncation.
+    if (N0->getOpcode() == ISD::SETCC &&
+        N0->getOperand(0)->getValueType(0).is256BitVector() &&
+        Subtarget.hasInt256()) {
+      SExtVT = MVT::v4i64;
+      FPCastVT = MVT::v4f64;
+    }
+    break;
+  case MVT::v8i1:
+    SExtVT = MVT::v8i16;
+    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
+    // sign-extend to a 256-bit operation to match the compare.
+    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
+    // 256-bit because the shuffle is cheaper than sign extending the result of
+    // the compare.
+    if (N0->getOpcode() == ISD::SETCC &&
+        N0->getOperand(0)->getValueType(0).is256BitVector() &&
+        Subtarget.hasInt256()) {
+      SExtVT = MVT::v8i32;
+      FPCastVT = MVT::v8f32;
+    }
+    break;
+  case MVT::v16i1:
+    SExtVT = MVT::v16i8;
+    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
+    // it is not profitable to sign-extend to 256-bit because this will
+    // require an extra cross-lane shuffle which is more exprensive than
+    // truncating the result of the compare to 128-bits.
+    break;
+  case MVT::v32i1:
+    // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
+    if (!Subtarget.hasInt256())
+      return SDValue();
+    SExtVT = MVT::v32i8;
+    break;
+  };
+
+  SDLoc DL(BitCast);
+  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
+  if (SExtVT == MVT::v8i16) {
+    V = DAG.getBitcast(MVT::v16i8, V);
+    V = DAG.getVectorShuffle(
+        MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
+        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+  } else
+    assert(SExtVT.getScalarType() != MVT::i16 &&
+           "Vectors of i16 must be shuffled");
+  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+    V = DAG.getBitcast(FPCastVT, V);
+  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+  return DAG.getZExtOrTrunc(V, DL, VT);
+}
+
 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
 
+  // Try to match patterns such as
+  // (i16 bitcast (v16i1 x))
+  // ->
+  // (i16 movmsk (16i8 sext (v16i1 x)))
+  // before the setcc result is scalarized on subtargets that don't have legal
+  // vxi1 types.
+  if (DCI.isBeforeLegalize())
+    if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+      return V;
   // Since MMX types are special and don't usually play with other vector types,
   // it's better to handle them early to be sure we emit efficient code by
   // avoiding store-load conversions.
@@ -29944,6 +30011,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         // Converting this to a min would handle both negative zeros and NaNs
         // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
       case ISD::SETOLT:
       case ISD::SETLT:
       case ISD::SETLE:
@@ -29974,6 +30042,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         // Converting this to a max would handle both negative zeros and NaNs
         // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
       case ISD::SETOGT:
       case ISD::SETGT:
       case ISD::SETGE:
@@ -30008,6 +30077,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         // Converting this to a min would handle both negative zeros and NaNs
         // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
       case ISD::SETOGT:
       case ISD::SETGT:
       case ISD::SETGE:
@@ -30036,6 +30106,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         // Converting this to a max would handle both negative zeros and NaNs
         // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
       case ISD::SETOLT:
       case ISD::SETLT:
       case ISD::SETLE:
@@ -30933,75 +31004,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   }
 }
 
-static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
-                                 EVT VT, SDLoc DL) {
-
-  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
-    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
-                                 DAG.getConstant(Mult, DL, VT));
-    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
-                         DAG.getConstant(Shift, DL, MVT::i8));
-    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
-                         Result);
-    return Result;
-  };
-
-  auto combineMulMulAddOrSub = [&](bool isAdd) {
-    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
-                                 DAG.getConstant(9, DL, VT));
-    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
-    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
-                         Result);
-    return Result;
-  };
-
-  switch (MulAmt) {
-  default:
-    break;
-  case 11:
-    // mul x, 11 => add ((shl (mul x, 5), 1), x)
-    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
-  case 21:
-    // mul x, 21 => add ((shl (mul x, 5), 2), x)
-    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
-  case 22:
-    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
-    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
-                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
-  case 19:
-    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
-    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
-  case 13:
-    // mul x, 13 => add ((shl (mul x, 3), 2), x)
-    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
-  case 23:
-    // mul x, 13 => sub ((shl (mul x, 3), 3), x)
-    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
-  case 14:
-    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
-    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
-                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
-  case 26:
-    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
-    return combineMulMulAddOrSub(/*isAdd*/ false);
-  case 28:
-    // mul x, 28 => add ((mul (mul x, 9), 3), x)
-    return combineMulMulAddOrSub(/*isAdd*/ true);
-  case 29:
-    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
-    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
-                       combineMulMulAddOrSub(/*isAdd*/ true));
-  case 30:
-    // mul x, 30 => sub (sub ((shl x, 5), x), x)
-    return DAG.getNode(
-        ISD::SUB, DL, VT, N->getOperand(0),
-        DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0),
-                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
-                                DAG.getConstant(5, DL, MVT::i8))));
-  }
-  return SDValue();
-}
-
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -31011,8 +31013,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 
-  if (!MulConstantOptimization)
-    return SDValue();
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -31068,8 +31068,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
     else
       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                            DAG.getConstant(MulAmt2, DL, VT));
-  } else if (!Subtarget.slowLEA())
-    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
+  }
 
   if (!NewMul) {
     assert(MulAmt != 0 &&
@@ -34558,8 +34557,7 @@ static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
             isOneConstant(Carry.getOperand(1))))
       Carry = Carry.getOperand(0);
 
-    if (Carry.getOpcode() == ISD::SETCC ||
-        Carry.getOpcode() == X86ISD::SETCC ||
+    if (Carry.getOpcode() == X86ISD::SETCC ||
         Carry.getOpcode() == X86ISD::SETCC_CARRY) {
       if (Carry.getConstantOperandVal(0) == X86::COND_B)
         return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
@@ -35126,7 +35124,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::VSELECT:
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
-  case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
+  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
@@ -35510,6 +35508,7 @@ TargetLowering::ConstraintWeight
   switch (*constraint) {
   default:
     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    LLVM_FALLTHROUGH;
   case 'R':
   case 'q':
   case 'Q':
@@ -35861,6 +35860,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
           return std::make_pair(0U, &X86::GR64RegClass);
         break;
       }
+      LLVM_FALLTHROUGH;
       // 32-bit fallthrough
     case 'Q':   // Q_REGS
       if (VT == MVT::i32 || VT == MVT::f32)
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 18106c2eb3941..f51b6641db2fb 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1163,7 +1163,7 @@ namespace llvm {
     SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
                       SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 33fbd41bb6318..0aee30081a35c 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -195,6 +195,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
       // It's not always legal to reference the low 8-bit of the larger
       // register in 32-bit mode.
       return false;
+    LLVM_FALLTHROUGH;
   case X86::MOVSX32rr16:
   case X86::MOVZX32rr16:
   case X86::MOVSX64rr16:
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 53a8e83b36fca..cb21f1bd7706f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -323,7 +323,7 @@ namespace {
 /// X86 Code Generator Pass Configuration Options.
 class X86PassConfig : public TargetPassConfig {
 public:
-  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+  X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   X86TargetMachine &getX86TargetMachine() const {
@@ -369,7 +369,7 @@ INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
                 "X86 Execution Dependency Fix", false, false)
 
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new X86PassConfig(this, PM);
+  return new X86PassConfig(*this, PM);
 }
 
 void X86PassConfig::addIRPasses() {
@@ -433,6 +433,7 @@ bool X86PassConfig::addPreISel() {
 
 void X86PassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
+    addPass(&LiveRangeShrinkID);
     addPass(createX86FixupSetCC());
     addPass(createX86OptimizeLEAs());
     addPass(createX86CallFrameOptimization());
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index cf933f52604ef..1bf267d34ec2c 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -49,6 +49,10 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 2950e2efbea3e..1a1cbd4748887 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -54,7 +54,7 @@ namespace {
 /// XCore Code Generator Pass Configuration Options.
 class XCorePassConfig : public TargetPassConfig {
 public:
-  XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+  XCorePassConfig(XCoreTargetMachine &TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   XCoreTargetMachine &getXCoreTargetMachine() const {
@@ -70,7 +70,7 @@ public:
 } // end anonymous namespace
 
 TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new XCorePassConfig(this, PM);
+  return new XCorePassConfig(*this, PM);
 }
 
 void XCorePassConfig::addIRPasses() {
author	Dimitry Andric <dim@FreeBSD.org>	2017-06-01 20:58:36 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2017-06-01 20:58:36 +0000
commit	f382538d471e38a9b98f016c4caebd24c8d60b62 (patch)
tree	d30f3d58b1044b5355d50c17a6a96c6a0b35703a /lib/Target
parent	ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff)
download	src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.tar.gz src-test-f382538d471e38a9b98f016c4caebd24c8d60b62.zip