[ARM] Optimize {s,u}{add,sub}.with.overflow.
The ARM backend contains code that tries to optimize compares by replacing them with an existing instruction that sets the flags the same way. This patch allows it to replace a "cmp" with an "adds", generalizing the code that replaces "cmp" with "sub". It also heuristically disables sinking of instructions that could potentially be used to replace compares (currently only if they're next to each other).
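
As a rough before/after sketch of the resulting code (based on the uadd test update in this commit; r0/r1 follow the AAPCS argument convention), the overflow check now reads the flags of the flag-setting add instead of a separate compare:

    @ Before: the sum is compared against one addend to detect unsigned overflow.
        adds  r2, r0, r1
        cmp   r2, r0
        movhs pc, lr          @ return if no overflow (r2 >= r0)

    @ After: the flag-setting adds makes the cmp redundant.
        adds  r0, r0, r1
        movlo pc, lr          @ return if no overflow (carry clear)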

Differential revision: https://reviews.llvm.org/D38378

llvm-svn: 322737
jgalenson committed Jan 17, 2018
1 parent 727f153 commit fe7fa40
Showing 4 changed files with 130 additions and 47 deletions.
97 changes: 74 additions & 23 deletions llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2534,14 +2534,28 @@ inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
}
}

/// getCmpToAddCondition - assume the flags are set by CMP(a,b), return
/// the condition code if we modify the instructions such that flags are
/// set by ADD(a,b,X).
inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
switch (CC) {
default: return ARMCC::AL;
case ARMCC::HS: return ARMCC::LO;
case ARMCC::LO: return ARMCC::HS;
case ARMCC::VS: return ARMCC::VS;
case ARMCC::VC: return ARMCC::VC;
}
}

/// isRedundantFlagInstr - check whether the first instruction, whose only
/// purpose is to update flags, can be made redundant.
/// CMPrr can be made redundant by SUBrr if the operands are the same.
/// CMPri can be made redundant by SUBri if the operands are the same.
/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
/// This function can be extended later on.
inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
unsigned SrcReg2, int ImmValue,
MachineInstr *OI) {
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
unsigned SrcReg, unsigned SrcReg2,
int ImmValue, const MachineInstr *OI) {
if ((CmpI->getOpcode() == ARM::CMPrr ||
CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::SUBrr ||
@@ -2559,6 +2573,14 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
OI->getOperand(1).getReg() == SrcReg &&
OI->getOperand(2).getImm() == ImmValue)
return true;

if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr ||
OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) &&
OI->getOperand(0).isReg() && OI->getOperand(1).isReg() &&
OI->getOperand(0).getReg() == SrcReg &&
OI->getOperand(1).getReg() == SrcReg2)
return true;
return false;
}

@@ -2661,17 +2683,18 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
if (I == B) return false;

// There are two possible candidates which can be changed to set CPSR:
// One is MI, the other is a SUB instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
// One is MI, the other is a SUB or ADD instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2), SUB(r2,r1), or
// ADDr[ri](r1, r2, X).
// For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
MachineInstr *Sub = nullptr;
MachineInstr *SubAdd = nullptr;
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
// Conservatively refuse to convert an instruction which isn't in the same
// BB as the comparison.
// For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
// For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate.
// Thus we cannot return here.
if (CmpInstr.getOpcode() == ARM::CMPri ||
CmpInstr.getOpcode() == ARM::t2CMPri)
@@ -2713,38 +2736,43 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
}
I = CmpInstr;
E = MI;
} else {
// Allow the loop below to search E (which was initially MI). Since MI and
// SubAdd have different tests, even if that instruction could not be MI, it
// could still potentially be SubAdd.
--E;
}

// Check that CPSR isn't set between the comparison instruction and the one we
// want to change. At the same time, search for Sub.
// want to change. At the same time, search for SubAdd.
const TargetRegisterInfo *TRI = &getRegisterInfo();
--I;
for (; I != E; --I) {
const MachineInstr &Instr = *I;

// Check whether CmpInstr can be made redundant by the current instruction.
if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
SubAdd = &*I;
break;
}

if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
Instr.readsRegister(ARM::CPSR, TRI))
// This instruction modifies or uses CPSR after the one we want to
// change. We can't do this transformation.
return false;

// Check whether CmpInstr can be made redundant by the current instruction.
if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
Sub = &*I;
break;
}

if (I == B)
// The 'and' is below the comparison instruction.
return false;
}

// Return false if no candidates exist.
if (!MI && !Sub)
if (!MI && !SubAdd)
return false;

// The single candidate is called MI.
if (!MI) MI = Sub;
if (!MI) MI = SubAdd;

// We can't use a predicated instruction - it doesn't always write the flags.
if (isPredicated(*MI))
@@ -2802,25 +2830,31 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
break;
}

if (Sub) {
ARMCC::CondCodes NewCC = getSwappedCondition(CC);
if (NewCC == ARMCC::AL)
return false;
if (SubAdd) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
// on CMP needs to be updated to be based on SUB.
// If we have ADD(r1, r2, X) and CMP(r1, r2), the condition code also
// needs to be modified.
// Push the condition code operands to OperandsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// operands will be modified.
if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
Sub->getOperand(2).getReg() == SrcReg) {
unsigned Opc = SubAdd->getOpcode();
bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
Opc == ARM::SUBri || Opc == ARM::t2SUBri;
if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 &&
SubAdd->getOperand(2).getReg() == SrcReg)) {
// VSel doesn't support condition code update.
if (IsInstrVSel)
return false;
// Ensure we can swap the condition.
ARMCC::CondCodes NewCC = (IsSub ? getSwappedCondition(CC) : getCmpToAddCondition(CC));
if (NewCC == ARMCC::AL)
return false;
OperandsToUpdate.push_back(
std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
}
} else {
// No Sub, so this is x = <op> y, z; cmp x, 0.
// No SubAdd, so this is x = <op> y, z; cmp x, 0.
switch (CC) {
case ARMCC::EQ: // Z
case ARMCC::NE: // Z
@@ -2874,6 +2908,23 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
return true;
}

bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
// Do not sink MI if it might be used to optimize a redundant compare.
// We heuristically only look at the instruction immediately following MI to
// avoid potentially searching the entire basic block.
if (isPredicated(MI))
return true;
MachineBasicBlock::const_iterator Next = &MI;
++Next;
unsigned SrcReg, SrcReg2;
int CmpMask, CmpValue;
if (Next != MI.getParent()->end() &&
analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI))
return false;
return true;
}

bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg,
MachineRegisterInfo *MRI) const {
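The condition remapping done by getCmpToAddCondition above can be read off the pattern isRedundantFlagInstr now matches, where a compare tests an add's result against the add's first operand. A minimal sketch, with registers chosen purely for illustration:

    @ d = a + b, then the compare checks the sum against the first addend:
        adds  r2, r0, r1      @ carry is set exactly when a + b wraps (unsigned overflow)
        cmp   r2, r0          @ HS: d >= a, i.e. no overflow; LO: d < a, i.e. overflow
    @ Once the cmp is dropped and the flags come from the adds, HS and LO swap
    @ (carry clear now means "no overflow"), while VS/VC are unchanged because
    @ d - a and a + b overflow signed arithmetic under the same condition.
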
2 changes: 2 additions & 0 deletions llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -215,6 +215,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {

bool expandPostRAPseudo(MachineInstr &MI) const override;

bool shouldSink(const MachineInstr &MI) const override;

void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
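The shouldSink override declared above keeps a flag-producing candidate next to the compare it could replace. A hypothetical sequence (registers invented for the sketch) showing what sinking would otherwise break:

    @ Kept adjacent, the cmp can later be folded into a flag-setting subs:
        sub   r2, r0, r1      @ result only needed in a later block
        cmp   r0, r1          @ sets the same flags a subs r2, r0, r1 would
    @ If the sub were sunk towards its user in another block, the cmp would have
    @ to stay, because optimizeCompareInstr only searches within the compare's
    @ basic block for an instruction that makes it redundant.
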
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/ARM/intrinsics-overflow.ll
@@ -33,10 +33,10 @@ define i32 @sadd_overflow(i32 %a, i32 %b) #0 {

; CHECK-LABEL: sadd_overflow:

; ARM: add r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
; ARM: mov r[[R1]], #1
; ARM: cmp r[[R2]], r[[R0]]
; ARM: movvc r[[R1]], #0
; ARM: adds r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
; ARM: mov r[[R0]], #1
; ARM: movvc r[[R0]], #0
; ARM: mov pc, lr

; THUMBV6: mov r[[R2:[0-9]+]], r[[R0:[0-9]+]]
; THUMBV6: adds r[[R3:[0-9]+]], r[[R2]], r[[R1:[0-9]+]]
@@ -47,11 +47,10 @@ define i32 @sadd_overflow(i32 %a, i32 %b) #0 {
; THUMBV6: mov r[[R0]], r[[R1]]
; THUMBV6: .L[[LABEL]]:

; THUMBV7: movs r[[R1]], #1
; THUMBV7: cmp r[[R2]], r[[R0]]
; THUMBV7: adds r[[R2:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
; THUMBV7: mov.w r[[R0:[0-9]+]], #1
; THUMBV7: it vc
; THUMBV7: movvc r[[R1]], #0
; THUMBV7: mov r[[R0]], r[[R1]]
; THUMBV7: movvc r[[R0]], #0
}

define i32 @usub_overflow(i32 %a, i32 %b) #0 {
63 changes: 47 additions & 16 deletions llvm/test/CodeGen/ARM/su-addsub-overflow.ll
@@ -2,9 +2,7 @@

define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 {
; CHECK-LABEL: sadd:
; CHECK: mov r[[R0:[0-9]+]], r0
; CHECK-NEXT: add r[[R1:[0-9]+]], r[[R0]], r1
; CHECK-NEXT: cmp r[[R1]], r[[R0]]
; CHECK: adds r0, r0, r1
; CHECK-NEXT: movvc pc, lr
entry:
%0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -23,10 +21,8 @@ cont:

define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 {
; CHECK-LABEL: uadd:
; CHECK: mov r[[R0:[0-9]+]], r0
; CHECK-NEXT: adds r[[R1:[0-9]+]], r[[R0]], r1
; CHECK-NEXT: cmp r[[R1]], r[[R0]]
; CHECK-NEXT: movhs pc, lr
; CHECK: adds r0, r0, r1
; CHECK-NEXT: movlo pc, lr
entry:
%0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%1 = extractvalue { i32, i1 } %0, 1
@@ -44,8 +40,7 @@ cont:

define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 {
; CHECK-LABEL: ssub:
; CHECK: cmp r0, r1
; CHECK-NEXT: subvc r0, r0, r1
; CHECK: subs r0, r0, r1
; CHECK-NEXT: movvc pc, lr
entry:
%0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -64,9 +59,7 @@ cont:

define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 {
; CHECK-LABEL: usub:
; CHECK: mov r[[R0:[0-9]+]], r0
; CHECK-NEXT: subs r[[R1:[0-9]+]], r[[R0]], r1
; CHECK-NEXT: cmp r[[R0]], r1
; CHECK: subs r0, r0, r1
; CHECK-NEXT: movhs pc, lr
entry:
%0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
Expand All @@ -87,11 +80,9 @@ define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 {
; CHECK-LABEL: sum:
; CHECK: ldr [[R0:r[0-9]+]],
; CHECK-NEXT: ldr [[R1:r[0-9]+|lr]],
; CHECK-NEXT: add [[R2:r[0-9]+]], [[R1]], [[R0]]
; CHECK-NEXT: cmp [[R2]], [[R1]]
; CHECK-NEXT: adds [[R2:r[0-9]+]], [[R1]], [[R0]]
; CHECK-NEXT: strvc [[R2]],
; CHECK-NEXT: addvc
; CHECK-NEXT: cmpvc
; CHECK-NEXT: addsvc
; CHECK-NEXT: bvs
entry:
%cmp7 = icmp eq i32 %n, 0
@@ -128,6 +119,46 @@ cont2:

}

define void @extern_loop(i32 %n) local_unnamed_addr #0 {
; Do not replace the compare around the clobbering call.
; CHECK: add {{r[0-9]+}}, {{r[0-9]+}}, #1
; CHECK-NEXT: bl external_fn
; CHECK: cmp
entry:
%0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %n, i32 1)
%1 = extractvalue { i32, i1 } %0, 1
br i1 %1, label %trap, label %cont.lr.ph

cont.lr.ph:
%2 = extractvalue { i32, i1 } %0, 0
%cmp5 = icmp sgt i32 %2, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
br label %for.body

trap:
tail call void @llvm.trap() #2
unreachable

for.cond.cleanup:
ret void

for.body:
%i.046 = phi i32 [ %5, %cont1 ], [ 0, %for.body.preheader ]
tail call void bitcast (void (...)* @external_fn to void ()*)() #4
%3 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.046, i32 1)
%4 = extractvalue { i32, i1 } %3, 1
br i1 %4, label %trap, label %cont1

cont1:
%5 = extractvalue { i32, i1 } %3, 0
%cmp = icmp slt i32 %5, %2
br i1 %cmp, label %for.body, label %for.cond.cleanup
}

declare void @external_fn(...) local_unnamed_addr #0

declare void @llvm.trap() #2
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
