Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 9424bb9

Browse files
committed
[AArch64] Emit CSR loads in the same order as stores
Optionally allow the order of restoring the callee-saved registers in the epilogue to be reversed. The flag -reverse-csr-restore-seq generates the following code: ``` stp x26, x25, [sp, #-64]! stp x24, x23, [sp, #16] stp x22, x21, [sp, #32] stp x20, x19, [sp, #48] ; [..] ldp x24, x23, [sp, #16] ldp x22, x21, [sp, #32] ldp x20, x19, [sp, #48] ldp x26, x25, [sp], #64 ret ``` Note how the CSRs are restored in the same order as they are saved. One exception to this rule is the last `ldp`, which allows us to merge the stack adjustment and the ldp into a post-index ldp. This is done by first generating: ldp x26, x25, [sp] add sp, sp, #64 which gets merged by the arm64 load store optimizer into ldp x26, x25, [sp], #64 The flag is disabled by default. rdar://problem/33759434 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327569 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 915f319 commit 9424bb9

File tree

2 files changed

+141
-14
lines changed

2 files changed

+141
-14
lines changed

lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 70 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
140140
cl::desc("enable use of redzone on AArch64"),
141141
cl::init(false), cl::Hidden);
142142

143+
static cl::opt<bool>
144+
ReverseCSRRestoreSeq("reverse-csr-restore-seq",
145+
cl::desc("reverse the CSR restore sequence"),
146+
cl::init(false), cl::Hidden);
147+
143148
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
144149

145150
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -843,14 +848,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
843848
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
844849
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
845850

851+
uint64_t AfterCSRPopSize = ArgumentPopSize;
846852
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
847853
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
848-
849-
if (!CombineSPBump && PrologueSaveSize != 0)
850-
convertCalleeSaveRestoreToSPPrePostIncDec(
851-
MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
854+
// Assume we can't combine the last pop with the sp restore.
855+
856+
if (!CombineSPBump && PrologueSaveSize != 0) {
857+
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
858+
// Converting the last ldp to a post-index ldp is valid only if the last
859+
// ldp's offset is 0.
860+
const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
861+
// If the offset is 0, convert it to a post-index ldp.
862+
if (OffsetOp.getImm() == 0) {
863+
convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
864+
PrologueSaveSize);
865+
} else {
866+
// If not, make sure to emit an add after the last ldp.
867+
// We're doing this by transferring the size to be restored from the
868+
// adjustment *before* the CSR pops to the adjustment *after* the CSR
869+
// pops.
870+
AfterCSRPopSize += PrologueSaveSize;
871+
}
872+
}
852873

853874
// Move past the restores of the callee-saved registers.
875+
// If we plan on combining the sp bump of the local stack size and the callee
876+
// save stack size, we might need to adjust the CSR save and restore offsets.
854877
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
855878
MachineBasicBlock::iterator Begin = MBB.begin();
856879
while (LastPopI != Begin) {
@@ -865,7 +888,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
865888
// If there is a single SP update, insert it before the ret and we're done.
866889
if (CombineSPBump) {
867890
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
868-
NumBytes + ArgumentPopSize, TII,
891+
NumBytes + AfterCSRPopSize, TII,
869892
MachineInstr::FrameDestroy);
870893
return;
871894
}
@@ -877,18 +900,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
877900
bool RedZone = canUseRedZone(MF);
878901
// If this was a redzone leaf function, we don't need to restore the
879902
// stack pointer (but we may need to pop stack args for fastcc).
880-
if (RedZone && ArgumentPopSize == 0)
903+
if (RedZone && AfterCSRPopSize == 0)
881904
return;
882905

883906
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
884907
int StackRestoreBytes = RedZone ? 0 : NumBytes;
885908
if (NoCalleeSaveRestore)
886-
StackRestoreBytes += ArgumentPopSize;
909+
StackRestoreBytes += AfterCSRPopSize;
887910
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
888911
StackRestoreBytes, TII, MachineInstr::FrameDestroy);
889912
// If we were able to combine the local stack pop with the argument pop,
890913
// then we're done.
891-
if (NoCalleeSaveRestore || ArgumentPopSize == 0)
914+
if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
892915
return;
893916
NumBytes = 0;
894917
}
@@ -908,9 +931,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
908931
// This must be placed after the callee-save restore code because that code
909932
// assumes the SP is at the same location as it was after the callee-save save
910933
// code in the prologue.
911-
if (ArgumentPopSize)
934+
if (AfterCSRPopSize) {
935+
// Sometimes (when we restore in the same order as we save), we can end up
936+
// with code like this:
937+
//
938+
// ldp x26, x25, [sp]
939+
// ldp x24, x23, [sp, #16]
940+
// ldp x22, x21, [sp, #32]
941+
// ldp x20, x19, [sp, #48]
942+
// add sp, sp, #64
943+
//
944+
// In this case, it is always better to put the first ldp at the end, so
945+
// that the load-store optimizer can run and merge the ldp and the add into
946+
// a post-index ldp.
947+
// If we managed to grab the first pop instruction, move it to the end.
948+
if (LastPopI != Begin)
949+
MBB.splice(MBB.getFirstTerminator(), &MBB, LastPopI);
950+
// We should end up with something like this now:
951+
//
952+
// ldp x24, x23, [sp, #16]
953+
// ldp x22, x21, [sp, #32]
954+
// ldp x20, x19, [sp, #48]
955+
// ldp x26, x25, [sp]
956+
// add sp, sp, #64
957+
//
958+
// and the load-store optimizer can merge the last two instructions into:
959+
//
960+
// ldp x26, x25, [sp], #64
961+
//
912962
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
913-
ArgumentPopSize, TII, MachineInstr::FrameDestroy);
963+
AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
964+
}
914965
}
915966

916967
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1179,9 +1230,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
11791230

11801231
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
11811232

1182-
for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
1183-
++RPII) {
1184-
RegPairInfo RPI = *RPII;
1233+
auto EmitMI = [&](const RegPairInfo &RPI) {
11851234
unsigned Reg1 = RPI.Reg1;
11861235
unsigned Reg2 = RPI.Reg2;
11871236

@@ -1220,7 +1269,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
12201269
MIB.addMemOperand(MF.getMachineMemOperand(
12211270
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
12221271
MachineMemOperand::MOLoad, 8, 8));
1223-
}
1272+
};
1273+
1274+
if (ReverseCSRRestoreSeq)
1275+
for (const RegPairInfo &RPI : reverse(RegPairs))
1276+
EmitMI(RPI);
1277+
else
1278+
for (const RegPairInfo &RPI : RegPairs)
1279+
EmitMI(RPI);
12241280
return true;
12251281
}
12261282

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
2+
# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
3+
--- |
4+
5+
define void @foo() nounwind { entry: unreachable }
6+
7+
define void @bar() nounwind { entry: unreachable }
8+
9+
...
10+
---
11+
name: foo
12+
# CHECK-LABEL: name: foo
13+
tracksRegLiveness: true
14+
body: |
15+
bb.0:
16+
%x19 = IMPLICIT_DEF
17+
%x20 = IMPLICIT_DEF
18+
%x21 = IMPLICIT_DEF
19+
%x22 = IMPLICIT_DEF
20+
%x23 = IMPLICIT_DEF
21+
%x24 = IMPLICIT_DEF
22+
%x25 = IMPLICIT_DEF
23+
%x26 = IMPLICIT_DEF
24+
25+
; The local stack size is 0, so the last ldp in the sequence will also
26+
; restore the stack.
27+
; CHECK: %x24, %x23 = LDPXi %sp, 2
28+
; CHECK-NEXT: %x22, %x21 = LDPXi %sp, 4
29+
; CHECK-NEXT: %x20, %x19 = LDPXi %sp, 6
30+
31+
; Before running the load-store optimizer, we emit a ldp and an add.
32+
; BEFORELDSTOPT-NEXT: %x26, %x25 = LDPXi %sp, 0
33+
; BEFORELDSTOPT-NEXT: %sp = ADDXri %sp, 64, 0
34+
35+
; We want to make sure that after running the load-store optimizer, the ldp
36+
; and the add get merged into a post-index ldp.
37+
; AFTERLDSTOPT-NEXT: early-clobber %sp, %x26, %x25 = LDPXpost %sp, 8
38+
39+
RET_ReallyLR
40+
...
41+
---
42+
name: bar
43+
# CHECK-LABEL: name: bar
44+
tracksRegLiveness: true
45+
stack:
46+
- { id : 0, size: 8, alignment: 4,
47+
stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
48+
local-offset: -4, di-variable: '', di-expression: '', di-location: '' }
49+
50+
body: |
51+
bb.0:
52+
%x19 = IMPLICIT_DEF
53+
%x20 = IMPLICIT_DEF
54+
%x21 = IMPLICIT_DEF
55+
%x22 = IMPLICIT_DEF
56+
%x23 = IMPLICIT_DEF
57+
%x24 = IMPLICIT_DEF
58+
%x25 = IMPLICIT_DEF
59+
%x26 = IMPLICIT_DEF
60+
61+
; The local stack size is not 0, and we can combine the CSR stack size with
62+
; the local stack size. This results in rewriting the offsets for all the
63+
; save/restores and prevents us from merging the stack adjustment and the last pop.
64+
; In this case, there is no point in moving the first CSR pair to the end.
65+
; CHECK: %x26, %x25 = LDPXi %sp, 2
66+
; CHECK-NEXT: %x24, %x23 = LDPXi %sp, 4
67+
; CHECK-NEXT: %x22, %x21 = LDPXi %sp, 6
68+
; CHECK-NEXT: %x20, %x19 = LDPXi %sp, 8
69+
; CHECK-NEXT: %sp = ADDXri %sp, 80, 0
70+
RET_ReallyLR
71+
...

0 commit comments

Comments
 (0)