@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
140
140
cl::desc (" enable use of redzone on AArch64" ),
141
141
cl::init(false ), cl::Hidden);
142
142
143
+ static cl::opt<bool >
144
+ ReverseCSRRestoreSeq (" reverse-csr-restore-seq" ,
145
+ cl::desc (" reverse the CSR restore sequence" ),
146
+ cl::init(false ), cl::Hidden);
147
+
143
148
STATISTIC (NumRedZoneFunctions, " Number of functions using red zone" );
144
149
145
150
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -843,14 +848,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
843
848
Subtarget.isCallingConvWin64 (MF.getFunction ().getCallingConv ());
844
849
unsigned FixedObject = IsWin64 ? alignTo (AFI->getVarArgsGPRSize (), 16 ) : 0 ;
845
850
851
+ uint64_t AfterCSRPopSize = ArgumentPopSize;
846
852
auto PrologueSaveSize = AFI->getCalleeSavedStackSize () + FixedObject;
847
853
bool CombineSPBump = shouldCombineCSRLocalStackBump (MF, NumBytes);
848
-
849
- if (!CombineSPBump && PrologueSaveSize != 0 )
850
- convertCalleeSaveRestoreToSPPrePostIncDec (
851
- MBB, std::prev (MBB.getFirstTerminator ()), DL, TII, PrologueSaveSize);
854
+ // Assume we can't combine the last pop with the sp restore.
855
+
856
+ if (!CombineSPBump && PrologueSaveSize != 0 ) {
857
+ MachineBasicBlock::iterator Pop = std::prev (MBB.getFirstTerminator ());
858
+ // Converting the last ldp to a post-index ldp is valid only if the last
859
+ // ldp's offset is 0.
860
+ const MachineOperand &OffsetOp = Pop->getOperand (Pop->getNumOperands () - 1 );
861
+ // If the offset is 0, convert it to a post-index ldp.
862
+ if (OffsetOp.getImm () == 0 ) {
863
+ convertCalleeSaveRestoreToSPPrePostIncDec (MBB, Pop, DL, TII,
864
+ PrologueSaveSize);
865
+ } else {
866
+ // If not, make sure to emit an add after the last ldp.
867
+ // We're doing this by transferring the size to be restored from the
868
+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
869
+ // pops.
870
+ AfterCSRPopSize += PrologueSaveSize;
871
+ }
872
+ }
852
873
853
874
// Move past the restores of the callee-saved registers.
875
+ // If we plan on combining the sp bump of the local stack size and the callee
876
+ // save stack size, we might need to adjust the CSR save and restore offsets.
854
877
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator ();
855
878
MachineBasicBlock::iterator Begin = MBB.begin ();
856
879
while (LastPopI != Begin) {
@@ -865,7 +888,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
865
888
// If there is a single SP update, insert it before the ret and we're done.
866
889
if (CombineSPBump) {
867
890
emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
868
- NumBytes + ArgumentPopSize , TII,
891
+ NumBytes + AfterCSRPopSize , TII,
869
892
MachineInstr::FrameDestroy);
870
893
return ;
871
894
}
@@ -877,18 +900,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
877
900
bool RedZone = canUseRedZone (MF);
878
901
// If this was a redzone leaf function, we don't need to restore the
879
902
// stack pointer (but we may need to pop stack args for fastcc).
880
- if (RedZone && ArgumentPopSize == 0 )
903
+ if (RedZone && AfterCSRPopSize == 0 )
881
904
return ;
882
905
883
906
bool NoCalleeSaveRestore = PrologueSaveSize == 0 ;
884
907
int StackRestoreBytes = RedZone ? 0 : NumBytes;
885
908
if (NoCalleeSaveRestore)
886
- StackRestoreBytes += ArgumentPopSize ;
909
+ StackRestoreBytes += AfterCSRPopSize ;
887
910
emitFrameOffset (MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
888
911
StackRestoreBytes, TII, MachineInstr::FrameDestroy);
889
912
// If we were able to combine the local stack pop with the argument pop,
890
913
// then we're done.
891
- if (NoCalleeSaveRestore || ArgumentPopSize == 0 )
914
+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0 )
892
915
return ;
893
916
NumBytes = 0 ;
894
917
}
@@ -908,9 +931,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
908
931
// This must be placed after the callee-save restore code because that code
909
932
// assumes the SP is at the same location as it was after the callee-save save
910
933
// code in the prologue.
911
- if (ArgumentPopSize)
934
+ if (AfterCSRPopSize) {
935
+ // Sometimes (when we restore in the same order as we save), we can end up
936
+ // with code like this:
937
+ //
938
+ // ldp x26, x25, [sp]
939
+ // ldp x24, x23, [sp, #16]
940
+ // ldp x22, x21, [sp, #32]
941
+ // ldp x20, x19, [sp, #48]
942
+ // add sp, sp, #64
943
+ //
944
+ // In this case, it is always better to put the first ldp at the end, so
945
+ // that the load-store optimizer can run and merge the ldp and the add into
946
+ // a post-index ldp.
947
+ // If we managed to grab the first pop instruction, move it to the end.
948
+ if (LastPopI != Begin)
949
+ MBB.splice (MBB.getFirstTerminator (), &MBB, LastPopI);
950
+ // We should end up with something like this now:
951
+ //
952
+ // ldp x24, x23, [sp, #16]
953
+ // ldp x22, x21, [sp, #32]
954
+ // ldp x20, x19, [sp, #48]
955
+ // ldp x26, x25, [sp]
956
+ // add sp, sp, #64
957
+ //
958
+ // and the load-store optimizer can merge the last two instructions into:
959
+ //
960
+ // ldp x26, x25, [sp], #64
961
+ //
912
962
emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
913
- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
963
+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
964
+ }
914
965
}
915
966
916
967
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1179,9 +1230,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
1179
1230
1180
1231
computeCalleeSaveRegisterPairs (MF, CSI, TRI, RegPairs);
1181
1232
1182
- for (auto RPII = RegPairs.begin (), RPIE = RegPairs.end (); RPII != RPIE;
1183
- ++RPII) {
1184
- RegPairInfo RPI = *RPII;
1233
+ auto EmitMI = [&](const RegPairInfo &RPI) {
1185
1234
unsigned Reg1 = RPI.Reg1 ;
1186
1235
unsigned Reg2 = RPI.Reg2 ;
1187
1236
@@ -1220,7 +1269,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
1220
1269
MIB.addMemOperand (MF.getMachineMemOperand (
1221
1270
MachinePointerInfo::getFixedStack (MF, RPI.FrameIdx ),
1222
1271
MachineMemOperand::MOLoad, 8 , 8 ));
1223
- }
1272
+ };
1273
+
1274
+ if (ReverseCSRRestoreSeq)
1275
+ for (const RegPairInfo &RPI : reverse (RegPairs))
1276
+ EmitMI (RPI);
1277
+ else
1278
+ for (const RegPairInfo &RPI : RegPairs)
1279
+ EmitMI (RPI);
1224
1280
return true ;
1225
1281
}
1226
1282
0 commit comments