From 2717444753318e461e0c3b30dacd03ffbac96903 Mon Sep 17 00:00:00 2001 From: Ariel Ben-Yehuda Date: Sat, 23 Dec 2017 23:22:39 +0200 Subject: [PATCH] try to improve the pass manager order This changes the order of the GVNs a bit. I removed @eddyb's GVN added in #35662 - I checked the original examples, and it does not seem to do anything. Instead of it, I added a GVN to the *start* of the optimization pipeline, to remove junk. I moved the second GVN to after memcpyopt, so that we'll have a GVN after MemCpy optimizations - we already have a GVN "before" it. Also, I duplicated IndVarSimplify instead of adding a SimplifyCfg - looking at benchmarks, it should have the same performance impact. --- lib/Transforms/IPO/PassManagerBuilder.cpp | 64 ++++++++++++++++++++--- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 260b2283b311..63837d16774b 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -291,6 +291,9 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. + + // **** Initial canonicalization sequence, clean up the code after inlining. + // Break up aggregate allocas, using SSAUpdater. MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies @@ -299,21 +302,62 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createJumpThreadingPass()); // Thread jumps. MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs - // Combine silly seq's - addInstructionCombiningPass(MPM); + + // **** After we have taken out the trash, we can do more expensive and + // aggressive optimizations. + // + // I have no idea what is the best order of these passes. + // + // ($) there might be some `br i1 false` here that InstCombine discovered + // that we might want to kill somehow. However, every SimplifyCfg or + // JumpThreading I add takes 1% of the compiler's performance even if + // it does nothing. + // + // I believe we could have some sort of "br i1 false"-removal pass + // in strategic places, that should not be too slow. Ideally, in + // 90% of the inter-pass transitions the pass would have + // nothing to do and therefore be fast (there's an O(N*M) problem, + // where for a large function we might get hit with the full + // cost). That needs to be further investigated. + + addInstructionCombiningPass(MPM); // Combine silly seq's if (SizeLevel == 0 && !DisableLibCallsShrinkWrap) MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + if (OptLevel > 1) { + // Merge duplicate loads and do cross-BB load/store forwarding. This should + // happen before the loop passes. This is done earlier than in C++ because + // these optimizations are much more useful in Rust, because of noalias. + MPM.add(NewGVN ? createNewGVNPass() + : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies + } MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + + // **** Loop optimizations. There are 2 loop optimization "sequences", + // with an InstCombine+SimplifyCfg in the middle. + + // Seq #1 + MPM.add(createReassociatePass()); // Reassociate expressions // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants + MPM.add(createIndVarSimplifyPass()); // Simplify Indvars MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + + // Cleanup between seqs. + MPM.add(createCFGSimplificationPass()); addInstructionCombiningPass(MPM); + + // Seq #2 + + // I am intentionally duplicating IndVarSimplify. The SimplifyCfg pass after + // the first IndVarSimplify gets rid of a bunch of junk that interferes + // with loop idiom recognition, and the second IndVarSimplify was present + // in C++ so I don't want to remove it much. + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops @@ -325,25 +369,31 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + // End of loop optimization sequence. + + // Optimization sequences I know we need: + // UNROLL -> SIMPLIFY -> MEMCPYOPT -> INSTCOMBINE -> GVN - needed for + + // Exit out of LCSSA, and do some cleanup after loop unrolling. + MPM.add(createCFGSimplificationPass()); + + MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset if (OptLevel > 1) { if (EnableMLSM) MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds MPM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies } - MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP // Delete dead bit computations (instcombine runs after to fold away the dead // computations, and then ADCE will run later to exploit any new DCE // opportunities that creates). MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations - // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. addInstructionCombiningPass(MPM); - if (OptLevel > 1) - MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass());