ssa: removes map use for block traversals (#2235)

This removes the use of map in basic block traversals. As a result, overall compilation perf improves like the below: ### Zig ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_zig.txt │ new_zig.txt │ │ sec/op │ sec/op vs base │ Zig/Compile/test-opt.wasm-10 4.438 ± 1% 3.778 ± 0% -14.87% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 18.77 ± 1% 18.76 ± 0% ~ (p=0.818 n=6) Zig/Compile/test.wasm-10 5.083 ± 0% 4.673 ± 0% -8.07% (p=0.002 n=6) Zig/Run/test.wasm-10 19.27 ± 1% 19.30 ± 1% ~ (p=0.699 n=6) geomean 9.504 8.941 -5.92% │ old_zig.txt │ new_zig.txt │ │ B/op │ B/op vs base │ Zig/Compile/test-opt.wasm-10 396.7Mi ± 0% 394.7Mi ± 0% -0.51% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 741.7Mi ± 0% 741.7Mi ± 0% ~ (p=0.900 n=6) Zig/Compile/test.wasm-10 660.0Mi ± 0% 659.5Mi ± 0% -0.08% (p=0.002 n=6) Zig/Run/test.wasm-10 1.296Gi ± 0% 1.296Gi ± 0% ~ (p=0.892 n=6) geomean 712.6Mi 711.5Mi -0.15% │ old_zig.txt │ new_zig.txt │ │ allocs/op │ allocs/op vs base │ Zig/Compile/test-opt.wasm-10 363.2k ± 0% 362.6k ± 0% -0.17% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 51.58k ± 0% 51.58k ± 0% ~ (p=0.933 n=6) Zig/Compile/test.wasm-10 515.2k ± 0% 515.4k ± 0% ~ (p=0.485 n=6) Zig/Run/test.wasm-10 2.156M ± 0% 2.156M ± 0% ~ (p=0.998 n=6) geomean 379.8k 379.7k -0.03% ``` ### wasip1 ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_wasip1.txt │ new_wasip1.txt │ │ sec/op │ sec/op vs base │ Wasip1/Compile/src_archive_tar.test-10 2.198 ± 1% 2.067 ± 1% -5.96% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 398.8m ± 0% 398.8m ± 0% ~ (p=0.902 n=7) Wasip1/Compile/src_bufio.test-10 1.492 ± 0% 1.409 ± 1% -5.57% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 120.5m ± 1% 121.0m ± 1% +0.44% (p=0.017 n=7) Wasip1/Compile/src_bytes.test-10 1.543 ± 0% 1.454 ± 0% -5.72% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 469.0m ± 1% 467.4m ± 1% ~ (p=0.209 n=7) Wasip1/Compile/src_context.test-10 1.664 ± 0% 1.564 ± 1% -6.00% (p=0.001 
n=7) Wasip1/Run/src_context.test-10 31.54m ± 1% 31.57m ± 0% ~ (p=0.445 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 1.261 ± ∞ ¹ geomean 527.3m 565.9m -2.92% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_wasip1.txt │ new_wasip1.txt │ │ B/op │ B/op vs base │ Wasip1/Compile/src_archive_tar.test-10 93.44Mi ± 0% 93.17Mi ± 0% -0.30% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 286.0Mi ± 0% 286.0Mi ± 0% ~ (p=0.593 n=7) Wasip1/Compile/src_bufio.test-10 74.38Mi ± 0% 74.13Mi ± 0% -0.35% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 105.3Mi ± 0% 105.3Mi ± 0% ~ (p=0.780 n=7) Wasip1/Compile/src_bytes.test-10 75.58Mi ± 0% 75.32Mi ± 0% -0.35% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 605.0Mi ± 0% 605.0Mi ± 0% ~ (p=0.331 n=7) Wasip1/Compile/src_context.test-10 78.33Mi ± 0% 78.07Mi ± 0% -0.33% (p=0.001 n=7) Wasip1/Run/src_context.test-10 71.52Mi ± 0% 71.52Mi ± 0% ~ (p=1.000 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 70.38Mi ± ∞ ¹ geomean 123.4Mi 115.7Mi -0.17% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_wasip1.txt │ new_wasip1.txt │ │ allocs/op │ allocs/op vs base │ Wasip1/Compile/src_archive_tar.test-10 265.4k ± 0% 265.0k ± 0% -0.16% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 7.831k ± 0% 7.830k ± 0% ~ (p=1.000 n=7) Wasip1/Compile/src_bufio.test-10 195.6k ± 0% 195.4k ± 0% -0.12% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 3.728k ± 0% 3.728k ± 0% ~ (p=1.000 n=7) ¹ Wasip1/Compile/src_bytes.test-10 204.1k ± 0% 203.7k ± 0% -0.20% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 6.377k ± 0% 6.377k ± 0% ~ (p=1.000 n=7) Wasip1/Compile/src_context.test-10 221.7k ± 0% 221.6k ± 0% -0.06% (p=0.001 n=7) Wasip1/Run/src_context.test-10 3.814k ± 0% 3.814k ± 1% ~ (p=0.140 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 182.3k ± ∞ ² geomean 33.71k 40.64k -0.07% ¹ all samples are equal ² need >= 6 samples for confidence interval at level 0.95 ``` ### TinyGo ``` goos: darwin goarch: arm64 pkg: 
github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_tinygo.txt │ new_tinygo.txt │ │ sec/op │ sec/op vs base │ TinyGo/Compile/container_heap.test-10 410.8m ± 1% 399.8m ± 0% -2.69% (p=0.001 n=7) TinyGo/Run/container_heap.test-10 14.41m ± 0% 14.29m ± 2% -0.77% (p=0.026 n=7) TinyGo/Compile/container_list.test-10 410.5m ± 1% 398.1m ± 0% -3.02% (p=0.001 n=7) TinyGo/Run/container_list.test-10 14.27m ± 2% 14.16m ± 1% ~ (p=0.073 n=7) TinyGo/Compile/container_ring.test-10 403.7m ± 1% 392.5m ± 2% -2.77% (p=0.001 n=7) TinyGo/Run/container_ring.test-10 14.24m ± 0% 14.27m ± 1% ~ (p=0.259 n=7) TinyGo/Compile/crypto_des.test-10 418.8m ± 0% 408.1m ± 0% -2.56% (p=0.001 n=7) TinyGo/Run/crypto_des.test-10 18.23m ± 0% 18.17m ± 1% ~ (p=0.456 n=7) TinyGo/Compile/crypto_md5.test-10 417.3m ± 2% 406.1m ± 1% -2.68% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 20.50m ± 0% 20.45m ± 1% ~ (p=0.128 n=7) TinyGo/Compile/crypto_rc4.test-10 402.2m ± 1% 390.5m ± 0% -2.90% (p=0.001 n=7) TinyGo/Run/crypto_rc4.test-10 160.8m ± 0% 161.0m ± 1% ~ (p=1.000 n=7) TinyGo/Compile/crypto_sha1.test-10 417.2m ± 1% 404.5m ± 1% -3.04% (p=0.001 n=7) TinyGo/Run/crypto_sha1.test-10 15.93m ± 1% 15.90m ± 1% ~ (p=0.710 n=7) TinyGo/Compile/crypto_sha256.test-10 423.4m ± 1% 412.4m ± 1% -2.60% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 16.16m ± ∞ ¹ 16.05m ± ∞ ¹ ~ (p=0.381 n=2+5) geomean 94.17m 92.70m -1.56% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_tinygo.txt │ new_tinygo.txt │ │ B/op │ B/op vs base │ TinyGo/Compile/container_heap.test-10 48.55Mi ± 0% 48.30Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/container_heap.test-10 16.63Mi ± 0% 16.63Mi ± 0% ~ (p=0.557 n=7) TinyGo/Compile/container_list.test-10 48.53Mi ± 0% 48.29Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/container_list.test-10 16.40Mi ± 0% 16.40Mi ± 0% ~ (p=0.364 n=7) TinyGo/Compile/container_ring.test-10 47.78Mi ± 0% 47.53Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/container_ring.test-10 16.30Mi ± 0% 16.30Mi ± 0% ~ (p=0.128 n=7) 
TinyGo/Compile/crypto_des.test-10 48.67Mi ± 0% 48.42Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_des.test-10 16.76Mi ± 0% 16.76Mi ± 0% ~ (p=0.902 n=7) TinyGo/Compile/crypto_md5.test-10 48.73Mi ± 0% 48.48Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 44.97Mi ± 0% 44.97Mi ± 0% ~ (p=0.402 n=7) TinyGo/Compile/crypto_rc4.test-10 47.76Mi ± 0% 47.52Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_rc4.test-10 29.28Mi ± 0% 29.28Mi ± 0% ~ (p=0.104 n=7) TinyGo/Compile/crypto_sha1.test-10 48.97Mi ± 0% 48.72Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/crypto_sha1.test-10 17.44Mi ± 0% 17.44Mi ± 0% ~ (p=1.000 n=7) TinyGo/Compile/crypto_sha256.test-10 48.81Mi ± 0% 48.56Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 17.53Mi ± ∞ ¹ 17.53Mi ± ∞ ¹ ~ (p=0.381 n=2+5) geomean 31.45Mi 31.37Mi -0.26% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_tinygo.txt │ new_tinygo.txt │ │ allocs/op │ allocs/op vs base │ TinyGo/Compile/container_heap.test-10 83.67k ± 0% 83.46k ± 0% -0.25% (p=0.011 n=7) TinyGo/Run/container_heap.test-10 374.9k ± 0% 374.9k ± 0% ~ (p=1.000 n=7) TinyGo/Compile/container_list.test-10 83.34k ± 0% 83.19k ± 0% -0.19% (p=0.002 n=7) TinyGo/Run/container_list.test-10 370.0k ± 0% 370.0k ± 0% ~ (p=0.674 n=7) TinyGo/Compile/container_ring.test-10 83.26k ± 0% 83.08k ± 0% -0.22% (p=0.004 n=7) TinyGo/Run/container_ring.test-10 367.6k ± 0% 367.6k ± 0% ~ (p=0.249 n=7) TinyGo/Compile/crypto_des.test-10 83.68k ± 0% 83.53k ± 0% -0.18% (p=0.004 n=7) TinyGo/Run/crypto_des.test-10 378.1k ± 0% 378.1k ± 0% ~ (p=0.437 n=7) TinyGo/Compile/crypto_md5.test-10 83.86k ± 0% 83.67k ± 0% -0.23% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 393.3k ± 0% 393.3k ± 0% ~ (p=0.592 n=7) TinyGo/Compile/crypto_rc4.test-10 83.32k ± 0% 83.20k ± 0% -0.14% (p=0.011 n=7) TinyGo/Run/crypto_rc4.test-10 367.1k ± 0% 367.1k ± 0% ~ (p=0.102 n=7) TinyGo/Compile/crypto_sha1.test-10 84.05k ± 0% 83.87k ± 0% -0.21% (p=0.002 n=7) TinyGo/Run/crypto_sha1.test-10 392.7k ± 0% 392.7k ± 0% ~ (p=1.000 
n=7) TinyGo/Compile/crypto_sha256.test-10 83.86k ± 0% 83.67k ± 0% -0.24% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 394.5k ± ∞ ¹ 394.5k ± ∞ ¹ ~ (p=0.952 n=2+5) geomean 178.2k 178.0k -0.10% ``` ### wazero compiled as wasip1 binary ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Compilation-10 2.413 ± 0% 2.258 ± 1% -6.42% (p=0.001 n=7) │ old.txt │ new.txt │ │ B/op │ B/op vs base │ Compilation-10 339.9Mi ± 0% 337.7Mi ± 0% -0.63% (p=0.001 n=7) │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ Compilation-10 603.9k ± 0% 602.4k ± 0% -0.25% (p=0.001 n=7) ``` Signed-off-by: Takeshi Yoneda <[email protected]>
tetratelabs · Jun 7, 2024 · 747609b · 747609b
1 parent f47fd2e
commit 747609b
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 45 deletions.
diff --git a/internal/engine/wazevo/ssa/basic_block.go b/internal/engine/wazevo/ssa/basic_block.go
@@ -112,7 +112,10 @@ type (
 
 		// reversePostOrder is used to sort all the blocks in the function in reverse post order.
 		// This is used in builder.LayoutBlocks.
-		reversePostOrder int
+		reversePostOrder int32
+
+		// visited is used during various traversals.
+		visited int32
 
 		// child and sibling are the ones in the dominator tree.
 		child, sibling *basicBlock
@@ -274,6 +277,7 @@ func resetBasicBlock(bb *basicBlock) {
 	bb.unknownValues = bb.unknownValues[:0]
 	bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions)
 	bb.reversePostOrder = -1
+	bb.visited = 0
 	bb.loopNestingForestChildren = basicBlockVarLengthNil
 	bb.loopHeader = false
 	bb.sibling = nil

diff --git a/internal/engine/wazevo/ssa/builder.go b/internal/engine/wazevo/ssa/builder.go
@@ -143,7 +143,6 @@ func NewBuilder() Builder {
 		varLengthPool:                  wazevoapi.NewVarLengthPool[Value](),
 		valueAnnotations:               make(map[ValueID]string),
 		signatures:                     make(map[SignatureID]*Signature),
-		blkVisited:                     make(map[*basicBlock]int),
 		valueIDAliases:                 make(map[ValueID]Value),
 		redundantParameterIndexToValue: make(map[int]Value),
 		returnBlk:                      &basicBlock{id: basicBlockIDReturnBlock},
@@ -189,7 +188,6 @@ type builder struct {
 
 	// The following are used for optimization passes/deterministic compilation.
 	instStack                      []*Instruction
-	blkVisited                     map[*basicBlock]int
 	valueIDToInstruction           []*Instruction
 	blkStack                       []*basicBlock
 	blkStack2                      []*basicBlock
@@ -266,11 +264,6 @@ func (b *builder) Init(s *Signature) {
 	b.blkStack2 = b.blkStack2[:0]
 	b.dominators = b.dominators[:0]
 	b.loopNestingForestRoots = b.loopNestingForestRoots[:0]
-
-	for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
-		blk := b.basicBlocksPool.View(i)
-		delete(b.blkVisited, blk)
-	}
 	b.basicBlocksPool.Reset()
 
 	for v := ValueID(0); v < b.nextValueID; v++ {

diff --git a/internal/engine/wazevo/ssa/pass.go b/internal/engine/wazevo/ssa/pass.go
@@ -78,12 +78,11 @@ func (b *builder) runFinalizingPasses() {
 // passDeadBlockEliminationOpt searches for unreachable blocks and sets the basicBlock.invalid flag to true on each of them.
 func passDeadBlockEliminationOpt(b *builder) {
 	entryBlk := b.entryBlk()
-	b.clearBlkVisited()
 	b.blkStack = append(b.blkStack, entryBlk)
 	for len(b.blkStack) > 0 {
 		reachableBlk := b.blkStack[len(b.blkStack)-1]
 		b.blkStack = b.blkStack[:len(b.blkStack)-1]
-		b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass.
+		reachableBlk.visited = 1
 
 		if !reachableBlk.sealed && !reachableBlk.ReturnBlock() {
 			panic(fmt.Sprintf("%s is not sealed", reachableBlk))
@@ -94,17 +93,18 @@ func passDeadBlockEliminationOpt(b *builder) {
 		}
 
 		for _, succ := range reachableBlk.success {
-			if _, ok := b.blkVisited[succ]; ok {
+			if succ.visited == 1 {
 				continue
 			}
 			b.blkStack = append(b.blkStack, succ)
 		}
 	}
 
 	for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
-		if _, ok := b.blkVisited[blk]; !ok {
+		if blk.visited != 1 {
 			blk.invalid = true
 		}
+		blk.visited = 0
 	}
 }
 
@@ -121,7 +121,7 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	//  the maximum number of iteration was 22, which seems to be acceptable but not that small either since the
 	//  complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands.
 	//  -- Note --
-	// 	Currently, each iteration can run in an order of blocks, but it empirically converges quickly in practice when
+	// 	Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when
 	// 	running on the reverse post-order. It might be possible to optimize this further by using the dominator tree.
 	for {
 		changed := false
@@ -355,18 +355,6 @@ func (b *builder) incRefCount(id ValueID, from *Instruction) {
 	b.valueRefCounts[id]++
 }
 
-// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places.
-func (b *builder) clearBlkVisited() {
-	b.blkStack2 = b.blkStack2[:0]
-	for key := range b.blkVisited {
-		b.blkStack2 = append(b.blkStack2, key)
-	}
-	for _, blk := range b.blkStack2 {
-		delete(b.blkVisited, blk)
-	}
-	b.blkStack2 = b.blkStack2[:0]
-}
-
 // passNopInstElimination eliminates the instructions which are essentially no-ops.
 func passNopInstElimination(b *builder) {
 	if int(b.nextValueID) >= len(b.valueIDToInstruction) {

diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts.go b/internal/engine/wazevo/ssa/pass_blk_layouts.go
@@ -23,8 +23,6 @@ import (
 //
 // This heuristic is done in maybeInvertBranches function.
 func passLayoutBlocks(b *builder) {
-	b.clearBlkVisited()
-
 	// We might end up splitting critical edges which adds more basic blocks,
 	// so we store the currently existing basic blocks in nonSplitBlocks temporarily.
 	// That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks.
@@ -47,20 +45,20 @@ func passLayoutBlocks(b *builder) {
 	for _, blk := range nonSplitBlocks {
 		for i := range blk.preds {
 			pred := blk.preds[i].blk
-			if _, ok := b.blkVisited[pred]; ok || !pred.Valid() {
+			if pred.visited == 1 || !pred.Valid() {
 				continue
 			} else if pred.reversePostOrder < blk.reversePostOrder {
 				// This means the edge is critical, and this pred is the trampoline and yet to be inserted.
 				// Split edge trampolines must come before the destination in reverse post-order.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred)
-				b.blkVisited[pred] = 0 // mark as inserted, the value is not used.
+				pred.visited = 1 // mark as inserted.
 			}
 		}
 
 		// Now that we've already added all the potential trampoline blocks incoming to this block,
 		// we can add this block itself.
 		b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk)
-		b.blkVisited[blk] = 0 // mark as inserted, the value is not used.
+		blk.visited = 1 // mark as inserted.
 
 		if len(blk.success) < 2 {
 			// There won't be critical edge originating from this block.
@@ -116,7 +114,7 @@ func passLayoutBlocks(b *builder) {
 			if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline {
 				// This can be lowered as fallthrough at the end of the block.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} else {
 				uninsertedTrampolines = append(uninsertedTrampolines, trampoline)
 			}
@@ -126,7 +124,7 @@ func passLayoutBlocks(b *builder) {
 			if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself.
 				// This means the critical edge was backward, so we insert after the current block immediately.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} // If the target is forward, we can wait to insert until the target is inserted.
 		}
 		uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block.
@@ -142,7 +140,7 @@ func passLayoutBlocks(b *builder) {
 
 	if wazevoapi.SSAValidationEnabled {
 		for _, trampoline := range trampolines {
-			if _, ok := b.blkVisited[trampoline]; !ok {
+			if trampoline.visited != 1 {
 				panic("BUG: trampoline block not inserted: " + trampoline.formatHeader(b))
 			}
 			trampoline.validate(b)

diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts_test.go b/internal/engine/wazevo/ssa/pass_blk_layouts_test.go
@@ -192,7 +192,7 @@ func TestBuilder_splitCriticalEdge(t *testing.T) {
 	predInfo := &basicBlockPredecessorInfo{blk: predBlk, branch: originalBrz}
 	trampoline := b.splitCriticalEdge(predBlk, dummyBlk, predInfo)
 	require.NotNil(t, trampoline)
-	require.Equal(t, 100, trampoline.reversePostOrder)
+	require.Equal(t, int32(100), trampoline.reversePostOrder)
 
 	require.Equal(t, trampoline, predInfo.blk)
 	require.Equal(t, originalBrz, predInfo.branch)

diff --git a/internal/engine/wazevo/ssa/pass_cfg.go b/internal/engine/wazevo/ssa/pass_cfg.go
@@ -15,10 +15,6 @@ import (
 // At the end of the pass, this function also does the loop detection and sets the basicBlock.loop flag.
 func passCalculateImmediateDominators(b *builder) {
 	reversePostOrder := b.reversePostOrderedBasicBlocks[:0]
-	exploreStack := b.blkStack[:0]
-	b.clearBlkVisited()
-
-	entryBlk := b.entryBlk()
 
 	// Store the reverse postorder from the entrypoint into reversePostOrder slice.
 	// This calculation of reverse postorder is not described in the paper,
@@ -28,14 +24,17 @@ func passCalculateImmediateDominators(b *builder) {
 	// which is a reasonable assumption as long as SSA Builder is properly used.
 	//
 	// First, we push blocks in postorder by iteratively visiting the successors of the entry block.
-	exploreStack = append(exploreStack, entryBlk)
+	entryBlk := b.entryBlk()
+	exploreStack := append(b.blkStack[:0], entryBlk)
+	// These flags are used to track the state of the block in the DFS traversal.
+	// We use the basicBlock.visited field to store the state.
 	const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2
-	b.blkVisited[entryBlk] = visitStateSeen
+	entryBlk.visited = visitStateSeen
 	for len(exploreStack) > 0 {
 		tail := len(exploreStack) - 1
 		blk := exploreStack[tail]
 		exploreStack = exploreStack[:tail]
-		switch b.blkVisited[blk] {
+		switch blk.visited {
 		case visitStateUnseen:
 			// This is likely a bug in the frontend.
 			panic("BUG: unsupported CFG")
@@ -48,16 +47,18 @@ func passCalculateImmediateDominators(b *builder) {
 				if succ.ReturnBlock() || succ.invalid {
 					continue
 				}
-				if b.blkVisited[succ] == visitStateUnseen {
-					b.blkVisited[succ] = visitStateSeen
+				if succ.visited == visitStateUnseen {
+					succ.visited = visitStateSeen
 					exploreStack = append(exploreStack, succ)
 				}
 			}
 			// Finally, we could pop this block once we pop all of its successors.
-			b.blkVisited[blk] = visitStateDone
+			blk.visited = visitStateDone
 		case visitStateDone:
 			// Note: at this point we push blk in postorder despite its name.
 			reversePostOrder = append(reversePostOrder, blk)
+		default:
+			panic("BUG")
 		}
 	}
 	// At this point, reversePostOrder has postorder actually, so we reverse it.
@@ -67,7 +68,7 @@ func passCalculateImmediateDominators(b *builder) {
 	}
 
 	for i, blk := range reversePostOrder {
-		blk.reversePostOrder = i
+		blk.reversePostOrder = int32(i)
 	}
 
 	// Reuse the dominators slice if possible from the previous computation of function.