From 1a955bcdb701d788b048a39d7273621729e257bc Mon Sep 17 00:00:00 2001
From: vpachkov <slava.pach@gmail.com>
Date: Thu, 2 Dec 2021 16:18:29 +0300
Subject: [PATCH] reflectdata: unroll a loop in array equal function generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As josharian mentioned, a compare function could benefit from
unrolling a loop for arrays. This commit introduces such
functionality.

name                     old time/op  new time/op  delta
EqArrayOfStrings5-12     12.5ns ± 1%   8.4ns ± 1%  -33.05%  (p=0.008 n=5+5)
EqArrayOfStrings64-12    71.7ns ± 1%  64.1ns ± 1%  -10.57%  (p=0.008 n=5+5)
EqArrayOfStrings1024-12  1.12µs ± 1%  1.01µs ± 0%   -9.77%  (p=0.008 n=5+5)
[Geo mean]                100ns         81ns       -18.56%

name                    old time/op  new time/op  delta
EqArrayOfFloats5-12     4.50ns ± 2%  3.32ns ± 1%  -26.09%  (p=0.008 n=5+5)
EqArrayOfFloats64-12    41.3ns ± 1%  35.7ns ± 0%  -13.63%  (p=0.016 n=5+4)
EqArrayOfFloats1024-12   619ns ± 1%   557ns ± 1%   -9.95%  (p=0.008 n=5+5)
[Geo mean]              48.6ns       40.4ns       -16.85%

Change-Id: If1b69c5cf3fb246bb0275a292118b0b93ad9c9a8
Reviewed-on: https://go-review.googlesource.com/c/go/+/368614
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Emmanuel Odeke <emmanuel@orijtech.com>
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
---
 src/cmd/compile/internal/reflectdata/alg.go   | 94 ++++++++++++-------
 .../compile/internal/reflectdata/alg_test.go  | 76 +++++++++++++++
 2 files changed, 136 insertions(+), 34 deletions(-)
 create mode 100644 src/cmd/compile/internal/reflectdata/alg_test.go

diff --git a/src/cmd/compile/internal/reflectdata/alg.go b/src/cmd/compile/internal/reflectdata/alg.go
index d000618bd64..526315d5570 100644
--- a/src/cmd/compile/internal/reflectdata/alg.go
+++ b/src/cmd/compile/internal/reflectdata/alg.go
@@ -412,22 +412,25 @@ func geneq(t *types.Type) *obj.LSym {
 		//
 		// if eq(p[0], q[0]) && eq(p[1], q[1]) && ... {
 		// } else {
-		//   return
+		//   goto neq
 		// }
 		//
 		// And so on.
 		//
 		// Otherwise it generates:
 		//
-		// for i := 0; i < nelem; i++ {
-		//   if eq(p[i], q[i]) {
+		// iterateTo := nelem/unroll*unroll
+		// for i := 0; i < iterateTo; i += unroll {
+		//   if eq(p[i+0], q[i+0]) && eq(p[i+1], q[i+1]) && ... && eq(p[i+unroll-1], q[i+unroll-1]) {
 		//   } else {
 		//     goto neq
 		//   }
 		// }
+		// if eq(p[iterateTo+0], q[iterateTo+0]) && eq(p[iterateTo+1], q[iterateTo+1]) && ... {
+		// } else {
+		//    goto neq
+		// }
 		//
-		// TODO(josharian): consider doing some loop unrolling
-		// for larger nelem as well, processing a few elements at a time in a loop.
 		checkAll := func(unroll int64, last bool, eq func(pi, qi ir.Node) ir.Node) {
 			// checkIdx generates a node to check for equality at index i.
 			checkIdx := func(i ir.Node) ir.Node {
@@ -442,38 +445,62 @@ func geneq(t *types.Type) *obj.LSym {
 				return eq(pi, qi)
 			}
 
-			if nelem <= unroll {
-				if last {
-					// Do last comparison in a different manner.
-					nelem--
-				}
-				// Generate a series of checks.
-				for i := int64(0); i < nelem; i++ {
-					// if check {} else { goto neq }
-					nif := ir.NewIfStmt(base.Pos, checkIdx(ir.NewInt(i)), nil, nil)
-					nif.Else.Append(ir.NewBranchStmt(base.Pos, ir.OGOTO, neq))
-					fn.Body.Append(nif)
-				}
-				if last {
-					fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, checkIdx(ir.NewInt(nelem))))
-				}
-			} else {
-				// Generate a for loop.
-				// for i := 0; i < nelem; i++
+			iterations := nelem / unroll
+			iterateTo := iterations * unroll
+			// If a loop is iterated only once, there shouldn't be any loop at all.
+			if iterations == 1 {
+				iterateTo = 0
+			}
+
+			if iterateTo > 0 {
+				// Generate an unrolled for loop.
+				// for i := 0; i < nelem/unroll*unroll; i += unroll
 				i := typecheck.Temp(types.Types[types.TINT])
 				init := ir.NewAssignStmt(base.Pos, i, ir.NewInt(0))
-				cond := ir.NewBinaryExpr(base.Pos, ir.OLT, i, ir.NewInt(nelem))
-				post := ir.NewAssignStmt(base.Pos, i, ir.NewBinaryExpr(base.Pos, ir.OADD, i, ir.NewInt(1)))
-				loop := ir.NewForStmt(base.Pos, nil, cond, post, nil)
+				cond := ir.NewBinaryExpr(base.Pos, ir.OLT, i, ir.NewInt(iterateTo))
+				loop := ir.NewForStmt(base.Pos, nil, cond, nil, nil)
 				loop.PtrInit().Append(init)
-				// if eq(pi, qi) {} else { goto neq }
-				nif := ir.NewIfStmt(base.Pos, checkIdx(i), nil, nil)
-				nif.Else.Append(ir.NewBranchStmt(base.Pos, ir.OGOTO, neq))
-				loop.Body.Append(nif)
-				fn.Body.Append(loop)
-				if last {
-					fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, ir.NewBool(true)))
+
+				// if eq(p[i+0], q[i+0]) && eq(p[i+1], q[i+1]) && ... && eq(p[i+unroll-1], q[i+unroll-1]) {
+				// } else {
+				//   goto neq
+				// }
+				for j := int64(0); j < unroll; j++ {
+					// if check {} else { goto neq }
+					nif := ir.NewIfStmt(base.Pos, checkIdx(i), nil, nil)
+					nif.Else.Append(ir.NewBranchStmt(base.Pos, ir.OGOTO, neq))
+					loop.Body.Append(nif)
+					post := ir.NewAssignStmt(base.Pos, i, ir.NewBinaryExpr(base.Pos, ir.OADD, i, ir.NewInt(1)))
+					loop.Body.Append(post)
 				}
+
+				fn.Body.Append(loop)
+
+				if nelem == iterateTo {
+					if last {
+						fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, ir.NewBool(true)))
+					}
+					return
+				}
+			}
+
+			// Generate remaining checks, if nelem is not a multiple of unroll.
+			if last {
+				// Do last comparison in a different manner.
+				nelem--
+			}
+			// if eq(p[iterateTo+0], q[iterateTo+0]) && eq(p[iterateTo+1], q[iterateTo+1]) && ... {
+			// } else {
+			//    goto neq
+			// }
+			for j := iterateTo; j < nelem; j++ {
+				// if check {} else { goto neq }
+				nif := ir.NewIfStmt(base.Pos, checkIdx(ir.NewInt(j)), nil, nil)
+				nif.Else.Append(ir.NewBranchStmt(base.Pos, ir.OGOTO, neq))
+				fn.Body.Append(nif)
+			}
+			if last {
+				fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, checkIdx(ir.NewInt(nelem))))
 			}
 		}
 
@@ -481,7 +508,6 @@ func geneq(t *types.Type) *obj.LSym {
 		case types.TSTRING:
 			// Do two loops. First, check that all the lengths match (cheap).
 			// Second, check that all the contents match (expensive).
-			// TODO: when the array size is small, unroll the length match checks.
 			checkAll(3, false, func(pi, qi ir.Node) ir.Node {
 				// Compare lengths.
 				eqlen, _ := EqString(pi, qi)
diff --git a/src/cmd/compile/internal/reflectdata/alg_test.go b/src/cmd/compile/internal/reflectdata/alg_test.go
new file mode 100644
index 00000000000..1e57b913fda
--- /dev/null
+++ b/src/cmd/compile/internal/reflectdata/alg_test.go
@@ -0,0 +1,76 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package reflectdata_test
+
+import "testing"
+
+func BenchmarkEqArrayOfStrings5(b *testing.B) {
+	var a [5]string
+	var c [5]string
+
+	for i := 0; i < 5; i++ {
+		a[i] = "aaaa"
+		c[i] = "cccc"
+	}
+
+	for j := 0; j < b.N; j++ {
+		_ = a == c
+	}
+}
+
+func BenchmarkEqArrayOfStrings64(b *testing.B) {
+	var a [64]string
+	var c [64]string
+
+	for i := 0; i < 64; i++ {
+		a[i] = "aaaa"
+		c[i] = "cccc"
+	}
+
+	for j := 0; j < b.N; j++ {
+		_ = a == c
+	}
+}
+
+func BenchmarkEqArrayOfStrings1024(b *testing.B) {
+	var a [1024]string
+	var c [1024]string
+
+	for i := 0; i < 1024; i++ {
+		a[i] = "aaaa"
+		c[i] = "cccc"
+	}
+
+	for j := 0; j < b.N; j++ {
+		_ = a == c
+	}
+}
+
+func BenchmarkEqArrayOfFloats5(b *testing.B) {
+	var a [5]float32
+	var c [5]float32
+
+	for i := 0; i < b.N; i++ {
+		_ = a == c
+	}
+}
+
+func BenchmarkEqArrayOfFloats64(b *testing.B) {
+	var a [64]float32
+	var c [64]float32
+
+	for i := 0; i < b.N; i++ {
+		_ = a == c
+	}
+}
+
+func BenchmarkEqArrayOfFloats1024(b *testing.B) {
+	var a [1024]float32
+	var c [1024]float32
+
+	for i := 0; i < b.N; i++ {
+		_ = a == c
+	}
+}