Symmetric changes to md4.go as for md5.go.

Use uint index variables in some cases instead of int to enable strength reduction; this makes it possible for the compiler to reduce % into masks. Old code: 6g -S md4.go md4block.go | grep "md4block.go:44" 0471 (md4block.go:44) MOVL AX,BX 0472 (md4block.go:44) MOVL AX,BP 0473 (md4block.go:44) MOVL AX,R8 0474 (md4block.go:44) SARL $31,R8 0475 (md4block.go:44) SHRL $30,R8 0476 (md4block.go:44) ADDL R8,BP 0477 (md4block.go:44) SARL $2,BP 0478 (md4block.go:44) IMULL $4,BP 0479 (md4block.go:44) SUBL BP,BX 0480 (md4block.go:44) MOVLQSX BX,BX 0481 (md4block.go:44) LEAQ shift1+0(SB),BP 0482 (md4block.go:44) CMPL BX,8(BP) 0483 (md4block.go:44) JCS ,485 0484 (md4block.go:44) CALL ,runtime.throwindex+0(SB) 0485 (md4block.go:44) MOVQ (BP),BP 0486 (md4block.go:44) MOVL (BP)(BX*4),DI New code: 6g -S md4.go md4block.go | grep "md4block.go:44" 0471 (md4block.go:44) MOVL AX,BX 0472 (md4block.go:44) ANDL $3,BX 0473 (md4block.go:44) MOVLQZX BX,BX 0474 (md4block.go:44) LEAQ shift1+0(SB),BP 0475 (md4block.go:44) CMPL BX,8(BP) 0476 (md4block.go:44) JCS ,478 0477 (md4block.go:44) CALL ,runtime.throwindex+0(SB) 0478 (md4block.go:44) MOVQ (BP),BP 0479 (md4block.go:44) MOVL (BP)(BX*4),DI R=agl, agl1 CC=golang-dev https://golang.org/cl/181086
2024-11-21 23:34:42 -07:00 · 2009-12-28 17:20:33 -08:00 · 2009-12-28 17:20:33 -08:00 · f0fcb2d59f
commit f0fcb2d59f
parent 9d07d37f31
3 changed files with 34 additions and 38 deletions
--- a/src/pkg/crypto/md4/md4.go
+++ b/src/pkg/crypto/md4/md4.go
@ -68,8 +68,8 @@ func (d *digest) Write(p []byte) (nn int, err os.Error) {
 	n := _Block(d, p)
 	p = p[n:]
 	if len(p) > 0 {
-		for i := 0; i < len(p); i++ {
-			d.x[i] = p[i]
+		for i, x := range p {
+			d.x[i] = x
 		}
 		d.nx = len(p)
 	}
@ -100,16 +100,12 @@ func (d *digest) Sum() []byte {

 	p := make([]byte, 16)
 	j := 0
-	for i := 0; i < 4; i++ {
-		s := d.s[i]
-		p[j] = byte(s)
-		j++
-		p[j] = byte(s >> 8)
-		j++
-		p[j] = byte(s >> 16)
-		j++
-		p[j] = byte(s >> 24)
-		j++
+	for _, s := range d.s {
+		p[j+0] = byte(s >> 0)
+		p[j+1] = byte(s >> 8)
+		p[j+2] = byte(s >> 16)
+		p[j+3] = byte(s >> 24)
+		j += 4
 	}
 	return p
 }
--- a/src/pkg/crypto/md4/md4block.go
+++ b/src/pkg/crypto/md4/md4block.go
@ -25,9 +25,10 @@ func _Block(dig *digest, p []byte) int {
 	for len(p) >= _Chunk {
 		aa, bb, cc, dd := a, b, c, d

+		j := 0
 		for i := 0; i < 16; i++ {
-			j := i * 4
 			X[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+			j += 4
 		}

 		// If this needs to be made faster in the future,
@ -37,9 +38,12 @@ func _Block(dig *digest, p []byte) int {
 		// with suitable variable renaming in each
 		// unrolled body, delete the a, b, c, d = d, a, b, c
 		// (or you can let the optimizer do the renaming).
+		//
+		// The index variables are uint so that % by a power
+		// of two can be optimized easily by a compiler.

 		// Round 1.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := i
 			s := shift1[i%4]
 			f := ((c ^ d) & b) ^ d
@ -49,7 +53,7 @@ func _Block(dig *digest, p []byte) int {
 		}

 		// Round 2.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := xIndex2[i]
 			s := shift2[i%4]
 			g := (b & c) | (b & d) | (c & d)
@ -59,7 +63,7 @@ func _Block(dig *digest, p []byte) int {
 		}

 		// Round 3.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := xIndex3[i]
 			s := shift3[i%4]
 			h := b ^ c ^ d
--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
@ -98,9 +98,10 @@ func _Block(dig *digest, p []byte) int {
 	for len(p) >= _Chunk {
 		aa, bb, cc, dd := a, b, c, d

+		j := 0
 		for i := 0; i < 16; i++ {
-			j := i * 4
 			X[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+			j += 4
 		}

 		// If this needs to be made faster in the future,
@ -110,52 +111,47 @@ func _Block(dig *digest, p []byte) int {
 		// with suitable variable renaming in each
 		// unrolled body, delete the a, b, c, d = d, a, b, c
 		// (or you can let the optimizer do the renaming).
+		//
+		// The index variables are uint so that % by a power
+		// of two can be optimized easily by a compiler.

 		// Round 1.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := i
-			t := i
 			s := shift1[i%4]
 			f := ((c ^ d) & b) ^ d
-			a += f + X[x] + table[t]
-			a = a<<s | a>>(32-s)
-			a += b
+			a += f + X[x] + table[i]
+			a = a<<s | a>>(32-s) + b
 			a, b, c, d = d, a, b, c
 		}

 		// Round 2.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := (1 + 5*i) % 16
-			t := 16 + i
 			s := shift2[i%4]
 			g := ((b ^ c) & d) ^ c
-			a += g + X[x] + table[t]
-			a = a<<s | a>>(32-s)
-			a += b
+			a += g + X[x] + table[i+16]
+			a = a<<s | a>>(32-s) + b
 			a, b, c, d = d, a, b, c
 		}

 		// Round 3.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := (5 + 3*i) % 16
-			t := 32 + i
 			s := shift3[i%4]
 			h := b ^ c ^ d
-			a += h + X[x] + table[t]
-			a = a<<s | a>>(32-s)
-			a += b
+			a += h + X[x] + table[i+32]
+			a = a<<s | a>>(32-s) + b
 			a, b, c, d = d, a, b, c
 		}

 		// Round 4.
-		for i := 0; i < 16; i++ {
+		for i := uint(0); i < 16; i++ {
 			x := (7 * i) % 16
 			s := shift4[i%4]
-			t := 48 + i
-			ii := c ^ (b | ^d)
-			a += ii + X[x] + table[t]
-			a = a<<s | a>>(32-s)
-			a += b
+			j := c ^ (b | ^d)
+			a += j + X[x] + table[i+48]
+			a = a<<s | a>>(32-s) + b
 			a, b, c, d = d, a, b, c
 		}