From 15436da232d83674b1e58efd6310ca38310b2dc4 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 22 May 2012 13:53:27 -0400 Subject: [PATCH] crypto/md5: faster inner loop, 3x faster overall The speedup is a combination of unrolling/specializing the actual code and also making the compiler generate better code. Go 1.0.1 (size: 1239 code + 320 data = 1559 total) md5.BenchmarkHash1K 1000000 7178 ns/op 142.64 MB/s md5.BenchmarkHash8K 200000 56834 ns/op 144.14 MB/s Partial unroll (size: 1115 code + 256 data = 1371 total) md5.BenchmarkHash1K 5000000 2513 ns/op 407.37 MB/s md5.BenchmarkHash8K 500000 19406 ns/op 422.13 MB/s Complete unroll (size: 1900 code + 0 data = 1900 code) md5.BenchmarkHash1K 5000000 2442 ns/op 419.18 MB/s md5.BenchmarkHash8K 500000 18957 ns/op 432.13 MB/s Comparing Go 1.0.1 and the complete unroll (this CL): benchmark old MB/s new MB/s speedup md5.BenchmarkHash1K 142.64 419.18 2.94x md5.BenchmarkHash8K 144.14 432.13 3.00x On the same machine, 'openssl speed md5' reports 441 MB/s and 531 MB/s for our two cases, so this CL is at 90% and 80% of those speeds, which is at least in the right ballpark. OpenSSL is using carefully engineered assembly, so we are unlikely to catch up completely. Measurements on a Mid-2010 MacPro5,1. R=golang-dev, bradfitz, agl CC=golang-dev https://golang.org/cl/6220046 --- src/pkg/crypto/md5/gen.go | 301 ++++++++++++++++++++++++++++ src/pkg/crypto/md5/md5_test.go | 25 +++ src/pkg/crypto/md5/md5block.go | 347 ++++++++++++++++++++------------- 3 files changed, 538 insertions(+), 135 deletions(-) create mode 100644 src/pkg/crypto/md5/gen.go diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go new file mode 100644 index 0000000000..ffa43a37c2 --- /dev/null +++ b/src/pkg/crypto/md5/gen.go @@ -0,0 +1,301 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +// This program generates md5block.go +// Invoke as +// +// go run gen.go [-full] |gofmt >md5block.go +// +// The -full flag causes the generated code to do a full +// (16x) unrolling instead of a 4x unrolling. + +package main + +import ( + "flag" + "log" + "os" + "strings" + "text/template" +) + +func main() { + flag.Parse() + + t := template.Must(template.New("main").Funcs(funcs).Parse(program)) + if err := t.Execute(os.Stdout, data); err != nil { + log.Fatal(err) + } +} + +type Data struct { + a, b, c, d string + Shift1 []int + Shift2 []int + Shift3 []int + Shift4 []int + Table1 []uint32 + Table2 []uint32 + Table3 []uint32 + Table4 []uint32 + Full bool +} + +var funcs = template.FuncMap{ + "dup": dup, + "relabel": relabel, + "rotate": rotate, +} + +func dup(count int, x []int) []int { + var out []int + for i := 0; i < count; i++ { + out = append(out, x...) + } + return out +} + +func relabel(s string) string { + return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s) +} + +func rotate() string { + data.a, data.b, data.c, data.d = data.d, data.a, data.b, data.c + return "" // no output +} + +func init() { + flag.BoolVar(&data.Full, "full", false, "complete unrolling") +} + +var data = Data{ + a: "a", + b: "b", + c: "c", + d: "d", + Shift1: []int{7, 12, 17, 22}, + Shift2: []int{5, 9, 14, 20}, + Shift3: []int{4, 11, 16, 23}, + Shift4: []int{6, 10, 15, 21}, + + // table[i] = int((1<<32) * abs(sin(i+1 radians))). + Table1: []uint32{ + // round 1 + 0xd76aa478, + 0xe8c7b756, + 0x242070db, + 0xc1bdceee, + 0xf57c0faf, + 0x4787c62a, + 0xa8304613, + 0xfd469501, + 0x698098d8, + 0x8b44f7af, + 0xffff5bb1, + 0x895cd7be, + 0x6b901122, + 0xfd987193, + 0xa679438e, + 0x49b40821, + }, + Table2: []uint32{ + // round 2 + 0xf61e2562, + 0xc040b340, + 0x265e5a51, + 0xe9b6c7aa, + 0xd62f105d, + 0x2441453, + 0xd8a1e681, + 0xe7d3fbc8, + 0x21e1cde6, + 0xc33707d6, + 0xf4d50d87, + 0x455a14ed, + 0xa9e3e905, + 0xfcefa3f8, + 0x676f02d9, + 0x8d2a4c8a, + }, + Table3: []uint32{ + // round3 + 0xfffa3942, + 0x8771f681, + 0x6d9d6122, + 0xfde5380c, + 0xa4beea44, + 0x4bdecfa9, + 0xf6bb4b60, + 0xbebfbc70, + 0x289b7ec6, + 0xeaa127fa, + 0xd4ef3085, + 0x4881d05, + 0xd9d4d039, + 0xe6db99e5, + 0x1fa27cf8, + 0xc4ac5665, + }, + Table4: []uint32{ + // round 4 + 0xf4292244, + 0x432aff97, + 0xab9423a7, + 0xfc93a039, + 0x655b59c3, + 0x8f0ccc92, + 0xffeff47d, + 0x85845dd1, + 0x6fa87e4f, + 0xfe2ce6e0, + 0xa3014314, + 0x4e0811a1, + 0xf7537e82, + 0xbd3af235, + 0x2ad7d2bb, + 0xeb86d391, + }, +} + +var program = ` +package md5 + +import ( + "unsafe" + "runtime" +) + +{{if not .Full}} + var t1 = [...]uint32{ + {{range .Table1}}{{printf "\t%#x,\n" .}}{{end}} + } + + var t2 = [...]uint32{ + {{range .Table2}}{{printf "\t%#x,\n" .}}{{end}} + } + + var t3 = [...]uint32{ + {{range .Table3}}{{printf "\t%#x,\n" .}}{{end}} + } + + var t4 = [...]uint32{ + {{range .Table4}}{{printf "\t%#x,\n" .}}{{end}} + } +{{end}} + +func _Block(dig *digest, p []byte) int { + a := dig.s[0] + b := dig.s[1] + c := dig.s[2] + d := dig.s[3] + n := 0 + var X *[16]uint32 + var xbuf [16]uint32 + for len(p) >= _Chunk { + aa, bb, cc, dd := a, b, c, d + + // This is a constant condition - it is not evaluated on each iteration. + if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" { + // MD5 was designed so that x86 processors can just iterate + // over the block data directly as uint32s, and we generate + // less code and run 1.3x faster if we take advantage of that. + // My apologies. + X = (*[16]uint32)(unsafe.Pointer(&p[0])) + } else { + X = &xbuf + j := 0 + for i := 0; i < 16; i++ { + X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24 + j += 4 + } + } + + {{if .Full}} + // Round 1. + {{range $i, $s := dup 4 .Shift1}} + {{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + {{rotate}} + {{end}} + + // Round 2. + {{range $i, $s := dup 4 .Shift2}} + {{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + {{rotate}} + {{end}} + + // Round 3. + {{range $i, $s := dup 4 .Shift3}} + {{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + {{rotate}} + {{end}} + + // Round 4. + {{range $i, $s := dup 4 .Shift4}} + {{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + {{rotate}} + {{end}} + {{else}} + // Round 1. + for i := uint(0); i < 16; { + {{range $s := .Shift1}} + {{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + i++ + {{rotate}} + {{end}} + } + + // Round 2. + for i := uint(0); i < 16; { + {{range $s := .Shift2}} + {{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + i++ + {{rotate}} + {{end}} + } + + // Round 3. + for i := uint(0); i < 16; { + {{range $s := .Shift3}} + {{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + i++ + {{rotate}} + {{end}} + } + + // Round 4. + for i := uint(0); i < 16; { + {{range $s := .Shift4}} + {{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}} + {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} + i++ + {{rotate}} + {{end}} + } + {{end}} + + a += aa + b += bb + c += cc + d += dd + + p = p[_Chunk:] + n += _Chunk + } + + dig.s[0] = a + dig.s[1] = b + dig.s[2] = c + dig.s[3] = d + return n +} +` diff --git a/src/pkg/crypto/md5/md5_test.go b/src/pkg/crypto/md5/md5_test.go index aae875464f..b474a90d5a 100644 --- a/src/pkg/crypto/md5/md5_test.go +++ b/src/pkg/crypto/md5/md5_test.go @@ -78,3 +78,28 @@ func ExampleNew() { fmt.Printf("%x", h.Sum(nil)) // Output: e2c569be17396eca2a2e3c11578123ed } + +var bench = md5.New() +var buf = makeBuf() + +func makeBuf() []byte { + b := make([]byte, 8<<10) + for i := range b { + b[i] = byte(i) + } + return b +} + +func BenchmarkHash1K(b *testing.B) { + b.SetBytes(1024) + for i := 0; i < b.N; i++ { + bench.Write(buf[:1024]) + } +} + +func BenchmarkHash8K(b *testing.B) { + b.SetBytes(int64(len(buf))) + for i := 0; i < b.N; i++ { + bench.Write(buf) + } +} diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go index a887e2e05e..51758272c5 100644 --- a/src/pkg/crypto/md5/md5block.go +++ b/src/pkg/crypto/md5/md5block.go @@ -1,92 +1,9 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// MD5 block step. -// In its own file so that a faster assembly or C version -// can be substituted easily. - package md5 -// table[i] = int((1<<32) * abs(sin(i+1 radians))). -var table = []uint32{ - // round 1 - 0xd76aa478, - 0xe8c7b756, - 0x242070db, - 0xc1bdceee, - 0xf57c0faf, - 0x4787c62a, - 0xa8304613, - 0xfd469501, - 0x698098d8, - 0x8b44f7af, - 0xffff5bb1, - 0x895cd7be, - 0x6b901122, - 0xfd987193, - 0xa679438e, - 0x49b40821, - - // round 2 - 0xf61e2562, - 0xc040b340, - 0x265e5a51, - 0xe9b6c7aa, - 0xd62f105d, - 0x2441453, - 0xd8a1e681, - 0xe7d3fbc8, - 0x21e1cde6, - 0xc33707d6, - 0xf4d50d87, - 0x455a14ed, - 0xa9e3e905, - 0xfcefa3f8, - 0x676f02d9, - 0x8d2a4c8a, - - // round3 - 0xfffa3942, - 0x8771f681, - 0x6d9d6122, - 0xfde5380c, - 0xa4beea44, - 0x4bdecfa9, - 0xf6bb4b60, - 0xbebfbc70, - 0x289b7ec6, - 0xeaa127fa, - 0xd4ef3085, - 0x4881d05, - 0xd9d4d039, - 0xe6db99e5, - 0x1fa27cf8, - 0xc4ac5665, - - // round 4 - 0xf4292244, - 0x432aff97, - 0xab9423a7, - 0xfc93a039, - 0x655b59c3, - 0x8f0ccc92, - 0xffeff47d, - 0x85845dd1, - 0x6fa87e4f, - 0xfe2ce6e0, - 0xa3014314, - 0x4e0811a1, - 0xf7537e82, - 0xbd3af235, - 0x2ad7d2bb, - 0xeb86d391, -} - -var shift1 = []uint{7, 12, 17, 22} -var shift2 = []uint{5, 9, 14, 20} -var shift3 = []uint{4, 11, 16, 23} -var shift4 = []uint{6, 10, 15, 21} +import ( + "runtime" + "unsafe" +) func _Block(dig *digest, p []byte) int { a := dig.s[0] @@ -94,66 +11,226 @@ func _Block(dig *digest, p []byte) int { c := dig.s[2] d := dig.s[3] n := 0 - var X [16]uint32 + var X *[16]uint32 + var xbuf [16]uint32 for len(p) >= _Chunk { aa, bb, cc, dd := a, b, c, d - j := 0 - for i := 0; i < 16; i++ { - X[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24 - j += 4 + // This is a constant condition - it is not evaluated on each iteration. + if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" { + // MD5 was designed so that x86 processors can just iterate + // over the block data directly as uint32s, and we generate + // less code and run 1.3x faster if we take advantage of that. + // My apologies. + X = (*[16]uint32)(unsafe.Pointer(&p[0])) + } else { + X = &xbuf + j := 0 + for i := 0; i < 16; i++ { + X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24 + j += 4 + } } - // If this needs to be made faster in the future, - // the usual trick is to unroll each of these - // loops by a factor of 4; that lets you replace - // the shift[] lookups with constants and, - // with suitable variable renaming in each - // unrolled body, delete the a, b, c, d = d, a, b, c - // (or you can let the optimizer do the renaming). - // - // The index variables are uint so that % by a power - // of two can be optimized easily by a compiler. - // Round 1. - for i := uint(0); i < 16; i++ { - x := i - s := shift1[i%4] - f := ((c ^ d) & b) ^ d - a += f + X[x] + table[i] - a = a<>(32-s) + b - a, b, c, d = d, a, b, c - } + + a += (((c ^ d) & b) ^ d) + X[0] + 3614090360 + a = a<<7 | a>>(32-7) + b + + d += (((b ^ c) & a) ^ c) + X[1] + 3905402710 + d = d<<12 | d>>(32-12) + a + + c += (((a ^ b) & d) ^ b) + X[2] + 606105819 + c = c<<17 | c>>(32-17) + d + + b += (((d ^ a) & c) ^ a) + X[3] + 3250441966 + b = b<<22 | b>>(32-22) + c + + a += (((c ^ d) & b) ^ d) + X[4] + 4118548399 + a = a<<7 | a>>(32-7) + b + + d += (((b ^ c) & a) ^ c) + X[5] + 1200080426 + d = d<<12 | d>>(32-12) + a + + c += (((a ^ b) & d) ^ b) + X[6] + 2821735955 + c = c<<17 | c>>(32-17) + d + + b += (((d ^ a) & c) ^ a) + X[7] + 4249261313 + b = b<<22 | b>>(32-22) + c + + a += (((c ^ d) & b) ^ d) + X[8] + 1770035416 + a = a<<7 | a>>(32-7) + b + + d += (((b ^ c) & a) ^ c) + X[9] + 2336552879 + d = d<<12 | d>>(32-12) + a + + c += (((a ^ b) & d) ^ b) + X[10] + 4294925233 + c = c<<17 | c>>(32-17) + d + + b += (((d ^ a) & c) ^ a) + X[11] + 2304563134 + b = b<<22 | b>>(32-22) + c + + a += (((c ^ d) & b) ^ d) + X[12] + 1804603682 + a = a<<7 | a>>(32-7) + b + + d += (((b ^ c) & a) ^ c) + X[13] + 4254626195 + d = d<<12 | d>>(32-12) + a + + c += (((a ^ b) & d) ^ b) + X[14] + 2792965006 + c = c<<17 | c>>(32-17) + d + + b += (((d ^ a) & c) ^ a) + X[15] + 1236535329 + b = b<<22 | b>>(32-22) + c // Round 2. - for i := uint(0); i < 16; i++ { - x := (1 + 5*i) % 16 - s := shift2[i%4] - g := ((b ^ c) & d) ^ c - a += g + X[x] + table[i+16] - a = a<>(32-s) + b - a, b, c, d = d, a, b, c - } + + a += (((b ^ c) & d) ^ c) + X[(1+5*0)&15] + 4129170786 + a = a<<5 | a>>(32-5) + b + + d += (((a ^ b) & c) ^ b) + X[(1+5*1)&15] + 3225465664 + d = d<<9 | d>>(32-9) + a + + c += (((d ^ a) & b) ^ a) + X[(1+5*2)&15] + 643717713 + c = c<<14 | c>>(32-14) + d + + b += (((c ^ d) & a) ^ d) + X[(1+5*3)&15] + 3921069994 + b = b<<20 | b>>(32-20) + c + + a += (((b ^ c) & d) ^ c) + X[(1+5*4)&15] + 3593408605 + a = a<<5 | a>>(32-5) + b + + d += (((a ^ b) & c) ^ b) + X[(1+5*5)&15] + 38016083 + d = d<<9 | d>>(32-9) + a + + c += (((d ^ a) & b) ^ a) + X[(1+5*6)&15] + 3634488961 + c = c<<14 | c>>(32-14) + d + + b += (((c ^ d) & a) ^ d) + X[(1+5*7)&15] + 3889429448 + b = b<<20 | b>>(32-20) + c + + a += (((b ^ c) & d) ^ c) + X[(1+5*8)&15] + 568446438 + a = a<<5 | a>>(32-5) + b + + d += (((a ^ b) & c) ^ b) + X[(1+5*9)&15] + 3275163606 + d = d<<9 | d>>(32-9) + a + + c += (((d ^ a) & b) ^ a) + X[(1+5*10)&15] + 4107603335 + c = c<<14 | c>>(32-14) + d + + b += (((c ^ d) & a) ^ d) + X[(1+5*11)&15] + 1163531501 + b = b<<20 | b>>(32-20) + c + + a += (((b ^ c) & d) ^ c) + X[(1+5*12)&15] + 2850285829 + a = a<<5 | a>>(32-5) + b + + d += (((a ^ b) & c) ^ b) + X[(1+5*13)&15] + 4243563512 + d = d<<9 | d>>(32-9) + a + + c += (((d ^ a) & b) ^ a) + X[(1+5*14)&15] + 1735328473 + c = c<<14 | c>>(32-14) + d + + b += (((c ^ d) & a) ^ d) + X[(1+5*15)&15] + 2368359562 + b = b<<20 | b>>(32-20) + c // Round 3. - for i := uint(0); i < 16; i++ { - x := (5 + 3*i) % 16 - s := shift3[i%4] - h := b ^ c ^ d - a += h + X[x] + table[i+32] - a = a<>(32-s) + b - a, b, c, d = d, a, b, c - } + + a += (b ^ c ^ d) + X[(5+3*0)&15] + 4294588738 + a = a<<4 | a>>(32-4) + b + + d += (a ^ b ^ c) + X[(5+3*1)&15] + 2272392833 + d = d<<11 | d>>(32-11) + a + + c += (d ^ a ^ b) + X[(5+3*2)&15] + 1839030562 + c = c<<16 | c>>(32-16) + d + + b += (c ^ d ^ a) + X[(5+3*3)&15] + 4259657740 + b = b<<23 | b>>(32-23) + c + + a += (b ^ c ^ d) + X[(5+3*4)&15] + 2763975236 + a = a<<4 | a>>(32-4) + b + + d += (a ^ b ^ c) + X[(5+3*5)&15] + 1272893353 + d = d<<11 | d>>(32-11) + a + + c += (d ^ a ^ b) + X[(5+3*6)&15] + 4139469664 + c = c<<16 | c>>(32-16) + d + + b += (c ^ d ^ a) + X[(5+3*7)&15] + 3200236656 + b = b<<23 | b>>(32-23) + c + + a += (b ^ c ^ d) + X[(5+3*8)&15] + 681279174 + a = a<<4 | a>>(32-4) + b + + d += (a ^ b ^ c) + X[(5+3*9)&15] + 3936430074 + d = d<<11 | d>>(32-11) + a + + c += (d ^ a ^ b) + X[(5+3*10)&15] + 3572445317 + c = c<<16 | c>>(32-16) + d + + b += (c ^ d ^ a) + X[(5+3*11)&15] + 76029189 + b = b<<23 | b>>(32-23) + c + + a += (b ^ c ^ d) + X[(5+3*12)&15] + 3654602809 + a = a<<4 | a>>(32-4) + b + + d += (a ^ b ^ c) + X[(5+3*13)&15] + 3873151461 + d = d<<11 | d>>(32-11) + a + + c += (d ^ a ^ b) + X[(5+3*14)&15] + 530742520 + c = c<<16 | c>>(32-16) + d + + b += (c ^ d ^ a) + X[(5+3*15)&15] + 3299628645 + b = b<<23 | b>>(32-23) + c // Round 4. - for i := uint(0); i < 16; i++ { - x := (7 * i) % 16 - s := shift4[i%4] - j := c ^ (b | ^d) - a += j + X[x] + table[i+48] - a = a<>(32-s) + b - a, b, c, d = d, a, b, c - } + + a += (c ^ (b | ^d)) + X[(7*0)&15] + 4096336452 + a = a<<6 | a>>(32-6) + b + + d += (b ^ (a | ^c)) + X[(7*1)&15] + 1126891415 + d = d<<10 | d>>(32-10) + a + + c += (a ^ (d | ^b)) + X[(7*2)&15] + 2878612391 + c = c<<15 | c>>(32-15) + d + + b += (d ^ (c | ^a)) + X[(7*3)&15] + 4237533241 + b = b<<21 | b>>(32-21) + c + + a += (c ^ (b | ^d)) + X[(7*4)&15] + 1700485571 + a = a<<6 | a>>(32-6) + b + + d += (b ^ (a | ^c)) + X[(7*5)&15] + 2399980690 + d = d<<10 | d>>(32-10) + a + + c += (a ^ (d | ^b)) + X[(7*6)&15] + 4293915773 + c = c<<15 | c>>(32-15) + d + + b += (d ^ (c | ^a)) + X[(7*7)&15] + 2240044497 + b = b<<21 | b>>(32-21) + c + + a += (c ^ (b | ^d)) + X[(7*8)&15] + 1873313359 + a = a<<6 | a>>(32-6) + b + + d += (b ^ (a | ^c)) + X[(7*9)&15] + 4264355552 + d = d<<10 | d>>(32-10) + a + + c += (a ^ (d | ^b)) + X[(7*10)&15] + 2734768916 + c = c<<15 | c>>(32-15) + d + + b += (d ^ (c | ^a)) + X[(7*11)&15] + 1309151649 + b = b<<21 | b>>(32-21) + c + + a += (c ^ (b | ^d)) + X[(7*12)&15] + 4149444226 + a = a<<6 | a>>(32-6) + b + + d += (b ^ (a | ^c)) + X[(7*13)&15] + 3174756917 + d = d<<10 | d>>(32-10) + a + + c += (a ^ (d | ^b)) + X[(7*14)&15] + 718787259 + c = c<<15 | c>>(32-15) + d + + b += (d ^ (c | ^a)) + X[(7*15)&15] + 3951481745 + b = b<<21 | b>>(32-21) + c a += aa b += bb