1
0
mirror of https://github.com/golang/go synced 2024-11-24 12:00:14 -07:00
go/test/codegen/mathbits.go

348 lines
6.5 KiB
Go
Raw Normal View History

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
import "math/bits"
// ----------------------- //
// bits.LeadingZeros //
// ----------------------- //
func LeadingZeros(n uint) int {
// amd64:"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.LeadingZeros(n)
}
func LeadingZeros64(n uint64) int {
// amd64:"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.LeadingZeros64(n)
}
func LeadingZeros32(n uint32) int {
cmd/compile: optimize LeadingZeros(16|32) on amd64 Introduce Len8 and Len16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. Also use and optimize the Len32 lowering, along the same lines. Leave Len8 unused for the moment; a subsequent CL will enable it. For 16 and 32 bits, this leads to a speed-up. name old time/op new time/op delta LeadingZeros16-8 1.42ns ± 5% 1.23ns ± 5% -13.42% (p=0.000 n=20+20) LeadingZeros32-8 1.25ns ± 5% 1.03ns ± 5% -17.63% (p=0.000 n=20+16) Code: func f16(x uint16) { z = bits.LeadingZeros16(x) } func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f16 STEXT nosplit size=38 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BSRQ AX, AX 0x000c 00012 (x.go:8) MOVQ $-1, CX 0x0013 00019 (x.go:8) CMOVQEQ CX, AX 0x0017 00023 (x.go:8) ADDQ $-15, AX 0x001b 00027 (x.go:8) NEGQ AX 0x001e 00030 (x.go:8) MOVQ AX, "".z(SB) 0x0025 00037 (x.go:8) RET "".f32 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) BSRQ AX, AX 0x0008 00008 (x.go:9) MOVQ $-1, CX 0x000f 00015 (x.go:9) CMOVQEQ CX, AX 0x0013 00019 (x.go:9) ADDQ $-31, AX 0x0017 00023 (x.go:9) NEGQ AX 0x001a 00026 (x.go:9) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:9) RET After: "".f16 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:8) BSRL AX, AX 0x000f 00015 (x.go:8) ADDQ $-16, AX 0x0013 00019 (x.go:8) NEGQ AX 0x0016 00022 (x.go:8) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:8) RET "".f32 STEXT nosplit size=28 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) LEAQ 1(AX)(AX*1), AX 0x0009 00009 (x.go:9) BSRQ AX, AX 0x000d 00013 (x.go:9) ADDQ $-32, AX 0x0011 00017 (x.go:9) NEGQ AX 0x0014 00020 (x.go:9) MOVQ AX, "".z(SB) 0x001b 00027 (x.go:9) RET Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b Reviewed-on: https://go-review.googlesource.com/108941 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:54:45 -06:00
// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.LeadingZeros32(n)
}
func LeadingZeros16(n uint16) int {
cmd/compile: optimize LeadingZeros(16|32) on amd64 Introduce Len8 and Len16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. Also use and optimize the Len32 lowering, along the same lines. Leave Len8 unused for the moment; a subsequent CL will enable it. For 16 and 32 bits, this leads to a speed-up. name old time/op new time/op delta LeadingZeros16-8 1.42ns ± 5% 1.23ns ± 5% -13.42% (p=0.000 n=20+20) LeadingZeros32-8 1.25ns ± 5% 1.03ns ± 5% -17.63% (p=0.000 n=20+16) Code: func f16(x uint16) { z = bits.LeadingZeros16(x) } func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f16 STEXT nosplit size=38 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BSRQ AX, AX 0x000c 00012 (x.go:8) MOVQ $-1, CX 0x0013 00019 (x.go:8) CMOVQEQ CX, AX 0x0017 00023 (x.go:8) ADDQ $-15, AX 0x001b 00027 (x.go:8) NEGQ AX 0x001e 00030 (x.go:8) MOVQ AX, "".z(SB) 0x0025 00037 (x.go:8) RET "".f32 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) BSRQ AX, AX 0x0008 00008 (x.go:9) MOVQ $-1, CX 0x000f 00015 (x.go:9) CMOVQEQ CX, AX 0x0013 00019 (x.go:9) ADDQ $-31, AX 0x0017 00023 (x.go:9) NEGQ AX 0x001a 00026 (x.go:9) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:9) RET After: "".f16 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:8) BSRL AX, AX 0x000f 00015 (x.go:8) ADDQ $-16, AX 0x0013 00019 (x.go:8) NEGQ AX 0x0016 00022 (x.go:8) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:8) RET "".f32 STEXT nosplit size=28 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) LEAQ 1(AX)(AX*1), AX 0x0009 00009 (x.go:9) BSRQ AX, AX 0x000d 00013 (x.go:9) ADDQ $-32, AX 0x0011 00017 (x.go:9) NEGQ AX 0x0014 00020 (x.go:9) MOVQ AX, "".z(SB) 0x001b 00027 (x.go:9) RET Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b Reviewed-on: https://go-review.googlesource.com/108941 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:54:45 -06:00
// amd64:"BSRL","LEAL",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.LeadingZeros16(n)
}
func LeadingZeros8(n uint8) int {
cmd/compile: use intrinsic for LeadingZeros8 on amd64 The previous change sped up the pure computation form of LeadingZeros8. This places it somewhat close to the table lookup form. Depending on something that varies from toolchain to toolchain (alignment, perhaps?), the slowdown from ditching the table lookup is either 20% or 5%. This benchmark is the best case scenario for the table lookup: It is in the L1 cache already. I think we're close enough that we can switch to the computational version, and trust that the memory effects and binary size savings will be worth it. Code: func f8(x uint8) { z = bits.LeadingZeros8(x) } Before: "".f8 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) LEAQ math/bits.len8tab(SB), CX 0x000f 00015 (x.go:7) MOVBLZX (CX)(AX*1), AX 0x0013 00019 (x.go:7) ADDQ $-8, AX 0x0017 00023 (x.go:7) NEGQ AX 0x001a 00026 (x.go:7) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:7) RET After: "".f8 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:7) BSRL AX, AX 0x000f 00015 (x.go:7) ADDQ $-8, AX 0x0013 00019 (x.go:7) NEGQ AX 0x0016 00022 (x.go:7) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:7) RET Change-Id: Icc7db50a7820fb9a3da8a816d6b6940d7f8e193e Reviewed-on: https://go-review.googlesource.com/108942 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 16:38:50 -06:00
// amd64:"BSRL","LEAL",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.LeadingZeros8(n)
}
// --------------- //
// bits.Len* //
// --------------- //
func Len(n uint) int {
// amd64:"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.Len(n)
}
func Len64(n uint64) int {
// amd64:"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.Len64(n)
}
func Len32(n uint32) int {
cmd/compile: optimize LeadingZeros(16|32) on amd64 Introduce Len8 and Len16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. Also use and optimize the Len32 lowering, along the same lines. Leave Len8 unused for the moment; a subsequent CL will enable it. For 16 and 32 bits, this leads to a speed-up. name old time/op new time/op delta LeadingZeros16-8 1.42ns ± 5% 1.23ns ± 5% -13.42% (p=0.000 n=20+20) LeadingZeros32-8 1.25ns ± 5% 1.03ns ± 5% -17.63% (p=0.000 n=20+16) Code: func f16(x uint16) { z = bits.LeadingZeros16(x) } func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f16 STEXT nosplit size=38 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BSRQ AX, AX 0x000c 00012 (x.go:8) MOVQ $-1, CX 0x0013 00019 (x.go:8) CMOVQEQ CX, AX 0x0017 00023 (x.go:8) ADDQ $-15, AX 0x001b 00027 (x.go:8) NEGQ AX 0x001e 00030 (x.go:8) MOVQ AX, "".z(SB) 0x0025 00037 (x.go:8) RET "".f32 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) BSRQ AX, AX 0x0008 00008 (x.go:9) MOVQ $-1, CX 0x000f 00015 (x.go:9) CMOVQEQ CX, AX 0x0013 00019 (x.go:9) ADDQ $-31, AX 0x0017 00023 (x.go:9) NEGQ AX 0x001a 00026 (x.go:9) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:9) RET After: "".f16 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:8) BSRL AX, AX 0x000f 00015 (x.go:8) ADDQ $-16, AX 0x0013 00019 (x.go:8) NEGQ AX 0x0016 00022 (x.go:8) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:8) RET "".f32 STEXT nosplit size=28 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) LEAQ 1(AX)(AX*1), AX 0x0009 00009 (x.go:9) BSRQ AX, AX 0x000d 00013 (x.go:9) ADDQ $-32, AX 0x0011 00017 (x.go:9) NEGQ AX 0x0014 00020 (x.go:9) MOVQ AX, "".z(SB) 0x001b 00027 (x.go:9) RET Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b Reviewed-on: https://go-review.googlesource.com/108941 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:54:45 -06:00
// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.Len32(n)
}
func Len16(n uint16) int {
cmd/compile: optimize LeadingZeros(16|32) on amd64 Introduce Len8 and Len16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. Also use and optimize the Len32 lowering, along the same lines. Leave Len8 unused for the moment; a subsequent CL will enable it. For 16 and 32 bits, this leads to a speed-up. name old time/op new time/op delta LeadingZeros16-8 1.42ns ± 5% 1.23ns ± 5% -13.42% (p=0.000 n=20+20) LeadingZeros32-8 1.25ns ± 5% 1.03ns ± 5% -17.63% (p=0.000 n=20+16) Code: func f16(x uint16) { z = bits.LeadingZeros16(x) } func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f16 STEXT nosplit size=38 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BSRQ AX, AX 0x000c 00012 (x.go:8) MOVQ $-1, CX 0x0013 00019 (x.go:8) CMOVQEQ CX, AX 0x0017 00023 (x.go:8) ADDQ $-15, AX 0x001b 00027 (x.go:8) NEGQ AX 0x001e 00030 (x.go:8) MOVQ AX, "".z(SB) 0x0025 00037 (x.go:8) RET "".f32 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) BSRQ AX, AX 0x0008 00008 (x.go:9) MOVQ $-1, CX 0x000f 00015 (x.go:9) CMOVQEQ CX, AX 0x0013 00019 (x.go:9) ADDQ $-31, AX 0x0017 00023 (x.go:9) NEGQ AX 0x001a 00026 (x.go:9) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:9) RET After: "".f16 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:8) BSRL AX, AX 0x000f 00015 (x.go:8) ADDQ $-16, AX 0x0013 00019 (x.go:8) NEGQ AX 0x0016 00022 (x.go:8) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:8) RET "".f32 STEXT nosplit size=28 args=0x8 locals=0x0 0x0000 00000 (x.go:9) TEXT "".f32(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:9) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:9) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:9) MOVL "".x+8(SP), AX 0x0004 00004 (x.go:9) LEAQ 1(AX)(AX*1), AX 0x0009 00009 (x.go:9) BSRQ AX, AX 0x000d 00013 (x.go:9) ADDQ $-32, AX 0x0011 00017 (x.go:9) NEGQ AX 0x0014 00020 (x.go:9) MOVQ AX, "".z(SB) 0x001b 00027 (x.go:9) RET Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b Reviewed-on: https://go-review.googlesource.com/108941 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:54:45 -06:00
// amd64:"BSRL","LEAL",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.Len16(n)
}
func Len8(n uint8) int {
cmd/compile: use intrinsic for LeadingZeros8 on amd64 The previous change sped up the pure computation form of LeadingZeros8. This places it somewhat close to the table lookup form. Depending on something that varies from toolchain to toolchain (alignment, perhaps?), the slowdown from ditching the table lookup is either 20% or 5%. This benchmark is the best case scenario for the table lookup: It is in the L1 cache already. I think we're close enough that we can switch to the computational version, and trust that the memory effects and binary size savings will be worth it. Code: func f8(x uint8) { z = bits.LeadingZeros8(x) } Before: "".f8 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) LEAQ math/bits.len8tab(SB), CX 0x000f 00015 (x.go:7) MOVBLZX (CX)(AX*1), AX 0x0013 00019 (x.go:7) ADDQ $-8, AX 0x0017 00023 (x.go:7) NEGQ AX 0x001a 00026 (x.go:7) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:7) RET After: "".f8 STEXT nosplit size=30 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) LEAL 1(AX)(AX*1), AX 0x000c 00012 (x.go:7) BSRL AX, AX 0x000f 00015 (x.go:7) ADDQ $-8, AX 0x0013 00019 (x.go:7) NEGQ AX 0x0016 00022 (x.go:7) MOVQ AX, "".z(SB) 0x001d 00029 (x.go:7) RET Change-Id: Icc7db50a7820fb9a3da8a816d6b6940d7f8e193e Reviewed-on: https://go-review.googlesource.com/108942 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 16:38:50 -06:00
// amd64:"BSRL","LEAL",-"CMOVQEQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
return bits.Len8(n)
}
// -------------------- //
// bits.OnesCount //
// -------------------- //
func OnesCount(n uint) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
// s390x:"POPCNT"
// ppc64:"POPCNTD"
// ppc64le:"POPCNTD"
return bits.OnesCount(n)
}
func OnesCount64(n uint64) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
// s390x:"POPCNT"
// ppc64:"POPCNTD"
// ppc64le:"POPCNTD"
return bits.OnesCount64(n)
}
func OnesCount32(n uint32) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
// s390x:"POPCNT"
// ppc64:"POPCNTW"
// ppc64le:"POPCNTW"
return bits.OnesCount32(n)
}
func OnesCount16(n uint16) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
// s390x:"POPCNT"
// ppc64:"POPCNTW"
// ppc64le:"POPCNTW"
return bits.OnesCount16(n)
}
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
func OnesCount8(n uint8) int {
// s390x:"POPCNT"
// ppc64:"POPCNTB"
// ppc64le:"POPCNTB"
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
return bits.OnesCount8(n)
}
// ----------------------- //
// bits.ReverseBytes //
// ----------------------- //
func ReverseBytes(n uint) uint {
// amd64:"BSWAPQ"
// s390x:"MOVDBR"
// arm64:"REV"
return bits.ReverseBytes(n)
}
func ReverseBytes64(n uint64) uint64 {
// amd64:"BSWAPQ"
// s390x:"MOVDBR"
// arm64:"REV"
return bits.ReverseBytes64(n)
}
func ReverseBytes32(n uint32) uint32 {
// amd64:"BSWAPL"
// s390x:"MOVWBR"
// arm64:"REVW"
return bits.ReverseBytes32(n)
}
func ReverseBytes16(n uint16) uint16 {
// amd64:"ROLW"
return bits.ReverseBytes16(n)
}
// --------------------- //
// bits.RotateLeft //
// --------------------- //
func RotateLeft64(n uint64) uint64 {
// amd64:"ROLQ"
// arm64:"ROR"
// ppc64:"ROTL"
// ppc64le:"ROTL"
// s390x:"RLLG"
return bits.RotateLeft64(n, 37)
}
func RotateLeft32(n uint32) uint32 {
// amd64:"ROLL" 386:"ROLL"
// arm64:"RORW"
// ppc64:"ROTLW"
// ppc64le:"ROTLW"
// s390x:"RLL"
return bits.RotateLeft32(n, 9)
}
func RotateLeft16(n uint16) uint16 {
// amd64:"ROLW" 386:"ROLW"
return bits.RotateLeft16(n, 5)
}
func RotateLeft8(n uint8) uint8 {
// amd64:"ROLB" 386:"ROLB"
return bits.RotateLeft8(n, 5)
}
func RotateLeftVariable(n uint, m int) uint {
// amd64:"ROLQ"
// arm64:"ROR"
// ppc64:"ROTL"
// ppc64le:"ROTL"
// s390x:"RLLG"
return bits.RotateLeft(n, m)
}
func RotateLeftVariable64(n uint64, m int) uint64 {
// amd64:"ROLQ"
// arm64:"ROR"
// ppc64:"ROTL"
// ppc64le:"ROTL"
// s390x:"RLLG"
return bits.RotateLeft64(n, m)
}
func RotateLeftVariable32(n uint32, m int) uint32 {
// amd64:"ROLL"
// arm64:"RORW"
// ppc64:"ROTLW"
// ppc64le:"ROTLW"
// s390x:"RLL"
return bits.RotateLeft32(n, m)
}
// ------------------------ //
// bits.TrailingZeros //
// ------------------------ //
func TrailingZeros(n uint) int {
// amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
// s390x:"FLOGR"
// ppc64:"ANDN","POPCNTD"
// ppc64le:"ANDN","POPCNTD"
return bits.TrailingZeros(n)
}
func TrailingZeros64(n uint64) int {
// amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
// s390x:"FLOGR"
// ppc64:"ANDN","POPCNTD"
// ppc64le:"ANDN","POPCNTD"
return bits.TrailingZeros64(n)
}
func TrailingZeros32(n uint32) int {
cmd/compile: add patterns for bit set/clear/complement on amd64 This patch completes implementation of BT(Q|L), and adds support for BT(S|R|C)(Q|L). Example of code changes from time.(*Time).addSec: if t.wall&hasMonotonic != 0 { 0x1073465 488b08 MOVQ 0(AX), CX 0x1073468 4889ca MOVQ CX, DX 0x107346b 48c1e93f SHRQ $0x3f, CX 0x107346f 48c1e13f SHLQ $0x3f, CX 0x1073473 48f7c1ffffffff TESTQ $-0x1, CX 0x107347a 746b JE 0x10734e7 if t.wall&hasMonotonic != 0 { 0x1073435 488b08 MOVQ 0(AX), CX 0x1073438 480fbae13f BTQ $0x3f, CX 0x107343d 7363 JAE 0x10734a2 Another example: t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic 0x10734c8 4881e1ffffff3f ANDQ $0x3fffffff, CX 0x10734cf 48c1e61e SHLQ $0x1e, SI 0x10734d3 4809ce ORQ CX, SI 0x10734d6 48b90000000000000080 MOVQ $0x8000000000000000, CX 0x10734e0 4809f1 ORQ SI, CX 0x10734e3 488908 MOVQ CX, 0(AX) t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic 0x107348b 4881e2ffffff3f ANDQ $0x3fffffff, DX 0x1073492 48c1e61e SHLQ $0x1e, SI 0x1073496 4809f2 ORQ SI, DX 0x1073499 480fbaea3f BTSQ $0x3f, DX 0x107349e 488910 MOVQ DX, 0(AX) Go1 benchmarks seem unaffected, and I would be surprised otherwise: name old time/op new time/op delta BinaryTree17-4 2.64s ± 4% 2.56s ± 9% -2.92% (p=0.008 n=9+9) Fannkuch11-4 2.90s ± 1% 2.95s ± 3% +1.76% (p=0.010 n=10+9) FmtFprintfEmpty-4 35.3ns ± 1% 34.5ns ± 2% -2.34% (p=0.004 n=9+8) FmtFprintfString-4 57.0ns ± 1% 58.4ns ± 5% +2.52% (p=0.029 n=9+10) FmtFprintfInt-4 59.8ns ± 3% 59.8ns ± 6% ~ (p=0.565 n=10+10) FmtFprintfIntInt-4 93.9ns ± 3% 91.2ns ± 5% -2.94% (p=0.014 n=10+9) FmtFprintfPrefixedInt-4 107ns ± 6% 104ns ± 6% ~ (p=0.099 n=10+10) FmtFprintfFloat-4 187ns ± 3% 188ns ± 3% ~ (p=0.505 n=10+9) FmtManyArgs-4 410ns ± 1% 415ns ± 6% ~ (p=0.649 n=8+10) GobDecode-4 5.30ms ± 3% 5.27ms ± 3% ~ (p=0.436 n=10+10) GobEncode-4 4.62ms ± 5% 4.47ms ± 2% -3.24% (p=0.001 n=9+10) Gzip-4 197ms ± 4% 193ms ± 3% ~ (p=0.123 n=10+10) Gunzip-4 30.4ms ± 3% 30.1ms ± 3% ~ (p=0.481 n=10+10) HTTPClientServer-4 76.3µs ± 1% 76.0µs ± 1% ~ (p=0.236 n=8+9) JSONEncode-4 10.5ms ± 9% 10.3ms ± 3% ~ (p=0.280 n=10+10) JSONDecode-4 42.3ms ±10% 41.3ms ± 2% ~ (p=0.053 n=9+10) Mandelbrot200-4 3.80ms ± 2% 3.72ms ± 2% -2.15% (p=0.001 n=9+10) GoParse-4 2.88ms ±10% 2.81ms ± 2% ~ (p=0.247 n=10+10) RegexpMatchEasy0_32-4 69.5ns ± 4% 68.6ns ± 2% ~ (p=0.171 n=10+10) RegexpMatchEasy0_1K-4 165ns ± 3% 162ns ± 3% ~ (p=0.137 n=10+10) RegexpMatchEasy1_32-4 65.7ns ± 6% 64.4ns ± 2% -2.02% (p=0.037 n=10+10) RegexpMatchEasy1_1K-4 278ns ± 2% 279ns ± 3% ~ (p=0.991 n=8+9) RegexpMatchMedium_32-4 99.3ns ± 3% 98.5ns ± 4% ~ (p=0.457 n=10+9) RegexpMatchMedium_1K-4 30.1µs ± 1% 30.4µs ± 2% ~ (p=0.173 n=8+10) RegexpMatchHard_32-4 1.40µs ± 2% 1.41µs ± 4% ~ (p=0.565 n=10+10) RegexpMatchHard_1K-4 42.5µs ± 1% 41.5µs ± 3% -2.13% (p=0.002 n=8+9) Revcomp-4 332ms ± 4% 328ms ± 5% ~ (p=0.720 n=9+10) Template-4 48.3ms ± 2% 49.6ms ± 3% +2.56% (p=0.002 n=8+10) TimeParse-4 252ns ± 2% 249ns ± 3% ~ (p=0.116 n=9+10) TimeFormat-4 262ns ± 4% 252ns ± 3% -4.01% (p=0.000 n=9+10) name old speed new speed delta GobDecode-4 145MB/s ± 3% 146MB/s ± 3% ~ (p=0.436 n=10+10) GobEncode-4 166MB/s ± 5% 172MB/s ± 2% +3.28% (p=0.001 n=9+10) Gzip-4 98.6MB/s ± 4% 100.4MB/s ± 3% ~ (p=0.123 n=10+10) Gunzip-4 639MB/s ± 3% 645MB/s ± 3% ~ (p=0.481 n=10+10) JSONEncode-4 185MB/s ± 8% 189MB/s ± 3% ~ (p=0.280 n=10+10) JSONDecode-4 46.0MB/s ± 9% 47.0MB/s ± 2% +2.21% (p=0.046 n=9+10) GoParse-4 20.1MB/s ± 9% 20.6MB/s ± 2% ~ (p=0.239 n=10+10) RegexpMatchEasy0_32-4 460MB/s ± 4% 467MB/s ± 2% ~ (p=0.165 n=10+10) RegexpMatchEasy0_1K-4 6.19GB/s ± 3% 6.28GB/s ± 3% ~ (p=0.165 n=10+10) RegexpMatchEasy1_32-4 487MB/s ± 5% 497MB/s ± 2% +2.00% (p=0.043 n=10+10) RegexpMatchEasy1_1K-4 3.67GB/s ± 2% 3.67GB/s ± 3% ~ (p=0.963 n=8+9) RegexpMatchMedium_32-4 10.1MB/s ± 3% 10.1MB/s ± 4% ~ (p=0.435 n=10+9) RegexpMatchMedium_1K-4 34.0MB/s ± 1% 33.7MB/s ± 2% ~ (p=0.173 n=8+10) RegexpMatchHard_32-4 22.9MB/s ± 2% 22.7MB/s ± 4% ~ (p=0.565 n=10+10) RegexpMatchHard_1K-4 24.0MB/s ± 3% 24.7MB/s ± 3% +2.64% (p=0.001 n=9+9) Revcomp-4 766MB/s ± 4% 775MB/s ± 5% ~ (p=0.720 n=9+10) Template-4 40.2MB/s ± 2% 39.2MB/s ± 3% -2.47% (p=0.002 n=8+10) The rules match ~1800 times during all.bash. Fixes #18943 Change-Id: I64be1ada34e89c486dfd935bf429b35652117ed4 Reviewed-on: https://go-review.googlesource.com/94766 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-02-17 05:54:03 -07:00
// amd64:"BTSQ\\t\\$32","BSFQ"
// s390x:"FLOGR","MOVWZ"
// ppc64:"ANDN","POPCNTW"
// ppc64le:"ANDN","POPCNTW"
return bits.TrailingZeros32(n)
}
func TrailingZeros16(n uint16) int {
cmd/compile: optimize TrailingZeros(8|16) on amd64 Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. name old time/op new time/op delta TrailingZeros8-8 1.33ns ± 6% 0.84ns ± 3% -36.90% (p=0.000 n=20+20) TrailingZeros16-8 1.26ns ± 5% 0.84ns ± 5% -33.50% (p=0.000 n=20+18) Code: func f8(x uint8) { z = bits.TrailingZeros8(x) } func f16(x uint16) { z = bits.TrailingZeros16(x) } Before: "".f8 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) BTSQ $8, AX 0x000d 00013 (x.go:7) BSFQ AX, AX 0x0011 00017 (x.go:7) MOVL $64, CX 0x0016 00022 (x.go:7) CMOVQEQ CX, AX 0x001a 00026 (x.go:7) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:7) RET "".f16 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BTSQ $16, AX 0x000d 00013 (x.go:8) BSFQ AX, AX 0x0011 00017 (x.go:8) MOVL $64, CX 0x0016 00022 (x.go:8) CMOVQEQ CX, AX 0x001a 00026 (x.go:8) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:8) RET After: "".f8 STEXT nosplit size=20 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) BTSL $8, AX 0x0009 00009 (x.go:7) BSFL AX, AX 0x000c 00012 (x.go:7) MOVQ AX, "".z(SB) 0x0013 00019 (x.go:7) RET "".f16 STEXT nosplit size=20 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) BTSL $16, AX 0x0009 00009 (x.go:8) BSFL AX, AX 0x000c 00012 (x.go:8) MOVQ AX, "".z(SB) 0x0013 00019 (x.go:8) RET Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11 Reviewed-on: https://go-review.googlesource.com/108940 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:46:41 -06:00
// amd64:"BSFL","BTSL\\t\\$16"
// s390x:"FLOGR","OR\t\\$65536"
// ppc64:"POPCNTD","OR\\t\\$65536"
// ppc64le:"POPCNTD","OR\\t\\$65536"
return bits.TrailingZeros16(n)
}
func TrailingZeros8(n uint8) int {
cmd/compile: optimize TrailingZeros(8|16) on amd64 Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them. amd64 only for this CL, although it wouldn't surprise me if other architectures also admit of optimized lowerings. name old time/op new time/op delta TrailingZeros8-8 1.33ns ± 6% 0.84ns ± 3% -36.90% (p=0.000 n=20+20) TrailingZeros16-8 1.26ns ± 5% 0.84ns ± 5% -33.50% (p=0.000 n=20+18) Code: func f8(x uint8) { z = bits.TrailingZeros8(x) } func f16(x uint16) { z = bits.TrailingZeros16(x) } Before: "".f8 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) MOVBLZX AL, AX 0x0008 00008 (x.go:7) BTSQ $8, AX 0x000d 00013 (x.go:7) BSFQ AX, AX 0x0011 00017 (x.go:7) MOVL $64, CX 0x0016 00022 (x.go:7) CMOVQEQ CX, AX 0x001a 00026 (x.go:7) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:7) RET "".f16 STEXT nosplit size=34 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) MOVWLZX AX, AX 0x0008 00008 (x.go:8) BTSQ $16, AX 0x000d 00013 (x.go:8) BSFQ AX, AX 0x0011 00017 (x.go:8) MOVL $64, CX 0x0016 00022 (x.go:8) CMOVQEQ CX, AX 0x001a 00026 (x.go:8) MOVQ AX, "".z(SB) 0x0021 00033 (x.go:8) RET After: "".f8 STEXT nosplit size=20 args=0x8 locals=0x0 0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX 0x0005 00005 (x.go:7) BTSL $8, AX 0x0009 00009 (x.go:7) BSFL AX, AX 0x000c 00012 (x.go:7) MOVQ AX, "".z(SB) 0x0013 00019 (x.go:7) RET "".f16 STEXT nosplit size=20 args=0x8 locals=0x0 0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8 0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB) 0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX 0x0005 00005 (x.go:8) BTSL $16, AX 0x0009 00009 (x.go:8) BSFL AX, AX 0x000c 00012 (x.go:8) MOVQ AX, "".z(SB) 0x0013 00019 (x.go:8) RET Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11 Reviewed-on: https://go-review.googlesource.com/108940 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Giovanni Bajo <rasky@develer.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:46:41 -06:00
// amd64:"BSFL","BTSL\\t\\$8"
// s390x:"FLOGR","OR\t\\$256"
return bits.TrailingZeros8(n)
}
// IterateBitsNN checks special handling of TrailingZerosNN when the input is known to be non-zero.
func IterateBits(n uint) int {
i := 0
for n != 0 {
// amd64:"BSFQ",-"CMOVEQ"
i += bits.TrailingZeros(n)
n &= n - 1
}
return i
}
func IterateBits64(n uint64) int {
i := 0
for n != 0 {
// amd64:"BSFQ",-"CMOVEQ"
i += bits.TrailingZeros64(n)
n &= n - 1
}
return i
}
func IterateBits32(n uint32) int {
i := 0
for n != 0 {
// amd64:"BSFL",-"BTSQ"
i += bits.TrailingZeros32(n)
n &= n - 1
}
return i
}
func IterateBits16(n uint16) int {
i := 0
for n != 0 {
// amd64:"BSFL",-"BTSL"
i += bits.TrailingZeros16(n)
n &= n - 1
}
return i
}
func IterateBits8(n uint8) int {
i := 0
for n != 0 {
// amd64:"BSFL",-"BTSL"
i += bits.TrailingZeros8(n)
n &= n - 1
}
return i
}
cmd/compile: intrinsify math/bits.Mul Add SSA rules to intrinsify Mul/Mul64 (AMD64 and ARM64). SSA rules for other functions and architectures are left as a future optimization. Benchmark results on AMD64/ARM64 before and after SSA implementation are below. amd64 name old time/op new time/op delta Add-4 1.78ns ± 0% 1.85ns ±12% ~ (p=0.397 n=4+5) Add32-4 1.71ns ± 1% 1.70ns ± 0% ~ (p=0.683 n=5+5) Add64-4 1.80ns ± 2% 1.77ns ± 0% -1.22% (p=0.048 n=5+5) Sub-4 1.78ns ± 0% 1.78ns ± 0% ~ (all equal) Sub32-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=1.000 n=5+5) Sub64-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=0.968 n=5+4) Mul-4 11.5ns ± 1% 1.8ns ± 2% -84.39% (p=0.008 n=5+5) Mul32-4 1.39ns ± 0% 1.38ns ± 3% ~ (p=0.175 n=5+5) Mul64-4 6.85ns ± 1% 1.78ns ± 1% -73.97% (p=0.008 n=5+5) Div-4 57.1ns ± 1% 56.7ns ± 0% ~ (p=0.087 n=5+5) Div32-4 18.0ns ± 0% 18.0ns ± 0% ~ (all equal) Div64-4 56.4ns ±10% 53.6ns ± 1% ~ (p=0.071 n=5+5) arm64 name old time/op new time/op delta Add-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add64-96 5.52ns ± 0% 5.51ns ± 0% ~ (p=0.444 n=5+5) Sub-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub64-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Mul-96 34.6ns ± 0% 5.0ns ± 0% -85.52% (p=0.008 n=5+5) Mul32-96 4.51ns ± 0% 4.51ns ± 0% ~ (all equal) Mul64-96 21.1ns ± 0% 5.0ns ± 0% -76.26% (p=0.008 n=5+5) Div-96 64.7ns ± 0% 64.7ns ± 0% ~ (all equal) Div32-96 17.0ns ± 0% 17.0ns ± 0% ~ (all equal) Div64-96 53.1ns ± 0% 53.1ns ± 0% ~ (all equal) Updates #24813 Change-Id: I9bda6d2102f65cae3d436a2087b47ed8bafeb068 Reviewed-on: https://go-review.googlesource.com/129415 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-08-14 16:41:22 -06:00
// --------------- //
// bits.Mul* //
// --------------- //
func Mul(x, y uint) (hi, lo uint) {
// amd64:"MULQ"
// arm64:"UMULH","MUL"
// ppc64:"MULHDU","MULLD"
// ppc64le:"MULHDU","MULLD"
cmd/compile: intrinsify math/bits.Mul Add SSA rules to intrinsify Mul/Mul64 (AMD64 and ARM64). SSA rules for other functions and architectures are left as a future optimization. Benchmark results on AMD64/ARM64 before and after SSA implementation are below. amd64 name old time/op new time/op delta Add-4 1.78ns ± 0% 1.85ns ±12% ~ (p=0.397 n=4+5) Add32-4 1.71ns ± 1% 1.70ns ± 0% ~ (p=0.683 n=5+5) Add64-4 1.80ns ± 2% 1.77ns ± 0% -1.22% (p=0.048 n=5+5) Sub-4 1.78ns ± 0% 1.78ns ± 0% ~ (all equal) Sub32-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=1.000 n=5+5) Sub64-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=0.968 n=5+4) Mul-4 11.5ns ± 1% 1.8ns ± 2% -84.39% (p=0.008 n=5+5) Mul32-4 1.39ns ± 0% 1.38ns ± 3% ~ (p=0.175 n=5+5) Mul64-4 6.85ns ± 1% 1.78ns ± 1% -73.97% (p=0.008 n=5+5) Div-4 57.1ns ± 1% 56.7ns ± 0% ~ (p=0.087 n=5+5) Div32-4 18.0ns ± 0% 18.0ns ± 0% ~ (all equal) Div64-4 56.4ns ±10% 53.6ns ± 1% ~ (p=0.071 n=5+5) arm64 name old time/op new time/op delta Add-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add64-96 5.52ns ± 0% 5.51ns ± 0% ~ (p=0.444 n=5+5) Sub-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub64-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Mul-96 34.6ns ± 0% 5.0ns ± 0% -85.52% (p=0.008 n=5+5) Mul32-96 4.51ns ± 0% 4.51ns ± 0% ~ (all equal) Mul64-96 21.1ns ± 0% 5.0ns ± 0% -76.26% (p=0.008 n=5+5) Div-96 64.7ns ± 0% 64.7ns ± 0% ~ (all equal) Div32-96 17.0ns ± 0% 17.0ns ± 0% ~ (all equal) Div64-96 53.1ns ± 0% 53.1ns ± 0% ~ (all equal) Updates #24813 Change-Id: I9bda6d2102f65cae3d436a2087b47ed8bafeb068 Reviewed-on: https://go-review.googlesource.com/129415 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-08-14 16:41:22 -06:00
return bits.Mul(x, y)
}
func Mul64(x, y uint64) (hi, lo uint64) {
// amd64:"MULQ"
// arm64:"UMULH","MUL"
// ppc64:"MULHDU","MULLD"
// ppc64le:"MULHDU","MULLD"
cmd/compile: intrinsify math/bits.Mul Add SSA rules to intrinsify Mul/Mul64 (AMD64 and ARM64). SSA rules for other functions and architectures are left as a future optimization. Benchmark results on AMD64/ARM64 before and after SSA implementation are below. amd64 name old time/op new time/op delta Add-4 1.78ns ± 0% 1.85ns ±12% ~ (p=0.397 n=4+5) Add32-4 1.71ns ± 1% 1.70ns ± 0% ~ (p=0.683 n=5+5) Add64-4 1.80ns ± 2% 1.77ns ± 0% -1.22% (p=0.048 n=5+5) Sub-4 1.78ns ± 0% 1.78ns ± 0% ~ (all equal) Sub32-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=1.000 n=5+5) Sub64-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=0.968 n=5+4) Mul-4 11.5ns ± 1% 1.8ns ± 2% -84.39% (p=0.008 n=5+5) Mul32-4 1.39ns ± 0% 1.38ns ± 3% ~ (p=0.175 n=5+5) Mul64-4 6.85ns ± 1% 1.78ns ± 1% -73.97% (p=0.008 n=5+5) Div-4 57.1ns ± 1% 56.7ns ± 0% ~ (p=0.087 n=5+5) Div32-4 18.0ns ± 0% 18.0ns ± 0% ~ (all equal) Div64-4 56.4ns ±10% 53.6ns ± 1% ~ (p=0.071 n=5+5) arm64 name old time/op new time/op delta Add-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add64-96 5.52ns ± 0% 5.51ns ± 0% ~ (p=0.444 n=5+5) Sub-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub64-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Mul-96 34.6ns ± 0% 5.0ns ± 0% -85.52% (p=0.008 n=5+5) Mul32-96 4.51ns ± 0% 4.51ns ± 0% ~ (all equal) Mul64-96 21.1ns ± 0% 5.0ns ± 0% -76.26% (p=0.008 n=5+5) Div-96 64.7ns ± 0% 64.7ns ± 0% ~ (all equal) Div32-96 17.0ns ± 0% 17.0ns ± 0% ~ (all equal) Div64-96 53.1ns ± 0% 53.1ns ± 0% ~ (all equal) Updates #24813 Change-Id: I9bda6d2102f65cae3d436a2087b47ed8bafeb068 Reviewed-on: https://go-review.googlesource.com/129415 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-08-14 16:41:22 -06:00
return bits.Mul64(x, y)
}