diff --git a/container/intsets/popcnt_amd64.go b/container/intsets/popcnt_amd64.go
new file mode 100644
index 0000000000..858431ccca
--- /dev/null
+++ b/container/intsets/popcnt_amd64.go
@@ -0,0 +1,27 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64
+
+package intsets
+
+// popcnt returns the number of set bits in x using the hardware POPCNT
+// instruction. It is implemented in popcnt_amd64.s; popcount only calls
+// it when hasPOPCNT is true.
+func popcnt(x word) int
+
+// havePOPCNT reports whether CPUID (leaf 1) sets bit 23 of CX, the
+// POPCNT feature flag. It is implemented in popcnt_amd64.s.
+func havePOPCNT() bool
+
+// hasPOPCNT holds the result of the CPUID probe, evaluated once at init.
+var hasPOPCNT = havePOPCNT()
+
+// popcount returns the population count (number of set bits) of x.
+func popcount(x word) int {
+	if hasPOPCNT {
+		return popcnt(x)
+	}
+	return popcountTable(x) // faster than Hacker's Delight
+}
diff --git a/container/intsets/popcnt_amd64.s b/container/intsets/popcnt_amd64.s
new file mode 100644
index 0000000000..6538b91d5e
--- /dev/null
+++ b/container/intsets/popcnt_amd64.s
@@ -0,0 +1,34 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func havePOPCNT() bool
+//
+// havePOPCNT executes CPUID with AX=1 and returns bit 23 of CX,
+// the POPCNT feature flag.
+TEXT ·havePOPCNT(SB),NOSPLIT,$0-1
+	MOVQ	$1, AX
+	CPUID
+	SHRQ	$23, CX
+	ANDQ	$1, CX
+	MOVB	CX, ret+0(FP)
+	RET
+
+// func popcnt(word) int
+//
+// The 16-byte argument area holds the 8-byte argument at x+0(FP)
+// followed by the 8-byte result at ret+8(FP).
+TEXT ·popcnt(SB),NOSPLIT,$0-16
+	XORQ	AX, AX
+	MOVQ	x+0(FP), SI
+	// POPCNTQ SI, AX is not recognized by the Go assembler,
+	// so we assemble it ourselves (ModRM 0xc6 = register SI into AX).
+	BYTE	$0xf3
+	BYTE	$0x48
+	BYTE	$0x0f
+	BYTE	$0xb8
+	BYTE	$0xc6
+	MOVQ	AX, ret+8(FP)
+	RET
diff --git a/container/intsets/popcnt_generic.go b/container/intsets/popcnt_generic.go
new file mode 100644
index 0000000000..0dc1d7196b
--- /dev/null
+++ b/container/intsets/popcnt_generic.go
@@ -0,0 +1,32 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package intsets
+
+import "runtime"
+
+// We compared three algorithms---Hacker's Delight, table lookup,
+// and AMD64's hardware POPCNT instruction---on a 2.67GHz Xeon X5550.
+//
+// % GOARCH=amd64 go test -run=NONE -bench=Popcount
+// POPCNT               5.12 ns/op
+// Table                8.53 ns/op
+// HackersDelight       9.96 ns/op
+//
+// % GOARCH=386 go test -run=NONE -bench=Popcount
+// Table                10.4 ns/op
+// HackersDelight       5.23 ns/op
+//
+// (AMD64's bit-manipulation extensions (ABM/BMI1) support nlz and ntz too,
+// but they aren't critical.)
+
+// popcount returns the population count (number of set bits) of x.
+func popcount(x word) int {
+	if runtime.GOARCH == "386" {
+		return popcountHD(uint32(x))
+	}
+	return popcountTable(x)
+}
diff --git a/container/intsets/util.go b/container/intsets/util.go
index 76e682cf4d..dd1db86b1c 100644
--- a/container/intsets/util.go
+++ b/container/intsets/util.go
@@ -4,6 +4,16 @@
 
 package intsets
 
+// popcountHD returns the population count of x (Hacker's Delight, fig 5.2).
+func popcountHD(x uint32) int {
+	x -= (x >> 1) & 0x55555555
+	x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
+	x = (x + (x >> 4)) & 0x0f0f0f0f
+	x = x + (x >> 8)
+	x = x + (x >> 16)
+	return int(x & 0x0000003f)
+}
+
 var a [1 << 8]byte
 
 func init() {
@@ -18,8 +28,7 @@ func init() {
 	}
 }
 
-// popcount returns the population count (number of set bits) of x.
-func popcount(x word) int {
+func popcountTable(x word) int {
 	return int(a[byte(x>>(0*8))] +
 		a[byte(x>>(1*8))] +
 		a[byte(x>>(2*8))] +
diff --git a/container/intsets/util_test.go b/container/intsets/util_test.go
index 92a4bc58bc..e4cc6597f1 100644
--- a/container/intsets/util_test.go
+++ b/container/intsets/util_test.go
@@ -4,7 +4,10 @@
 
 package intsets
 
-import "testing"
+import (
+	"math/rand"
+	"testing"
+)
 
 func TestNLZ(t *testing.T) {
 	// Test the platform-specific edge case.
@@ -23,3 +26,33 @@ func TestNLZ(t *testing.T) {
 
 // Backdoor for testing.
func (s *Sparse) Check() error { return s.check() } + +func dumbPopcount(x word) int { + var popcnt int + for i := uint(0); i < bitsPerWord; i++ { + if x&(1<