1
0
mirror of https://github.com/golang/go synced 2024-11-24 22:57:57 -07:00

[math/bits] Use Wilkes-Wheeler-Gill algorithm for OnesCount64.

This implementation is based on the C function shown in Figure 3 of the
paper "Faster Population Counts Using AVX2 Instructions",
available at https://arxiv.org/pdf/1611.07612.pdf

More details and benchmark results are available in issue #46188.

Closes #46188
This commit is contained in:
Taras Tsugrii 2021-05-15 20:52:11 +01:00
parent ce92a2023c
commit b51e555469

View File

@ -132,33 +132,14 @@ func OnesCount32(x uint32) int {
// OnesCount64 returns the number of one bits ("population count") in x.
func OnesCount64(x uint64) int {
	// Implementation: Wilkes-Wheeler-Gill algorithm.
	// See "Faster Population Counts Using AVX2 Instructions", Figure 3.
	// The full paper is available at https://arxiv.org/pdf/1611.07612.pdf
	//
	// The masks m0, m1 and m2 are declared elsewhere in this package.
	// The comments below assume they select alternating 1-, 2- and
	// 4-bit fields (0x5555..., 0x3333..., 0x0f0f...).

	// Replace every 2-bit field with the number of ones it held (0..2).
	x -= (x >> 1) & m0
	// Add adjacent 2-bit counts into 4-bit fields (0..4 each). Both
	// operands are masked so a sum cannot spill into the next field.
	x = ((x >> 2) & m1) + (x & m1)
	// Add adjacent 4-bit counts into bytes (0..8 each). A single mask
	// after the addition suffices because 8 still fits in 4 bits.
	x = (x + (x >> 4)) & m2
	// The multiplication accumulates all eight byte counts in the most
	// significant byte. The total is at most 64, so it fits in 8 bits;
	// the wrap-around of the unsigned product discards only bits above
	// that byte.
	x *= 0x0101010101010101
	return int(x >> 56)
}
// --- RotateLeft ---