diff --git a/src/math/bits/bits.go b/src/math/bits/bits.go index 65452feda2c..9c8ce265da4 100644 --- a/src/math/bits/bits.go +++ b/src/math/bits/bits.go @@ -132,33 +132,14 @@ func OnesCount32(x uint32) int { // OnesCount64 returns the number of one bits ("population count") in x. func OnesCount64(x uint64) int { - // Implementation: Parallel summing of adjacent bits. - // See "Hacker's Delight", Chap. 5: Counting Bits. - // The following pattern shows the general approach: - // - // x = x>>1&(m0&m) + x&(m0&m) - // x = x>>2&(m1&m) + x&(m1&m) - // x = x>>4&(m2&m) + x&(m2&m) - // x = x>>8&(m3&m) + x&(m3&m) - // x = x>>16&(m4&m) + x&(m4&m) - // x = x>>32&(m5&m) + x&(m5&m) - // return int(x) - // - // Masking (& operations) can be left away when there's no - // danger that a field's sum will carry over into the next - // field: Since the result cannot be > 64, 8 bits is enough - // and we can ignore the masks for the shifts by 8 and up. - // Per "Hacker's Delight", the first line can be simplified - // more, but it saves at best one instruction, so we leave - // it alone for clarity. - const m = 1<<64 - 1 - x = x>>1&(m0&m) + x&(m0&m) - x = x>>2&(m1&m) + x&(m1&m) - x = (x>>4 + x) & (m2 & m) - x += x >> 8 - x += x >> 16 - x += x >> 32 - return int(x) & (1<<7 - 1) + // Implementation: Wilkes-Wheeler-Gill algorithm. + // See "Faster Population Counts Using AVX2 Instructions", FIGURE 3. + // Full paper is available at https://arxiv.org/pdf/1611.07612.pdf + x -= (x >> 1) & m0 + x = ((x >> 2) & m1) + (x & m1) + x = (x + (x >> 4)) & m2 + x *= 0x0101010101010101 + return int(x >> 56) } // --- RotateLeft ---