1
0
mirror of https://github.com/golang/go synced 2024-11-12 08:20:22 -07:00

cmd/compile: improve atomic swap intrinsics on arm64

ARMv8.1 has added new instructions for atomic memory operations. This
change builds on the previous change which added support for atomic add,
0a7ac93c27, to include similar support for
atomic-compare-and-swap, atomic-swap, atomic-or, and atomic-and
intrinsics. Since the new instructions are not guaranteed to be present,
we guard their usages with a branch on a CPU feature.

Peformance on an ARMv8.1 machine:
name                 old time/op  new time/op  delta
CompareAndSwap-16    37.9ns ±16%  24.1ns ± 4%  -36.44%  (p=0.000 n=10+9)
CompareAndSwap64-16  38.6ns ±15%  24.1ns ± 3%  -37.47%  (p=0.000 n=10+10)

name       old time/op  new time/op  delta
Swap-16    46.9ns ±32%  12.5ns ± 6%  -73.40%  (p=0.000 n=10+10)
Swap64-16  53.4ns ± 1%  12.5ns ± 6%  -76.56%  (p=0.000 n=10+10)

name            old time/op  new time/op  delta
Or8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
Or-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
Or8Parallel-16  59.8ns ± 3%  12.5ns ± 2%  -79.10%  (p=0.000 n=10+10)
OrParallel-16   51.7ns ± 3%  12.5ns ± 2%  -75.84%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
And-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
And8Parallel-16  59.1ns ± 6%  12.8ns ± 3%  -78.33%  (p=0.000 n=10+10)
AndParallel-16   51.4ns ± 7%  12.8ns ± 3%  -75.03%  (p=0.000 n=10+10)

Performance on an ARMv8.0 machine (no atomics instructions):
name                 old time/op  new time/op  delta
CompareAndSwap-16    61.3ns ± 0%  62.4ns ± 0%  +1.70%  (p=0.000 n=8+9)
CompareAndSwap64-16  62.0ns ± 3%  61.3ns ± 2%    ~     (p=0.093 n=10+10)

name       old time/op  new time/op  delta
Swap-16     127ns ± 2%   131ns ± 2%  +2.91%  (p=0.001 n=10+10)
Swap64-16   128ns ± 1%   131ns ± 2%  +2.43%  (p=0.001 n=10+10)

name            old time/op  new time/op  delta
Or8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
Or-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
Or8Parallel-16   137ns ± 1%   144ns ± 1%  +4.97%  (p=0.000 n=10+10)
OrParallel-16    128ns ± 1%   136ns ± 1%  +6.34%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
And-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
And8Parallel-16   134ns ± 2%   141ns ± 1%  +5.29%  (p=0.000 n=10+10)
AndParallel-16    125ns ± 2%   134ns ± 1%  +7.10%  (p=0.000 n=10+10)

Fixes #39304

Change-Id: Idaca68701d4751650be6b4bedca3d57f51571712
Reviewed-on: https://go-review.googlesource.com/c/go/+/234217
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: fannie zhang <Fannie.Zhang@arm.com>
This commit is contained in:
Jonathan Swinney 2020-11-04 16:18:23 +00:00 committed by Cherry Zhang
parent 8e5778ed70
commit ecc3f5112e
8 changed files with 616 additions and 49 deletions

View File

@ -581,6 +581,24 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p2.From.Reg = arm64.REGTMP
p2.To.Type = obj.TYPE_BRANCH
gc.Patch(p2, p)
case ssa.OpARM64LoweredAtomicExchange64Variant,
ssa.OpARM64LoweredAtomicExchange32Variant:
swap := arm64.ASWPALD
if v.Op == ssa.OpARM64LoweredAtomicExchange32Variant {
swap = arm64.ASWPALW
}
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
// SWPALD Rarg1, (Rarg0), Rout
p := s.Prog(swap)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_MEM
p.To.Reg = r0
p.RegTo2 = out
case ssa.OpARM64LoweredAtomicAdd64,
ssa.OpARM64LoweredAtomicAdd32:
// LDAXR (Rarg0), Rout
@ -687,6 +705,56 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p5.To.Type = obj.TYPE_REG
p5.To.Reg = out
gc.Patch(p2, p5)
case ssa.OpARM64LoweredAtomicCas64Variant,
ssa.OpARM64LoweredAtomicCas32Variant:
// Rarg0: ptr
// Rarg1: old
// Rarg2: new
// MOV Rarg1, Rtmp
// CASAL Rtmp, (Rarg0), Rarg2
// CMP Rarg1, Rtmp
// CSET EQ, Rout
cas := arm64.ACASALD
cmp := arm64.ACMP
mov := arm64.AMOVD
if v.Op == ssa.OpARM64LoweredAtomicCas32Variant {
cas = arm64.ACASALW
cmp = arm64.ACMPW
mov = arm64.AMOVW
}
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
r2 := v.Args[2].Reg()
out := v.Reg0()
// MOV Rarg1, Rtmp
p := s.Prog(mov)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = arm64.REGTMP
// CASAL Rtmp, (Rarg0), Rarg2
p1 := s.Prog(cas)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = arm64.REGTMP
p1.To.Type = obj.TYPE_MEM
p1.To.Reg = r0
p1.RegTo2 = r2
// CMP Rarg1, Rtmp
p2 := s.Prog(cmp)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = r1
p2.Reg = arm64.REGTMP
// CSET EQ, Rout
p3 := s.Prog(arm64.ACSET)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = arm64.COND_EQ
p3.To.Type = obj.TYPE_REG
p3.To.Reg = out
case ssa.OpARM64LoweredAtomicAnd8,
ssa.OpARM64LoweredAtomicAnd32,
ssa.OpARM64LoweredAtomicOr8,
@ -725,6 +793,63 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p3.From.Reg = arm64.REGTMP
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
case ssa.OpARM64LoweredAtomicAnd8Variant,
ssa.OpARM64LoweredAtomicAnd32Variant:
atomic_clear := arm64.ALDCLRALW
if v.Op == ssa.OpARM64LoweredAtomicAnd8Variant {
atomic_clear = arm64.ALDCLRALB
}
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
// MNV Rarg1 Rtemp
p := s.Prog(arm64.AMVN)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = arm64.REGTMP
// LDCLRALW Rtemp, (Rarg0), Rout
p1 := s.Prog(atomic_clear)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = arm64.REGTMP
p1.To.Type = obj.TYPE_MEM
p1.To.Reg = r0
p1.RegTo2 = out
// AND Rarg1, Rout
p2 := s.Prog(arm64.AAND)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = r1
p2.To.Type = obj.TYPE_REG
p2.To.Reg = out
case ssa.OpARM64LoweredAtomicOr8Variant,
ssa.OpARM64LoweredAtomicOr32Variant:
atomic_or := arm64.ALDORALW
if v.Op == ssa.OpARM64LoweredAtomicOr8Variant {
atomic_or = arm64.ALDORALB
}
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
// LDORALW Rarg1, (Rarg0), Rout
p := s.Prog(atomic_or)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_MEM
p.To.Reg = r0
p.RegTo2 = out
// ORR Rarg1, Rout
p2 := s.Prog(arm64.AORR)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = r1
p2.To.Type = obj.TYPE_REG
p2.To.Reg = out
case ssa.OpARM64MOVBreg,
ssa.OpARM64MOVBUreg,
ssa.OpARM64MOVHreg,

View File

@ -3458,14 +3458,64 @@ func init() {
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("runtime/internal/atomic", "Xchg64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
},
sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
type atomicOpEmitter func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType)
makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ, rtyp types.EType, emit atomicOpEmitter) intrinsicBuilder {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64HasATOMICS, s.sb)
v := s.load(types.Types[TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
emit(s, n, args, op1, typ)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
emit(s, n, args, op0, typ)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
if rtyp == TNIL {
return nil
} else {
return s.variable(n, types.Types[rtyp])
}
}
}
atomicXchgXaddEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
addF("runtime/internal/atomic", "Xchg",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, TUINT32, TUINT32, atomicXchgXaddEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Xchg64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, TUINT64, TUINT64, atomicXchgXaddEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Xadd",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@ -3482,46 +3532,11 @@ func init() {
},
sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
makeXaddARM64 := func(op0 ssa.Op, op1 ssa.Op, ty types.EType) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64HasATOMICS, s.sb)
v := s.load(types.Types[TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchUnlikely // most machines don't have Atomics nowadays
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
v0 := s.newValue3(op1, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v0)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v0)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
v1 := s.newValue3(op0, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v1)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v1)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[ty])
}
}
addF("runtime/internal/atomic", "Xadd",
makeXaddARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32),
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32, TUINT32, atomicXchgXaddEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Xadd64",
makeXaddARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64),
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64, TUINT64, atomicXchgXaddEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Cas",
@ -3530,14 +3545,14 @@ func init() {
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TBOOL], v)
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("runtime/internal/atomic", "Cas64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TBOOL], v)
},
sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("runtime/internal/atomic", "CasRel",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
@ -3546,18 +3561,31 @@ func init() {
},
sys.PPC64)
atomicCasEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
v := s.newValue4(op, types.NewTuple(types.Types[TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
addF("runtime/internal/atomic", "Cas",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, TUINT32, TBOOL, atomicCasEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Cas64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, TUINT64, TBOOL, atomicCasEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "And8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
addF("runtime/internal/atomic", "And",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
addF("runtime/internal/atomic", "Or8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
@ -3569,7 +3597,24 @@ func init() {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
sys.AMD64, sys.MIPS, sys.PPC64, sys.S390X)
atomicAndOrEmitterARM64 := func(s *state, n *Node, args []*ssa.Value, op ssa.Op, typ types.EType) {
s.vars[&memVar] = s.newValue3(op, types.TypeMem, args[0], args[1], s.mem())
}
addF("runtime/internal/atomic", "And8",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd8, ssa.OpAtomicAnd8Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "And",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32, ssa.OpAtomicAnd32Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Or8",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr8, ssa.OpAtomicOr8Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
sys.ARM64)
addF("runtime/internal/atomic", "Or",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32, ssa.OpAtomicOr32Variant, TNIL, TNIL, atomicAndOrEmitterARM64),
sys.ARM64)
alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...)
alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...)

View File

@ -543,17 +543,24 @@
(AtomicStore64 ...) => (STLR ...)
(AtomicStorePtrNoWB ...) => (STLR ...)
(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
(AtomicExchange(32|64)Variant ...) => (LoweredAtomicExchange(32|64)Variant ...)
(AtomicCompareAndSwap(32|64)Variant ...) => (LoweredAtomicCas(32|64)Variant ...)
// Currently the updated value is not used, but we need a register to temporarily hold it.
(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem))
(AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem))
(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem))
(AtomicOr32 ptr val mem) => (Select1 (LoweredAtomicOr32 ptr val mem))
(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
(AtomicAnd8Variant ptr val mem) => (Select1 (LoweredAtomicAnd8Variant ptr val mem))
(AtomicAnd32Variant ptr val mem) => (Select1 (LoweredAtomicAnd32Variant ptr val mem))
(AtomicOr8Variant ptr val mem) => (Select1 (LoweredAtomicOr8Variant ptr val mem))
(AtomicOr32Variant ptr val mem) => (Select1 (LoweredAtomicOr32Variant ptr val mem))
// Write barrier.
(WB ...) => (LoweredWB ...)

View File

@ -621,6 +621,12 @@ func init() {
{name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic exchange variant.
// store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
// SWPALD Rarg1, (Rarg0), Rout
{name: "LoweredAtomicExchange64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicExchange32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
// atomic add.
// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// LDAXR (Rarg0), Rout
@ -654,6 +660,21 @@ func init() {
{name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic compare and swap variant.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
// if *arg0 == arg1 {
// *arg0 = arg2
// return (true, memory)
// } else {
// return (false, memory)
// }
// MOV Rarg1, Rtmp
// CASAL Rtmp, (Rarg0), Rarg2
// CMP Rarg1, Rtmp
// CSET EQ, Rout
{name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic and/or.
// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// LDAXR (Rarg0), Rout
@ -665,6 +686,20 @@ func init() {
{name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic and/or variant.
// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// AND:
// MNV Rarg1, Rtemp
// LDANDALB Rtemp, (Rarg0), Rout
// AND Rarg1, Rout
// OR:
// LDORALB Rarg1, (Rarg0), Rout
// ORR Rarg1, Rout
{name: "LoweredAtomicAnd8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicAnd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicOr8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicOr32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true},
// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
// It saves all GP registers if necessary,
// but clobbers R30 (LR) because it's a call.

View File

@ -574,8 +574,16 @@ var genericOps = []opData{
// These variants have the same semantics as above atomic operations.
// But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
// Currently, they are used on ARM64 only.
{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
{name: "AtomicExchange32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
{name: "AtomicExchange64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
{name: "AtomicCompareAndSwap32Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
{name: "AtomicCompareAndSwap64Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
{name: "AtomicAnd8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
{name: "AtomicAnd32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
{name: "AtomicOr8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
{name: "AtomicOr32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
// Clobber experiment op
{name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable

View File

@ -1581,16 +1581,24 @@ const (
OpARM64STLRW
OpARM64LoweredAtomicExchange64
OpARM64LoweredAtomicExchange32
OpARM64LoweredAtomicExchange64Variant
OpARM64LoweredAtomicExchange32Variant
OpARM64LoweredAtomicAdd64
OpARM64LoweredAtomicAdd32
OpARM64LoweredAtomicAdd64Variant
OpARM64LoweredAtomicAdd32Variant
OpARM64LoweredAtomicCas64
OpARM64LoweredAtomicCas32
OpARM64LoweredAtomicCas64Variant
OpARM64LoweredAtomicCas32Variant
OpARM64LoweredAtomicAnd8
OpARM64LoweredAtomicAnd32
OpARM64LoweredAtomicOr8
OpARM64LoweredAtomicOr32
OpARM64LoweredAtomicAnd8Variant
OpARM64LoweredAtomicAnd32Variant
OpARM64LoweredAtomicOr8Variant
OpARM64LoweredAtomicOr32Variant
OpARM64LoweredWB
OpARM64LoweredPanicBoundsA
OpARM64LoweredPanicBoundsB
@ -2881,6 +2889,14 @@ const (
OpAtomicOr32
OpAtomicAdd32Variant
OpAtomicAdd64Variant
OpAtomicExchange32Variant
OpAtomicExchange64Variant
OpAtomicCompareAndSwap32Variant
OpAtomicCompareAndSwap64Variant
OpAtomicAnd8Variant
OpAtomicAnd32Variant
OpAtomicOr8Variant
OpAtomicOr32Variant
OpClobber
)
@ -20994,6 +21010,38 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredAtomicExchange64Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicExchange32Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicAdd64",
argLen: 3,
@ -21098,6 +21146,44 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredAtomicCas64Variant",
argLen: 4,
resultNotInArgs: true,
clobberFlags: true,
faultOnNilArg0: true,
hasSideEffects: true,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicCas32Variant",
argLen: 4,
resultNotInArgs: true,
clobberFlags: true,
faultOnNilArg0: true,
hasSideEffects: true,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicAnd8",
argLen: 3,
@ -21170,6 +21256,72 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredAtomicAnd8Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicAnd32Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicOr8Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredAtomicOr32Variant",
argLen: 3,
resultNotInArgs: true,
faultOnNilArg0: true,
hasSideEffects: true,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "LoweredWB",
auxType: auxSym,
@ -35874,6 +36026,54 @@ var opcodeTable = [...]opInfo{
hasSideEffects: true,
generic: true,
},
{
name: "AtomicExchange32Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicExchange64Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicCompareAndSwap32Variant",
argLen: 4,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicCompareAndSwap64Variant",
argLen: 4,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicAnd8Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicAnd32Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicOr8Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "AtomicOr32Variant",
argLen: 3,
hasSideEffects: true,
generic: true,
},
{
name: "Clobber",
auxType: auxSymOff,

View File

@ -426,20 +426,36 @@ func rewriteValueARM64(v *Value) bool {
return true
case OpAtomicAnd32:
return rewriteValueARM64_OpAtomicAnd32(v)
case OpAtomicAnd32Variant:
return rewriteValueARM64_OpAtomicAnd32Variant(v)
case OpAtomicAnd8:
return rewriteValueARM64_OpAtomicAnd8(v)
case OpAtomicAnd8Variant:
return rewriteValueARM64_OpAtomicAnd8Variant(v)
case OpAtomicCompareAndSwap32:
v.Op = OpARM64LoweredAtomicCas32
return true
case OpAtomicCompareAndSwap32Variant:
v.Op = OpARM64LoweredAtomicCas32Variant
return true
case OpAtomicCompareAndSwap64:
v.Op = OpARM64LoweredAtomicCas64
return true
case OpAtomicCompareAndSwap64Variant:
v.Op = OpARM64LoweredAtomicCas64Variant
return true
case OpAtomicExchange32:
v.Op = OpARM64LoweredAtomicExchange32
return true
case OpAtomicExchange32Variant:
v.Op = OpARM64LoweredAtomicExchange32Variant
return true
case OpAtomicExchange64:
v.Op = OpARM64LoweredAtomicExchange64
return true
case OpAtomicExchange64Variant:
v.Op = OpARM64LoweredAtomicExchange64Variant
return true
case OpAtomicLoad32:
v.Op = OpARM64LDARW
return true
@ -454,8 +470,12 @@ func rewriteValueARM64(v *Value) bool {
return true
case OpAtomicOr32:
return rewriteValueARM64_OpAtomicOr32(v)
case OpAtomicOr32Variant:
return rewriteValueARM64_OpAtomicOr32Variant(v)
case OpAtomicOr8:
return rewriteValueARM64_OpAtomicOr8(v)
case OpAtomicOr8Variant:
return rewriteValueARM64_OpAtomicOr8Variant(v)
case OpAtomicStore32:
v.Op = OpARM64STLRW
return true
@ -21363,6 +21383,25 @@ func rewriteValueARM64_OpAtomicAnd32(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpAtomicAnd32Variant(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicAnd32Variant ptr val mem)
// result: (Select1 (LoweredAtomicAnd32Variant ptr val mem))
for {
ptr := v_0
val := v_1
mem := v_2
v.reset(OpSelect1)
v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicAnd32Variant, types.NewTuple(typ.UInt32, types.TypeMem))
v0.AddArg3(ptr, val, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueARM64_OpAtomicAnd8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -21382,6 +21421,25 @@ func rewriteValueARM64_OpAtomicAnd8(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpAtomicAnd8Variant(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicAnd8Variant ptr val mem)
// result: (Select1 (LoweredAtomicAnd8Variant ptr val mem))
for {
ptr := v_0
val := v_1
mem := v_2
v.reset(OpSelect1)
v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicAnd8Variant, types.NewTuple(typ.UInt8, types.TypeMem))
v0.AddArg3(ptr, val, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueARM64_OpAtomicOr32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -21401,6 +21459,25 @@ func rewriteValueARM64_OpAtomicOr32(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpAtomicOr32Variant(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicOr32Variant ptr val mem)
// result: (Select1 (LoweredAtomicOr32Variant ptr val mem))
for {
ptr := v_0
val := v_1
mem := v_2
v.reset(OpSelect1)
v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicOr32Variant, types.NewTuple(typ.UInt32, types.TypeMem))
v0.AddArg3(ptr, val, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueARM64_OpAtomicOr8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -21420,6 +21497,25 @@ func rewriteValueARM64_OpAtomicOr8(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpAtomicOr8Variant(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicOr8Variant ptr val mem)
// result: (Select1 (LoweredAtomicOr8Variant ptr val mem))
for {
ptr := v_0
val := v_1
mem := v_2
v.reset(OpSelect1)
v0 := b.NewValue0(v.Pos, OpARM64LoweredAtomicOr8Variant, types.NewTuple(typ.UInt8, types.TypeMem))
v0.AddArg3(ptr, val, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueARM64_OpAvg64u(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]

View File

@ -142,3 +142,54 @@ func BenchmarkXadd64(b *testing.B) {
}
})
}
func BenchmarkCas(b *testing.B) {
var x uint32
x = 1
ptr := &x
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
atomic.Cas(ptr, 1, 0)
atomic.Cas(ptr, 0, 1)
}
})
}
func BenchmarkCas64(b *testing.B) {
var x uint64
x = 1
ptr := &x
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
atomic.Cas64(ptr, 1, 0)
atomic.Cas64(ptr, 0, 1)
}
})
}
func BenchmarkXchg(b *testing.B) {
var x uint32
x = 1
ptr := &x
b.RunParallel(func(pb *testing.PB) {
var y uint32
y = 1
for pb.Next() {
y = atomic.Xchg(ptr, y)
y += 1
}
})
}
func BenchmarkXchg64(b *testing.B) {
var x uint64
x = 1
ptr := &x
b.RunParallel(func(pb *testing.PB) {
var y uint64
y = 1
for pb.Next() {
y = atomic.Xchg64(ptr, y)
y += 1
}
})
}