1
0
mirror of https://github.com/golang/go synced 2024-11-18 14:14:46 -07:00

cmd/compile/internal/ppc64, runtime internal/atomic, sync/atomic: implement faster atomics for ppc64x

This change implements faster atomics for ppc64x based on the ISA 2.07B,
Appendix B.2 recommendations, replacing SYNC/ISYNC by LWSYNC in some
cases.

Updates #21348

name                                           old time/op new time/op    delta
Cond1-16                                           955ns     856ns      -10.33%
Cond2-16                                          2.38µs    2.03µs      -14.59%
Cond4-16                                          5.90µs    5.44µs       -7.88%
Cond8-16                                          12.1µs    11.1µs       -8.42%
Cond16-16                                         27.0µs    25.1µs       -7.04%
Cond32-16                                         59.1µs    55.5µs       -6.14%
LoadMostlyHits/*sync_test.DeepCopyMap-16          22.1ns    24.1ns       +9.02%
LoadMostlyHits/*sync_test.RWMutexMap-16            252ns     249ns       -1.20%
LoadMostlyHits/*sync.Map-16                       16.2ns    16.3ns         ~
LoadMostlyMisses/*sync_test.DeepCopyMap-16        22.3ns    22.6ns         ~
LoadMostlyMisses/*sync_test.RWMutexMap-16          249ns     247ns       -0.51%
LoadMostlyMisses/*sync.Map-16                     12.7ns    12.7ns         ~
LoadOrStoreBalanced/*sync_test.RWMutexMap-16      1.27µs    1.17µs       -7.54%
LoadOrStoreBalanced/*sync.Map-16                  1.12µs    1.10µs       -2.35%
LoadOrStoreUnique/*sync_test.RWMutexMap-16        1.75µs    1.68µs       -3.84%
LoadOrStoreUnique/*sync.Map-16                    2.07µs    1.97µs       -5.13%
LoadOrStoreCollision/*sync_test.DeepCopyMap-16    15.8ns    15.9ns         ~
LoadOrStoreCollision/*sync_test.RWMutexMap-16      496ns     424ns      -14.48%
LoadOrStoreCollision/*sync.Map-16                 6.07ns    6.07ns         ~
Range/*sync_test.DeepCopyMap-16                   1.65µs    1.64µs         ~
Range/*sync_test.RWMutexMap-16                     278µs     288µs       +3.75%
Range/*sync.Map-16                                2.00µs    2.01µs         ~
AdversarialAlloc/*sync_test.DeepCopyMap-16        3.45µs    3.44µs         ~
AdversarialAlloc/*sync_test.RWMutexMap-16          226ns     227ns         ~
AdversarialAlloc/*sync.Map-16                     1.09µs    1.07µs       -2.36%
AdversarialDelete/*sync_test.DeepCopyMap-16        553ns     550ns       -0.57%
AdversarialDelete/*sync_test.RWMutexMap-16         273ns     274ns         ~
AdversarialDelete/*sync.Map-16                     247ns     249ns         ~
UncontendedSemaphore-16                           79.0ns    65.5ns      -17.11%
ContendedSemaphore-16                              112ns      97ns      -13.77%
MutexUncontended-16                               3.34ns    2.51ns      -24.69%
Mutex-16                                           266ns     191ns      -28.26%
MutexSlack-16                                      226ns     159ns      -29.55%
MutexWork-16                                       377ns     338ns      -10.14%
MutexWorkSlack-16                                  335ns     308ns       -8.20%
MutexNoSpin-16                                     196ns     184ns       -5.91%
MutexSpin-16                                       710ns     666ns       -6.21%
Once-16                                           1.29ns    1.29ns         ~
Pool-16                                           8.64ns    8.71ns         ~
PoolOverflow-16                                   1.60µs    1.44µs      -10.25%
SemaUncontended-16                                5.39ns    4.42ns      -17.96%
SemaSyntNonblock-16                                539ns     483ns      -10.42%
SemaSyntBlock-16                                   413ns     354ns      -14.20%
SemaWorkNonblock-16                                305ns     258ns      -15.36%
SemaWorkBlock-16                                   266ns     229ns      -14.06%
RWMutexUncontended-16                             12.9ns     9.7ns      -24.80%
RWMutexWrite100-16                                 203ns     147ns      -27.47%
RWMutexWrite10-16                                  177ns     119ns      -32.74%
RWMutexWorkWrite100-16                             435ns     403ns       -7.39%
RWMutexWorkWrite10-16                              642ns     611ns       -4.79%
WaitGroupUncontended-16                           4.67ns    3.70ns      -20.92%
WaitGroupAddDone-16                                402ns     355ns      -11.54%
WaitGroupAddDoneWork-16                            208ns     250ns      +20.09%
WaitGroupWait-16                                  1.21ns    1.21ns         ~
WaitGroupWaitWork-16                              5.91ns    5.87ns       -0.81%
WaitGroupActuallyWait-16                          92.2ns    85.8ns       -6.91%

Updates #21348

Change-Id: Ibb9b271d11b308264103829e176c6d9fe8f867d3
Reviewed-on: https://go-review.googlesource.com/95175
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
Carlos Eduardo Seo 2018-01-04 18:23:27 -02:00 committed by Lynn Boger
parent a3d8326993
commit 6633bb2aa7
3 changed files with 43 additions and 50 deletions

View File

@ -154,16 +154,17 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicAnd8, case ssa.OpPPC64LoweredAtomicAnd8,
ssa.OpPPC64LoweredAtomicOr8: ssa.OpPPC64LoweredAtomicOr8:
// SYNC // LWSYNC
// LBAR (Rarg0), Rtmp // LBAR (Rarg0), Rtmp
// AND/OR Rarg1, Rtmp // AND/OR Rarg1, Rtmp
// STBCCC Rtmp, (Rarg0) // STBCCC Rtmp, (Rarg0)
// BNE -3(PC) // BNE -3(PC)
// ISYNC
r0 := v.Args[0].Reg() r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg() r1 := v.Args[1].Reg()
psync := s.Prog(ppc64.ASYNC) // LWSYNC - Assuming shared data not write-through-required nor
psync.To.Type = obj.TYPE_NONE // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
p := s.Prog(ppc64.ALBAR) p := s.Prog(ppc64.ALBAR)
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
p.From.Reg = r0 p.From.Reg = r0
@ -183,17 +184,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p3 := s.Prog(ppc64.ABNE) p3 := s.Prog(ppc64.ABNE)
p3.To.Type = obj.TYPE_BRANCH p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p) gc.Patch(p3, p)
pisync := s.Prog(ppc64.AISYNC)
pisync.To.Type = obj.TYPE_NONE
case ssa.OpPPC64LoweredAtomicAdd32, case ssa.OpPPC64LoweredAtomicAdd32,
ssa.OpPPC64LoweredAtomicAdd64: ssa.OpPPC64LoweredAtomicAdd64:
// SYNC // LWSYNC
// LDAR/LWAR (Rarg0), Rout // LDAR/LWAR (Rarg0), Rout
// ADD Rarg1, Rout // ADD Rarg1, Rout
// STDCCC/STWCCC Rout, (Rarg0) // STDCCC/STWCCC Rout, (Rarg0)
// BNE -3(PC) // BNE -3(PC)
// ISYNC
// MOVW Rout,Rout (if Add32) // MOVW Rout,Rout (if Add32)
ld := ppc64.ALDAR ld := ppc64.ALDAR
st := ppc64.ASTDCCC st := ppc64.ASTDCCC
@ -204,9 +202,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r0 := v.Args[0].Reg() r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg() r1 := v.Args[1].Reg()
out := v.Reg0() out := v.Reg0()
// SYNC // LWSYNC - Assuming shared data not write-through-required nor
psync := s.Prog(ppc64.ASYNC) // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
psync.To.Type = obj.TYPE_NONE plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
// LDAR or LWAR // LDAR or LWAR
p := s.Prog(ld) p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
@ -229,9 +228,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p4 := s.Prog(ppc64.ABNE) p4 := s.Prog(ppc64.ABNE)
p4.To.Type = obj.TYPE_BRANCH p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p) gc.Patch(p4, p)
// ISYNC
pisync := s.Prog(ppc64.AISYNC)
pisync.To.Type = obj.TYPE_NONE
// Ensure a 32 bit result // Ensure a 32 bit result
if v.Op == ssa.OpPPC64LoweredAtomicAdd32 { if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
@ -244,7 +240,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicExchange32, case ssa.OpPPC64LoweredAtomicExchange32,
ssa.OpPPC64LoweredAtomicExchange64: ssa.OpPPC64LoweredAtomicExchange64:
// SYNC // LWSYNC
// LDAR/LWAR (Rarg0), Rout // LDAR/LWAR (Rarg0), Rout
// STDCCC/STWCCC Rout, (Rarg0) // STDCCC/STWCCC Rout, (Rarg0)
// BNE -2(PC) // BNE -2(PC)
@ -258,9 +254,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r0 := v.Args[0].Reg() r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg() r1 := v.Args[1].Reg()
out := v.Reg0() out := v.Reg0()
// SYNC // LWSYNC - Assuming shared data not write-through-required nor
psync := s.Prog(ppc64.ASYNC) // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
psync.To.Type = obj.TYPE_NONE plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
// LDAR or LWAR // LDAR or LWAR
p := s.Prog(ld) p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
@ -342,14 +339,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicCas64, case ssa.OpPPC64LoweredAtomicCas64,
ssa.OpPPC64LoweredAtomicCas32: ssa.OpPPC64LoweredAtomicCas32:
// SYNC // LWSYNC
// loop: // loop:
// LDAR (Rarg0), Rtmp // LDAR (Rarg0), Rtmp
// CMP Rarg1, Rtmp // CMP Rarg1, Rtmp
// BNE fail // BNE fail
// STDCCC Rarg2, (Rarg0) // STDCCC Rarg2, (Rarg0)
// BNE loop // BNE loop
// ISYNC // LWSYNC
// MOVD $1, Rout // MOVD $1, Rout
// BR end // BR end
// fail: // fail:
@ -367,9 +364,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r1 := v.Args[1].Reg() r1 := v.Args[1].Reg()
r2 := v.Args[2].Reg() r2 := v.Args[2].Reg()
out := v.Reg0() out := v.Reg0()
// SYNC // LWSYNC - Assuming shared data not write-through-required nor
psync := s.Prog(ppc64.ASYNC) // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
psync.To.Type = obj.TYPE_NONE plwsync1 := s.Prog(ppc64.ALWSYNC)
plwsync1.To.Type = obj.TYPE_NONE
// LDAR or LWAR // LDAR or LWAR
p := s.Prog(ld) p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
@ -395,9 +393,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p4 := s.Prog(ppc64.ABNE) p4 := s.Prog(ppc64.ABNE)
p4.To.Type = obj.TYPE_BRANCH p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p) gc.Patch(p4, p)
// ISYNC // LWSYNC - Assuming shared data not write-through-required nor
pisync := s.Prog(ppc64.AISYNC) // caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b.
pisync.To.Type = obj.TYPE_NONE plwsync2 := s.Prog(ppc64.ALWSYNC)
plwsync2.To.Type = obj.TYPE_NONE
// return true // return true
p5 := s.Prog(ppc64.AMOVD) p5 := s.Prog(ppc64.AMOVD)
p5.From.Type = obj.TYPE_CONST p5.From.Type = obj.TYPE_CONST

View File

@ -17,7 +17,7 @@ TEXT runtimeinternalatomic·Cas(SB), NOSPLIT, $0-17
MOVD ptr+0(FP), R3 MOVD ptr+0(FP), R3
MOVWZ old+8(FP), R4 MOVWZ old+8(FP), R4
MOVWZ new+12(FP), R5 MOVWZ new+12(FP), R5
SYNC LWSYNC
cas_again: cas_again:
LWAR (R3), R6 LWAR (R3), R6
CMPW R6, R4 CMPW R6, R4
@ -25,7 +25,7 @@ cas_again:
STWCCC R5, (R3) STWCCC R5, (R3)
BNE cas_again BNE cas_again
MOVD $1, R3 MOVD $1, R3
ISYNC LWSYNC
MOVB R3, ret+16(FP) MOVB R3, ret+16(FP)
RET RET
cas_fail: cas_fail:
@ -44,7 +44,7 @@ TEXT runtimeinternalatomic·Cas64(SB), NOSPLIT, $0-25
MOVD ptr+0(FP), R3 MOVD ptr+0(FP), R3
MOVD old+8(FP), R4 MOVD old+8(FP), R4
MOVD new+16(FP), R5 MOVD new+16(FP), R5
SYNC LWSYNC
cas64_again: cas64_again:
LDAR (R3), R6 LDAR (R3), R6
CMP R6, R4 CMP R6, R4
@ -52,7 +52,7 @@ cas64_again:
STDCCC R5, (R3) STDCCC R5, (R3)
BNE cas64_again BNE cas64_again
MOVD $1, R3 MOVD $1, R3
ISYNC LWSYNC
MOVB R3, ret+24(FP) MOVB R3, ret+24(FP)
RET RET
cas64_fail: cas64_fail:
@ -97,31 +97,29 @@ TEXT runtimeinternalatomic·Casp1(SB), NOSPLIT, $0-25
TEXT runtimeinternalatomic·Xadd(SB), NOSPLIT, $0-20 TEXT runtimeinternalatomic·Xadd(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R4 MOVD ptr+0(FP), R4
MOVW delta+8(FP), R5 MOVW delta+8(FP), R5
SYNC LWSYNC
LWAR (R4), R3 LWAR (R4), R3
ADD R5, R3 ADD R5, R3
STWCCC R3, (R4) STWCCC R3, (R4)
BNE -3(PC) BNE -3(PC)
ISYNC
MOVW R3, ret+16(FP) MOVW R3, ret+16(FP)
RET RET
TEXT runtimeinternalatomic·Xadd64(SB), NOSPLIT, $0-24 TEXT runtimeinternalatomic·Xadd64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R4 MOVD ptr+0(FP), R4
MOVD delta+8(FP), R5 MOVD delta+8(FP), R5
SYNC LWSYNC
LDAR (R4), R3 LDAR (R4), R3
ADD R5, R3 ADD R5, R3
STDCCC R3, (R4) STDCCC R3, (R4)
BNE -3(PC) BNE -3(PC)
ISYNC
MOVD R3, ret+16(FP) MOVD R3, ret+16(FP)
RET RET
TEXT runtimeinternalatomic·Xchg(SB), NOSPLIT, $0-20 TEXT runtimeinternalatomic·Xchg(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R4 MOVD ptr+0(FP), R4
MOVW new+8(FP), R5 MOVW new+8(FP), R5
SYNC LWSYNC
LWAR (R4), R3 LWAR (R4), R3
STWCCC R5, (R4) STWCCC R5, (R4)
BNE -2(PC) BNE -2(PC)
@ -132,7 +130,7 @@ TEXT runtimeinternalatomic·Xchg(SB), NOSPLIT, $0-20
TEXT runtimeinternalatomic·Xchg64(SB), NOSPLIT, $0-24 TEXT runtimeinternalatomic·Xchg64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R4 MOVD ptr+0(FP), R4
MOVD new+8(FP), R5 MOVD new+8(FP), R5
SYNC LWSYNC
LDAR (R4), R3 LDAR (R4), R3
STDCCC R5, (R4) STDCCC R5, (R4)
BNE -2(PC) BNE -2(PC)
@ -165,24 +163,22 @@ TEXT runtimeinternalatomic·Store64(SB), NOSPLIT, $0-16
TEXT runtimeinternalatomic·Or8(SB), NOSPLIT, $0-9 TEXT runtimeinternalatomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3 MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4 MOVBZ val+8(FP), R4
SYNC LWSYNC
again: again:
LBAR (R3), R6 LBAR (R3), R6
OR R4, R6 OR R4, R6
STBCCC R6, (R3) STBCCC R6, (R3)
BNE again BNE again
ISYNC
RET RET
// void runtimeinternalatomic·And8(byte volatile*, byte); // void runtimeinternalatomic·And8(byte volatile*, byte);
TEXT runtimeinternalatomic·And8(SB), NOSPLIT, $0-9 TEXT runtimeinternalatomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3 MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4 MOVBZ val+8(FP), R4
SYNC LWSYNC
again: again:
LBAR (R3),R6 LBAR (R3),R6
AND R4,R6 AND R4,R6
STBCCC R6,(R3) STBCCC R6,(R3)
BNE again BNE again
ISYNC
RET RET

View File

@ -12,7 +12,7 @@ TEXT ·SwapInt32(SB),NOSPLIT,$0-20
TEXT ·SwapUint32(SB),NOSPLIT,$0-20 TEXT ·SwapUint32(SB),NOSPLIT,$0-20
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVW new+8(FP), R4 MOVW new+8(FP), R4
SYNC LWSYNC
LWAR (R3), R5 LWAR (R3), R5
STWCCC R4, (R3) STWCCC R4, (R3)
BNE -2(PC) BNE -2(PC)
@ -26,7 +26,7 @@ TEXT ·SwapInt64(SB),NOSPLIT,$0-24
TEXT ·SwapUint64(SB),NOSPLIT,$0-24 TEXT ·SwapUint64(SB),NOSPLIT,$0-24
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVD new+8(FP), R4 MOVD new+8(FP), R4
SYNC LWSYNC
LDAR (R3), R5 LDAR (R3), R5
STDCCC R4, (R3) STDCCC R4, (R3)
BNE -2(PC) BNE -2(PC)
@ -44,13 +44,13 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVW old+8(FP), R4 MOVW old+8(FP), R4
MOVW new+12(FP), R5 MOVW new+12(FP), R5
SYNC LWSYNC
LWAR (R3), R6 LWAR (R3), R6
CMPW R6, R4 CMPW R6, R4
BNE 7(PC) BNE 7(PC)
STWCCC R5, (R3) STWCCC R5, (R3)
BNE -4(PC) BNE -4(PC)
ISYNC LWSYNC
MOVD $1, R3 MOVD $1, R3
MOVB R3, swapped+16(FP) MOVB R3, swapped+16(FP)
RET RET
@ -67,13 +67,13 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVD old+8(FP), R4 MOVD old+8(FP), R4
MOVD new+16(FP), R5 MOVD new+16(FP), R5
SYNC LWSYNC
LDAR (R3), R6 LDAR (R3), R6
CMP R6, R4 CMP R6, R4
BNE 7(PC) BNE 7(PC)
STDCCC R5, (R3) STDCCC R5, (R3)
BNE -4(PC) BNE -4(PC)
ISYNC LWSYNC
MOVD $1, R3 MOVD $1, R3
MOVB R3, swapped+24(FP) MOVB R3, swapped+24(FP)
RET RET
@ -86,12 +86,11 @@ TEXT ·AddInt32(SB),NOSPLIT,$0-20
TEXT ·AddUint32(SB),NOSPLIT,$0-20 TEXT ·AddUint32(SB),NOSPLIT,$0-20
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVW delta+8(FP), R4 MOVW delta+8(FP), R4
SYNC LWSYNC
LWAR (R3), R5 LWAR (R3), R5
ADD R4, R5 ADD R4, R5
STWCCC R5, (R3) STWCCC R5, (R3)
BNE -3(PC) BNE -3(PC)
ISYNC
MOVW R5, new+16(FP) MOVW R5, new+16(FP)
RET RET
@ -104,12 +103,11 @@ TEXT ·AddInt64(SB),NOSPLIT,$0-24
TEXT ·AddUint64(SB),NOSPLIT,$0-24 TEXT ·AddUint64(SB),NOSPLIT,$0-24
MOVD addr+0(FP), R3 MOVD addr+0(FP), R3
MOVD delta+8(FP), R4 MOVD delta+8(FP), R4
SYNC LWSYNC
LDAR (R3), R5 LDAR (R3), R5
ADD R4, R5 ADD R4, R5
STDCCC R5, (R3) STDCCC R5, (R3)
BNE -3(PC) BNE -3(PC)
ISYNC
MOVD R5, new+16(FP) MOVD R5, new+16(FP)
RET RET