
sync/atomic, runtime/internal/atomic: improve ppc64x atomics

The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:

- For the sequences built around lwarx and stwcx (and the other sizes),
the code was:
    sync, lwarx, [optional op], stwcx, loop back to the sync, sync, isync
The leading sync is moved before (outside) the lwarx/stwcx loop and the
trailing sync is removed, so it becomes:
    sync, lwarx, [optional op], stwcx, loop back to the lwarx, isync
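
Condensed from the Xadd hunk in the diff below, the shape of that
change is:

	// before: the BNE loops back to the SYNC, and a second SYNC
	// precedes the ISYNC
	SYNC
	LWAR	(R4), R3
	ADD	R5, R3
	STWCCC	R3, (R4)
	BNE	-4(PC)
	SYNC
	ISYNC

	// after: the BNE loops back to the LWAR and the trailing SYNC
	// is gone
	SYNC
	LWAR	(R4), R3
	ADD	R5, R3
	STWCCC	R3, (R4)
	BNE	-3(PC)
	ISYNC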

- For Or8 and And8, the shifting and manipulation of the address down
to its containing aligned word were removed, and the loops were
changed to use lbarx/stbcx on the byte directly instead of register
shifting and xor followed by lwarx/stwcx.
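
Condensed from the Or8 hunk in the diff below (the big-endian pointer
adjustment under GOARCH_ppc64 is omitted), the loop goes from
operating on the containing word to operating on the byte itself:

	// before: word-align the address and shift the value into place
	RLDCR	$0, R3, $~3, R5		// R5 = word-aligned ptr
	RLDC	$3, R3, $(3*8), R6	// R6 = shift for the target byte
	SLD	R6, R4, R4		// shift val into position
again:
	SYNC
	LWAR	(R5), R6
	OR	R4, R6
	STWCCC	R6, (R5)
	BNE	again
	SYNC
	ISYNC

	// after: byte-sized reservation, no address manipulation
	SYNC
again:
	LBAR	(R3), R6
	OR	R4, R6
	STBCCC	R6, (R3)
	BNE	again
	ISYNC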

- New instructions LWSYNC, LBAR, and STBCCC were tested and added.
runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
instead of the WORD encoding.
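
In the publication barrier hunk below, that is a one-line change: the
encoding

	WORD	$0x7c2004ac	// LWSYNC

becomes simply

	LWSYNC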

Fixes #15469

Ran some of the benchmarks in the runtime and sync directories.
Some results varied from run to run, but the trend was improvement
based on the best times for base and new (columns: benchmark name,
base time, new time, percent change):

runtime.test:
BenchmarkChanNonblocking-128         0.88          0.89          +1.14%
BenchmarkChanUncontended-128         569           511           -10.19%
BenchmarkChanContended-128           63110         53231         -15.65%
BenchmarkChanSync-128                691           598           -13.46%
BenchmarkChanSyncWork-128            11355         11649         +2.59%
BenchmarkChanProdCons0-128           2402          2090          -12.99%
BenchmarkChanProdCons10-128          1348          1363          +1.11%
BenchmarkChanProdCons100-128         1002          746           -25.55%
BenchmarkChanProdConsWork0-128       2554          2720          +6.50%
BenchmarkChanProdConsWork10-128      1909          1804          -5.50%
BenchmarkChanProdConsWork100-128     1624          1580          -2.71%
BenchmarkChanCreation-128            237           212           -10.55%
BenchmarkChanSem-128                 705           667           -5.39%
BenchmarkChanPopular-128             5081190       4497566       -11.49%

BenchmarkCreateGoroutines-128             532           473           -11.09%
BenchmarkCreateGoroutinesParallel-128     35.0          34.7          -0.86%
BenchmarkCreateGoroutinesCapture-128      4923          4200          -14.69%

sync.test:
BenchmarkUncontendedSemaphore-128      112           94.2          -15.89%
BenchmarkContendedSemaphore-128        133           128           -3.76%
BenchmarkMutexUncontended-128          1.90          1.67          -12.11%
BenchmarkMutex-128                     353           310           -12.18%
BenchmarkMutexSlack-128                304           283           -6.91%
BenchmarkMutexWork-128                 554           541           -2.35%
BenchmarkMutexWorkSlack-128            567           556           -1.94%
BenchmarkMutexNoSpin-128               275           242           -12.00%
BenchmarkMutexSpin-128                 1129          1030          -8.77%
BenchmarkOnce-128                      1.08          0.96          -11.11%
BenchmarkPool-128                      29.8          27.4          -8.05%
BenchmarkPoolOverflow-128              40564         36583         -9.81%
BenchmarkSemaUncontended-128           3.14          2.63          -16.24%
BenchmarkSemaSyntNonblock-128          1087          1069          -1.66%
BenchmarkSemaSyntBlock-128             897           893           -0.45%
BenchmarkSemaWorkNonblock-128          1034          1028          -0.58%
BenchmarkSemaWorkBlock-128             949           886           -6.64%

Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Minux Ma <minux@golang.org>
Authored by Lynn Boger on 2016-04-28 07:16:08 -05:00; committed by Michael Munday
parent 0960c7c7eb
commit eeca3ba92f
7 changed files with 44 additions and 68 deletions


@@ -342,8 +342,10 @@ const (
 	AFSUBS
 	AFSUBSCC
 	AMOVMW
+	ALBAR
 	ALSW
 	ALWAR
+	ALWSYNC
 	AMOVWBR
 	AMOVB
 	AMOVBU
@@ -401,6 +403,7 @@ const (
 	ASRAW
 	ASRAWCC
 	ASRWCC
+	ASTBCCC
 	ASTSW
 	ASTWCCC
 	ASUB


@@ -118,8 +118,10 @@ var Anames = []string{
 	"FSUBS",
 	"FSUBSCC",
 	"MOVMW",
+	"LBAR",
 	"LSW",
 	"LWAR",
+	"LWSYNC",
 	"MOVWBR",
 	"MOVB",
 	"MOVBU",
@@ -177,6 +179,7 @@ var Anames = []string{
 	"SRAW",
 	"SRAWCC",
 	"SRWCC",
+	"STBCCC",
 	"STSW",
 	"STWCCC",
 	"SUB",


@@ -933,6 +933,7 @@ func buildop(ctxt *obj.Link) {
 	case AECOWX: /* indexed store: op s,(b+a); op s,(b) */
 		opset(ASTWCCC, r0)
+		opset(ASTBCCC, r0)
 		opset(ASTDCCC, r0)
@@ -1202,6 +1203,7 @@ func buildop(ctxt *obj.Link) {
 	case ASYNC:
 		opset(AISYNC, r0)
+		opset(ALWSYNC, r0)
 		opset(APTESYNC, r0)
 		opset(ATLBSYNC, r0)
@@ -1228,6 +1230,7 @@ func buildop(ctxt *obj.Link) {
 		opset(AFMOVSU, r0)
 	case AECIWX:
+		opset(ALBAR, r0)
 		opset(ALWAR, r0)
 		opset(ALDAR, r0)
@@ -3001,6 +3004,9 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
 	case ASYNC:
 		return OPVCC(31, 598, 0, 0)
+	case ALWSYNC:
+		return OPVCC(31, 598, 0, 0) | 1<<21
 	case APTESYNC:
 		return OPVCC(31, 598, 0, 0) | 2<<21
@@ -3246,6 +3252,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
 		return OPVCC(31, 311, 0, 0) /* lhzux */
 	case AECIWX:
 		return OPVCC(31, 310, 0, 0) /* eciwx */
+	case ALBAR:
+		return OPVCC(31, 52, 0, 0) /* lbarx */
 	case ALWAR:
 		return OPVCC(31, 20, 0, 0) /* lwarx */
 	case ALDAR:
@@ -3342,6 +3350,8 @@ func opstorex(ctxt *obj.Link, a obj.As) uint32 {
 		return OPVCC(31, 661, 0, 0) /* stswx */
 	case AMOVWBR:
 		return OPVCC(31, 662, 0, 0) /* stwbrx */
+	case ASTBCCC:
+		return OPVCC(31, 694, 0, 1) /* stbcx. */
 	case ASTWCCC:
 		return OPVCC(31, 150, 0, 1) /* stwcx. */
 	case ASTDCCC:


@@ -301,6 +301,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
 		}
 	case ALWAR,
+		ALBAR,
+		ASTBCCC,
 		ASTWCCC,
 		AECIWX,
 		AECOWX,
@@ -323,6 +325,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
 		ASYNC,
 		ATLBSYNC,
 		APTESYNC,
+		ALWSYNC,
 		ATW,
 		AWORD,
 		ARFI,


@@ -10,5 +10,5 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
 	// LWSYNC is the "export" barrier recommended by Power ISA
 	// v2.07 book II, appendix B.2.2.2.
 	// LWSYNC is a load/load, load/store, and store/store barrier.
-	WORD	$0x7c2004ac	// LWSYNC
+	LWSYNC
 	RET


@@ -17,21 +17,20 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
 	MOVD	ptr+0(FP), R3
 	MOVWZ	old+8(FP), R4
 	MOVWZ	new+12(FP), R5
-cas_again:
 	SYNC
+cas_again:
 	LWAR	(R3), R6
 	CMPW	R6, R4
 	BNE	cas_fail
 	STWCCC	R5, (R3)
 	BNE	cas_again
 	MOVD	$1, R3
-	SYNC
 	ISYNC
 	MOVB	R3, ret+16(FP)
 	RET
 cas_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
+	MOVB	R0, ret+16(FP)
+	RET
 
 // bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
 // Atomically:
@@ -45,21 +44,20 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
 	MOVD	ptr+0(FP), R3
 	MOVD	old+8(FP), R4
 	MOVD	new+16(FP), R5
-cas64_again:
 	SYNC
+cas64_again:
 	LDAR	(R3), R6
 	CMP	R6, R4
 	BNE	cas64_fail
 	STDCCC	R5, (R3)
 	BNE	cas64_again
 	MOVD	$1, R3
-	SYNC
 	ISYNC
 	MOVB	R3, ret+24(FP)
 	RET
 cas64_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
+	MOVB	R0, ret+24(FP)
+	RET
 
 TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
 	BR	runtime∕internal∕atomic·Cas64(SB)
@@ -103,8 +101,7 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
 	LWAR	(R4), R3
 	ADD	R5, R3
 	STWCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVW	R3, ret+16(FP)
 	RET
@@ -116,8 +113,7 @@ TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
 	LDAR	(R4), R3
 	ADD	R5, R3
 	STDCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVD	R3, ret+16(FP)
 	RET
@@ -128,8 +124,7 @@ TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
 	SYNC
 	LWAR	(R4), R3
 	STWCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVW	R3, ret+16(FP)
 	RET
@@ -140,8 +135,7 @@ TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
 	SYNC
 	LDAR	(R4), R3
 	STDCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVD	R3, ret+16(FP)
 	RET
@@ -171,26 +165,12 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6
-	SLD	R6, R4, R4
+	SYNC
 again:
-	SYNC
-	LWAR	(R5), R6
+	LBAR	(R3), R6
 	OR	R4, R6
-	STWCCC	R6, (R5)
+	STBCCC	R6, (R3)
 	BNE	again
-	SYNC
 	ISYNC
 	RET
@@ -198,28 +178,11 @@ again:
 TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6 | ^(0xFF << R6)
-	MOVD	$0xFF, R7
-	SLD	R6, R4
-	SLD	R6, R7
-	XOR	$-1, R7
-	OR	R7, R4
+	SYNC
 again:
-	SYNC
-	LWAR	(R5), R6
+	LBAR	(R3), R6
 	AND	R4, R6
-	STWCCC	R6, (R5)
+	STBCCC	R6, (R3)
 	BNE	again
-	SYNC
 	ISYNC
 	RET


@@ -15,8 +15,7 @@ TEXT ·SwapUint32(SB),NOSPLIT,$0-20
 	SYNC
 	LWAR	(R3), R5
 	STWCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVW	R5, old+16(FP)
 	RET
@@ -30,8 +29,7 @@ TEXT ·SwapUint64(SB),NOSPLIT,$0-24
 	SYNC
 	LDAR	(R3), R5
 	STDCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVD	R5, old+16(FP)
 	RET
@@ -49,10 +47,9 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
 	SYNC
 	LWAR	(R3), R6
 	CMPW	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STWCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+16(FP)
@@ -73,10 +70,9 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
 	SYNC
 	LDAR	(R3), R6
 	CMP	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STDCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+24(FP)
@@ -94,8 +90,7 @@ TEXT ·AddUint32(SB),NOSPLIT,$0-20
 	LWAR	(R3), R5
 	ADD	R4, R5
 	STWCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVW	R5, ret+16(FP)
 	RET
@@ -113,8 +108,7 @@ TEXT ·AddUint64(SB),NOSPLIT,$0-24
 	LDAR	(R3), R5
 	ADD	R4, R5
 	STDCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVD	R5, ret+16(FP)
 	RET