
cmd/asm, cmd/internal/obj/ppc64: Add ISA 2.05, 2.06 and 2.07 instructions.

This change adds instructions from ISA 2.05, 2.06 and 2.07 that are frequently
used in assembly optimizations for ppc64.

It also fixes two problems:

  * the implementation of RLDICR[CC]/RLDICL[CC] did not consider all possible
  cases for the bit mask (see the encoding sketch below).
  * two nonexistent instructions (VORL/VANDL) that had been added by mistake in
  the VMX implementation were removed.
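
As a rough illustration of what the mask fix is about, here is a minimal,
self-contained Go sketch of MD-form field packing. It mirrors the shape of the
AOP_RLDIC helper added in this change, but the mdForm name, the rldicl constant
and the sample operands are only assumptions made for the example, not part of
the patch:

	// Illustrative sketch only: how an MD-form rotate (rldicl/rldicr) packs
	// its shift and mask-boundary operands. Each operand is six bits wide,
	// so its low five bits and its high bit land in separate fields of the
	// instruction word; dropping the high bits silently encodes a different
	// rotate, which is the class of case the fix covers.
	package main

	import "fmt"

	func mdForm(op, ra, rs, sh, m uint32) uint32 {
		return op | (rs&31)<<21 | (ra&31)<<16 |
			(sh&31)<<11 | ((sh&32)>>5)<<1 |
			(m&31)<<6 | ((m&32)>>5)<<5
	}

	func main() {
		const rldicl = 30 << 26 // primary opcode 30, XO 0 selects rldicl
		// Sample operands where both SH (34) and MB (33) exceed 31, so the
		// high bit of each field matters.
		fmt.Printf("%#08x\n", mdForm(rldicl, 3, 4, 34, 33))
	}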

Change-Id: Iaef4e5c6a5240c2156c6c0f28ad3bcd8780e9830
Reviewed-on: https://go-review.googlesource.com/36230
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Authored by Carlos Eduardo Seo on 2017-02-02 17:59:18 -02:00; committed by Lynn Boger
parent 06637fb314
commit 85ecc51c48
4 changed files with 191 additions and 24 deletions


@ -540,6 +540,14 @@ label1:
// }
// FCMPU F1, F2, CR0
// FTDIV FRA, FRB, BF produces
// ftdiv BF, FRA, FRB
FTDIV F1,F2,$7
// FTSQRT FRB, BF produces
// ftsqrt BF, FRB
FTSQRT F2,$7
//
// CMP
//
@ -567,6 +575,10 @@ label1:
// }
CMP R1, $4, CR0 // CMP R1, CR0, $4
// CMPB RS,RB,RA produces
// cmpb RA,RS,RB
CMPB R2,R2,R1
//
// rotate and mask
//
@ -673,6 +685,13 @@ label1:
DCBF (R1)
DCBF (R1+R2) // DCBF (R1)(R2*1)
// Population count, X-form
// <MNEMONIC> RS,RA produces
// <mnemonic> RA,RS
POPCNTD R1,R2
POPCNTW R1,R2
POPCNTB R1,R2
// VMX instructions
// Described as:
@ -703,14 +722,14 @@ label1:
// Vector AND, VX-form
// <MNEMONIC> VRA,VRB,VRT produces
// <mnemonic> VRT,VRA,VRB
VANDL V10, V9, V8
VAND V10, V9, V8
VANDC V15, V14, V13
VNAND V19, V18, V17
// Vector OR, VX-form
// <MNEMONIC> VRA,VRB,VRT produces
// <mnemonic> VRT,VRA,VRB
VORL V26, V25, V24
VOR V26, V25, V24
VORC V23, V22, V21
VNOR V20, V19, V18
VXOR V17, V16, V15
@ -739,6 +758,14 @@ label1:
VADDEUQM V4, V3, V2, V1
VADDECUQ V4, V3, V2, V1
// Vector polynomial multiply-sum, VX-form
// <MNEMONIC> VRA,VRB,VRT produces
// <mnemonic> VRT,VRA,VRB
VPMSUMB V2, V3, V1
VPMSUMH V2, V3, V1
VPMSUMW V2, V3, V1
VPMSUMD V2, V3, V1
// Vector SUB, VX-form
// <MNEMONIC> VRA,VRB,VRT produces
// <mnemonic> VRT,VRA,VRB


@ -626,6 +626,9 @@ const (
ACNTLZDCC
ACMPW /* CMP with L=0 */
ACMPWU
ACMPB
AFTDIV
AFTSQRT
ADIVD
ADIVDCC
ADIVDE
@ -704,6 +707,9 @@ const (
/* more 64-bit operations */
AHRFID
APOPCNTD
APOPCNTW
APOPCNTB
/* Vector */
ALV
@ -721,11 +727,9 @@ const (
ASTVX
ASTVXL
AVAND
AVANDL
AVANDC
AVNAND
AVOR
AVORL
AVORC
AVNOR
AVXOR
@ -770,6 +774,11 @@ const (
AVSUBE
AVSUBEUQM
AVSUBECUQ
AVPMSUM
AVPMSUMB
AVPMSUMH
AVPMSUMW
AVPMSUMD
AVR
AVRLB
AVRLH
@ -871,9 +880,13 @@ const (
ASTXSIWX
AMFVSR
AMFVSRD
AMFFPRD
AMFVRD
AMFVSRWZ
AMTVSR
AMTVSRD
AMTFPRD
AMTVRD
AMTVSRWA
AMTVSRWZ
AXXLAND


@ -248,6 +248,9 @@ var Anames = []string{
"CNTLZDCC",
"CMPW",
"CMPWU",
"CMPB",
"FTDIV",
"FTSQRT",
"DIVD",
"DIVDCC",
"DIVDE",
@ -321,6 +324,9 @@ var Anames = []string{
"REMDUV",
"REMDUVCC",
"HRFID",
"POPCNTD",
"POPCNTW",
"POPCNTB",
"LV",
"LVEBX",
"LVEHX",
@ -336,11 +342,9 @@ var Anames = []string{
"STVX",
"STVXL",
"VAND",
"VANDL",
"VANDC",
"VNAND",
"VOR",
"VORL",
"VORC",
"VNOR",
"VXOR",
@ -385,6 +389,11 @@ var Anames = []string{
"VSUBE",
"VSUBEUQM",
"VSUBECUQ",
"VPMSUM",
"VPMSUMB",
"VPMSUMH",
"VPMSUMW",
"VPMSUMD",
"VR",
"VRLB",
"VRLH",
@ -484,9 +493,13 @@ var Anames = []string{
"STXSIWX",
"MFVSR",
"MFVSRD",
"MFFPRD",
"MFVRD",
"MFVSRWZ",
"MTVSR",
"MTVSRD",
"MTFPRD",
"MTVRD",
"MTVSRWA",
"MTVSRWZ",
"XXLAND",


@ -346,6 +346,12 @@ var optab = []Optab{
{AMOVD, C_REG, C_NONE, C_NONE, C_MSR, 54, 4, 0}, /* mtmsrd */
{AMOVWZ, C_REG, C_NONE, C_NONE, C_MSR, 54, 4, 0}, /* mtmsr */
/* Other ISA 2.05+ instructions */
{APOPCNTD, C_REG, C_NONE, C_NONE, C_REG, 93, 4, 0}, /* population count, x-form */
{ACMPB, C_REG, C_REG, C_NONE, C_REG, 92, 4, 0}, /* compare byte, x-form */
{AFTDIV, C_FREG, C_FREG, C_NONE, C_SCON, 92, 4, 0}, /* floating test for sw divide, x-form */
{AFTSQRT, C_FREG, C_NONE, C_NONE, C_SCON, 93, 4, 0}, /* floating test for sw square root, x-form */
/* Vector instructions */
/* Vector load */
@ -372,6 +378,9 @@ var optab = []Optab{
{AVSUBSS, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector subtract signed saturate, vx-form */
{AVSUBE, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector subtract extended, va-form */
/* Vector multiply */
{AVPMSUM, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector polynomial multiply & sum, vx-form */
/* Vector rotate */
{AVR, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector rotate, vx-form */
@ -428,9 +437,13 @@ var optab = []Optab{
/* VSX move from VSR */
{AMFVSR, C_VSREG, C_NONE, C_NONE, C_REG, 88, 4, 0}, /* vsx move from vsr, xx1-form */
{AMFVSR, C_FREG, C_NONE, C_NONE, C_REG, 88, 4, 0},
{AMFVSR, C_VREG, C_NONE, C_NONE, C_REG, 88, 4, 0},
/* VSX move to VSR */
{AMTVSR, C_REG, C_NONE, C_NONE, C_VSREG, 88, 4, 0}, /* vsx move to vsr, xx1-form */
{AMTVSR, C_REG, C_NONE, C_NONE, C_FREG, 88, 4, 0},
{AMTVSR, C_REG, C_NONE, C_NONE, C_VREG, 88, 4, 0},
/* VSX logical */
{AXXLAND, C_VSREG, C_VSREG, C_NONE, C_VSREG, 90, 4, 0}, /* vsx and, xx3-form */
@ -1161,6 +1174,10 @@ func buildop(ctxt *obj.Link) {
opset(ADIVDUVCC, r0)
opset(ADIVDUCC, r0)
case APOPCNTD:
opset(APOPCNTW, r0)
opset(APOPCNTB, r0)
case AMOVBZ: /* lbz, stz, rlwm(r/r), lhz, lha, stz, and x variants */
opset(AMOVH, r0)
@ -1192,12 +1209,12 @@ func buildop(ctxt *obj.Link) {
opset(ASTVXL, r0)
case AVAND: /* vand, vandc, vnand */
opset(AVANDL, r0)
opset(AVAND, r0)
opset(AVANDC, r0)
opset(AVNAND, r0)
case AVOR: /* vor, vorc, vxor, vnor, veqv */
opset(AVORL, r0)
opset(AVOR, r0)
opset(AVORC, r0)
opset(AVXOR, r0)
opset(AVNOR, r0)
@ -1253,6 +1270,12 @@ func buildop(ctxt *obj.Link) {
opset(AVSUBEUQM, r0)
opset(AVSUBECUQ, r0)
case AVPMSUM: /* vpmsumb, vpmsumh, vpmsumw, vpmsumd */
opset(AVPMSUMB, r0)
opset(AVPMSUMH, r0)
opset(AVPMSUMW, r0)
opset(AVPMSUMD, r0)
case AVR: /* vrlb, vrlh, vrlw, vrld */
opset(AVRLB, r0)
opset(AVRLH, r0)
@ -1375,12 +1398,16 @@ func buildop(ctxt *obj.Link) {
case ASTXSI: /* stxsiwx */
opset(ASTXSIWX, r0)
case AMFVSR: /* mfvsrd, mfvsrwz */
case AMFVSR: /* mfvsrd, mfvsrwz (and extended mnemonics) */
opset(AMFVSRD, r0)
opset(AMFFPRD, r0)
opset(AMFVRD, r0)
opset(AMFVSRWZ, r0)
case AMTVSR: /* mtvsrd, mtvsrwa, mtvsrwz */
case AMTVSR: /* mtvsrd, mtvsrwa, mtvsrwz (and extended mnemonics) */
opset(AMTVSRD, r0)
opset(AMTFPRD, r0)
opset(AMTVRD, r0)
opset(AMTVSRWA, r0)
opset(AMTVSRWZ, r0)
@ -1710,6 +1737,15 @@ func buildop(ctxt *obj.Link) {
case ACMPU:
opset(ACMPWU, r0)
case ACMPB:
opset(ACMPB, r0)
case AFTDIV:
opset(AFTDIV, r0)
case AFTSQRT:
opset(AFTSQRT, r0)
case AADD,
AANDCC, /* and. Rb,Rs,Ra; andi. $uimm,Rs,Ra; andis. $uimm,Rs,Ra */
AFMOVSX,
@ -1783,7 +1819,7 @@ func AOP_RRR(op uint32, d uint32, a uint32, b uint32) uint32 {
return op | (d&31)<<21 | (a&31)<<16 | (b&31)<<11
}
/* VX-form 2-register operands, r/r/none */
/* VX-form 2-register operands, r/none/r */
func AOP_RR(op uint32, d uint32, a uint32) uint32 {
return op | (d&31)<<21 | (a&31)<<11
}
@ -1881,6 +1917,10 @@ func OP_RLW(op uint32, a uint32, s uint32, sh uint32, mb uint32, me uint32) uint
return op | (s&31)<<21 | (a&31)<<16 | (sh&31)<<11 | (mb&31)<<6 | (me&31)<<1
}
func AOP_RLDIC(op uint32, a uint32, s uint32, sh uint32, m uint32) uint32 {
return op | (s&31)<<21 | (a&31)<<16 | (sh&31)<<11 | ((sh&32)>>5)<<1 | (m&31)<<6 | ((m&32)>>5)<<5
}
func AOP_ISEL(op uint32, t uint32, a uint32, b uint32, bc uint32) uint32 {
return op | (t&31)<<21 | (a&31)<<16 | (b&31)<<11 | (bc&0x1F)<<6
}
@ -2353,6 +2393,11 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
if mask[1] != 63 {
ctxt.Diag("invalid mask for rotate: %x (end != bit 63)\n%v", uint64(d), p)
}
o1 = LOP_RRR(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))
o1 |= (uint32(a) & 31) << 6
if a&0x20 != 0 {
o1 |= 1 << 5 /* mb[5] is top bit */
}
case ARLDCR, ARLDCRCC:
var mask [2]uint8
@ -2362,22 +2407,28 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
if mask[0] != 0 {
ctxt.Diag("invalid mask for rotate: %x (start != 0)\n%v", uint64(d), p)
}
o1 = LOP_RRR(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))
o1 |= (uint32(a) & 31) << 6
if a&0x20 != 0 {
o1 |= 1 << 5 /* mb[5] is top bit */
}
// These opcodes use a shift count like the ppc64 asm, no mask conversion done
case ARLDICR, ARLDICRCC, ARLDICL, ARLDICLCC:
a = int(d)
case ARLDICR, ARLDICRCC:
me := int(d)
sh := regoff(ctxt, &p.From)
o1 = AOP_RLDIC(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(r), uint32(sh), uint32(me))
case ARLDICL, ARLDICLCC:
mb := int(d)
sh := regoff(ctxt, &p.From)
o1 = AOP_RLDIC(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(r), uint32(sh), uint32(mb))
default:
ctxt.Diag("unexpected op in rldc case\n%v", p)
a = 0
}
o1 = LOP_RRR(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))
o1 |= (uint32(a) & 31) << 6
if a&0x20 != 0 {
o1 |= 1 << 5 /* mb[5] is top bit */
}
case 17, /* bc bo,bi,lbra (same for now) */
16: /* bc bo,bi,sbra */
a := 0
@ -3170,8 +3221,24 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
/* 2-register operand order: XS, RA or RA, XT */
xt := int32(p.To.Reg)
xs := int32(p.From.Reg)
if REG_VS0 <= xt && xt <= REG_VS63 {
/* We need to treat the special case of extended mnemonics that may have a FREG/VREG as an argument */
if REG_V0 <= xt && xt <= REG_V31 {
/* Convert V0-V31 to VS32-VS63 */
xt = xt + 64
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg))
} else if REG_F0 <= xt && xt <= REG_F31 {
/* Convert F0-F31 to VS0-VS31 */
xt = xt + 64
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg))
} else if REG_VS0 <= xt && xt <= REG_VS63 {
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg))
} else if REG_V0 <= xs && xs <= REG_V31 {
/* Likewise for XS */
xs = xs + 64
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
} else if REG_F0 <= xs && xs <= REG_F31 {
xs = xs + 64
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
} else if REG_VS0 <= xs && xs <= REG_VS63 {
o1 = AOP_XX1(oprrr(ctxt, p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
}
@ -3199,6 +3266,30 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
/* 3-register operand order: XA, XB, XC, XT */
o1 = AOP_XX4(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), uint32(p.From3.Reg))
case 92: /* X-form instructions, 3-operands */
if p.To.Type == obj.TYPE_CONST {
/* imm reg reg */
/* operand order: FRA, FRB, BF */
bf := int(regoff(ctxt, &p.To)) << 2
o1 = AOP_RRR(opirr(ctxt, p.As), uint32(bf), uint32(p.From.Reg), uint32(p.Reg))
} else if p.To.Type == obj.TYPE_REG {
/* reg reg reg */
/* operand order: RS, RB, RA */
o1 = AOP_RRR(oprrr(ctxt, p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
}
case 93: /* X-form instructions, 2-operands */
if p.To.Type == obj.TYPE_CONST {
/* imm reg */
/* operand order: FRB, BF */
bf := int(regoff(ctxt, &p.To)) << 2
o1 = AOP_RR(opirr(ctxt, p.As), uint32(bf), uint32(p.From.Reg))
} else if p.Reg == 0 {
/* popcnt* r,r, X-form */
/* operand order: RS, RA */
o1 = AOP_RRR(oprrr(ctxt, p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
}
}
out[0] = o1
@ -3281,6 +3372,8 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 0, 0, 0) /* L=0 */
case ACMPWU:
return OPVCC(31, 32, 0, 0)
case ACMPB:
return OPVCC(31, 508, 0, 0) /* cmpb - v2.05 */
case ACNTLZW:
return OPVCC(31, 26, 0, 0)
@ -3621,6 +3714,13 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
case AORNCC:
return OPVCC(31, 412, 0, 1)
case APOPCNTD:
return OPVCC(31, 506, 0, 0) /* popcntd - v2.06 */
case APOPCNTW:
return OPVCC(31, 378, 0, 0) /* popcntw - v2.06 */
case APOPCNTB:
return OPVCC(31, 122, 0, 0) /* popcntb - v2.02 */
case ARFI:
return OPVCC(19, 50, 0, 0)
case ARFCI:
@ -3757,14 +3857,14 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
/* Vector (VMX/Altivec) instructions */
/* ISA 2.03 enables these for PPC970. For POWERx processors, these */
/* are enabled starting at POWER6 (ISA 2.05). */
case AVANDL:
case AVAND:
return OPVX(4, 1028, 0, 0) /* vand - v2.03 */
case AVANDC:
return OPVX(4, 1092, 0, 0) /* vandc - v2.03 */
case AVNAND:
return OPVX(4, 1412, 0, 0) /* vnand - v2.07 */
case AVORL:
case AVOR:
return OPVX(4, 1156, 0, 0) /* vor - v2.03 */
case AVORC:
return OPVX(4, 1348, 0, 0) /* vorc - v2.07 */
@ -3810,6 +3910,15 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
case AVADDECUQ:
return OPVX(4, 61, 0, 0) /* vaddecuq - v2.07 */
case AVPMSUMB:
return OPVX(4, 1032, 0, 0) /* vpmsumb - v2.07 */
case AVPMSUMH:
return OPVX(4, 1096, 0, 0) /* vpmsumh - v2.07 */
case AVPMSUMW:
return OPVX(4, 1160, 0, 0) /* vpmsumw - v2.07 */
case AVPMSUMD:
return OPVX(4, 1224, 0, 0) /* vpmsumd - v2.07 */
case AVSUBUBM:
return OPVX(4, 1024, 0, 0) /* vsububm - v2.03 */
case AVSUBUHM:
@ -3976,12 +4085,12 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
/* Vector scalar (VSX) instructions */
/* ISA 2.06 enables these for POWER7. */
case AMFVSRD:
case AMFVSRD, AMFVRD, AMFFPRD:
return OPVXX1(31, 51, 0) /* mfvsrd - v2.07 */
case AMFVSRWZ:
return OPVXX1(31, 115, 0) /* mfvsrwz - v2.07 */
case AMTVSRD:
case AMTVSRD, AMTFPRD, AMTVRD:
return OPVXX1(31, 179, 0) /* mtvsrd - v2.07 */
case AMTVSRWA:
return OPVXX1(31, 211, 0) /* mtvsrwa - v2.07 */
@ -4260,6 +4369,11 @@ func opirr(ctxt *obj.Link, a obj.As) uint32 {
return OPVX(4, 908, 0, 0) /* vspltisw - v2.03 */
/* End of vector instructions */
case AFTDIV:
return OPVCC(63, 128, 0, 0) /* ftdiv - v2.06 */
case AFTSQRT:
return OPVCC(63, 160, 0, 0) /* ftsqrt - v2.06 */
case AXOR:
return OPVCC(26, 0, 0, 0) /* XORIL */
case -AXOR: