cmd/internal/obj/arm64: summarize the Go assembly syntax and the GNU syntax mapping rules

The patch rewrites the content of doc.go file. The file describes some general rules of the mapping between Go assembly syntax and GNU syntax. And it gives some Go assembly examples and corresponding GNU assembly examples. The patch changes the doc.go to use standard doc comment format so that the link https://golang.org/cmd/internal/obj/arm64/ can display it. Assembly document framework is mainly contributed by Eric Fang <Eric.Fang@arm.com> Documentation work is contributed by Eric Fang and Fannie Zhang <Fannie.Zhang@arm.com> Change-Id: I8b3f6d6c6b91afdc2c44602e8f796beea905085e Reviewed-on: https://go-review.googlesource.com/102055 Reviewed-by: Rob Pike <r@golang.org>
2024-09-24 17:20:12 -06:00 · 2018-03-15 12:16:48 +00:00 · 2018-03-15 12:16:48 +00:00 · 9f10d28304
commit 9f10d28304
parent 25824c96dc
2 changed files with 252 additions and 328 deletions
--- a/doc/asm.html
+++ b/doc/asm.html
@ -738,6 +738,13 @@ The other codes are <code>-&gt;</code> (arithmetic right shift),
 The ARM64 port is in an experimental state.
 </p>

+<p>
+<code>R18</code> is the "platform register", reserved on the Apple platform.
+<code>R27</code> and <code>R28</code> are reserved by the compiler and linker.
+<code>R29</code> is the frame pointer.
+<code>R30</code> is the link register.
+</p>
+
 <p>
 Instruction modifiers are appended to the instruction following a period.
 The only modifiers are <code>P</code> (postincrement) and <code>W</code>
@ -752,11 +759,61 @@ Addressing modes:
 <ul>

 <li>
-<code>(R5, R6)</code>: Register pair for <code>LDP</code>/<code>STP</code>.
+<code>R0-&gt;16</code>
+<br>
+<code>R0&gt;&gt;16</code>
+<br>
+<code>R0&lt;&lt;16</code>
+<br>
+<code>R0@&gt;16</code>:
+These are the same as on the 32-bit ARM.
+</li>
+
+<li>
+<code>$(8&lt;&lt;12)</code>:
+Left shift the immediate value <code>8</code> by <code>12</code> bits.
+</li>
+
+<li>
+<code>8(R0)</code>:
+Add the value of <code>R0</code> and <code>8</code>.
+</li>
+
+<li>
+<code>(R2)(R0)</code>:
+The location at <code>R0</code> plus <code>R2</code>.
+</li>
+
+<li>
+<code>R0.UXTB</code>
+<br>
+<code>R0.UXTB&lt;&lt;imm</code>:
+<code>UXTB</code>: extract an 8-bit value from the low-order bits of <code>R0</code> and zero-extend it to the size of <code>R0</code>.
+<code>R0.UXTB&lt;&lt;imm</code>: left shift the result of <code>R0.UXTB</code> by <code>imm</code> bits.
+The <code>imm</code> value can be 0, 1, 2, 3, or 4.
+The other extensions include <code>UXTH</code> (16-bit), <code>UXTW</code> (32-bit), and <code>UXTX</code> (64-bit).
+</li>
+
+<li>
+<code>R0.SXTB</code>
+<br>
+<code>R0.SXTB&lt;&lt;imm</code>:
+<code>SXTB</code>: extract an 8-bit value from the low-order bits of <code>R0</code> and sign-extend it to the size of <code>R0</code>.
+<code>R0.SXTB&lt;&lt;imm</code>: left shift the result of <code>R0.SXTB</code> by <code>imm</code> bits.
+The <code>imm</code> value can be 0, 1, 2, 3, or 4.
+The other extensions include <code>SXTH</code> (16-bit), <code>SXTW</code> (32-bit), and <code>SXTX</code> (64-bit).
+</li>
+
+<li>
+<code>(R5, R6)</code>: Register pair for <code>LDAXP</code>/<code>LDP</code>/<code>LDXP</code>/<code>STLXP</code>/<code>STP</code>/<code>STP</code>.
 </li>

 </ul>

+<p>
+Reference: <a href="/pkg/cmd/internal/obj/arm64">Go ARM64 Assembly Instructions Reference Manual</a>
+</p>
+
 <h3 id="ppc64">64-bit PowerPC, a.k.a. ppc64</h3>

 <p>
--- a/src/cmd/internal/obj/arm64/doc.go
+++ b/src/cmd/internal/obj/arm64/doc.go
@ -1,334 +1,201 @@
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-package arm64
-
 /*
+Package arm64 implements an ARM64 assembler. Go assembly syntax is different from GNU ARM64
+syntax, but we can still follow the general rules to map between them.

-Go Assembly for ARM64 Reference Manual
-
-1. Alphabetical list of basic instructions
-    // TODO
-
-    LDARB: Load-Acquire Register Byte
-      LDARB	(<Rn>), <Rd>
-        Loads a byte from memory, zero-extends it and writes it to Rd.
-
-    LDARH: Load-Acquire Register Halfword
-      LDARH	(<Rn>), <Rd>
-        Loads a halfword from memory, zero-extends it and writes it to Rd.
-
-    LDAXP: Load-Acquire Exclusive Pair of Registers
-      LDAXP	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 64-bit doublewords from memory, and writes them to Rt1 and Rt2.
-
-    LDAXPW: Load-Acquire Exclusive Pair of Registers
-      LDAXPW	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 32-bit words from memory, and writes them to Rt1 and Rt2.
-
-    LDXP: 64-bit Load Exclusive Pair of Registers
-      LDXP	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 64-bit doublewords from memory, and writes them to Rt1 and Rt2.
-
-    LDXPW: 32-bit Load Exclusive Pair of Registers
-      LDXPW	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 32-bit words from memory, and writes them to Rt1 and Rt2.
-
-    MOVD|MOVW|MOVH|MOVHU|MOVB|MOVBU: Load Register (register offset)
-      MOVD	(Rn)(Rm.UXTW<<3), Rt
-      MOVD	(Rn)(Rm.SXTX), Rt
-      MOVD	(Rn)(Rm<<3), Rt
-      MOVD	(Rn)(Rm), Rt
-      MOVB|MOVBU	(Rn)(Rm.UXTW), Rt
-
-    MOVD|MOVW|MOVH|MOVB: Stote Register (register offset)
-      MOVD	Rt, (Rn)(Rm.UXTW<<3)
-      MOVD	Rt, (Rn)(Rm.SXTX)
-      MOVD	Rt, (Rn)(Rm)
-
-    PRFM: Prefetch Memory (immediate)
-      PRFM	imm(Rn), <prfop>
-        prfop is the prefetch operation and can have the following values:
-        PLDL1KEEP, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM,
-        PLIL1KEEP, PLIL1STRM, PLIL2KEEP, PLIL2STRM, PLIL3KEEP, PLIL3STRM,
-        PSTL1KEEP, PSTL1STRM, PSTL2KEEP, PSTL2STRM, PSTL3KEEP, PSTL3STRM.
-      PRFM	imm(Rn), $imm
-        $imm prefetch operation is encoded as an immediate.
-
-    STLRB: Store-Release Register Byte
-      STLRB	<Rd>, (<Rn>)
-        Stores a byte from Rd to a memory location from Rn.
-
-    STLRH: Store-Release Register Halfword
-      STLRH	<Rd>, (<Rn>)
-        Stores a halfword from Rd to a memory location from Rn.
-
-    STLXP: 64-bit Store-Release Exclusive Pair of registers
-      STLXP	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 64-bit doublewords from Rt1 and Rt2 to a memory location from Rn,
-        and returns in Rs a status value of 0 if the store was successful, or of 1 if
-        no store was performed.
-
-    STLXPW: 32-bit Store-Release Exclusive Pair of registers
-      STLXPW	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 32-bit words from Rt1 and Rt2 to a memory location from Rn, and
-        returns in Rs a status value of 0 if the store was successful, or of 1 if no
-        store was performed.
-
-    STXP: 64-bit Store Exclusive Pair of registers
-      STXP	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 64-bit doublewords from Rt1 and Rt2 to a memory location from Rn,
-        and returns in Rs a status value of 0 if the store was successful, or of 1 if
-        no store was performed.
-
-    STXPW: 32-bit Store Exclusive Pair of registers
-      STXPW	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 32-bit words from Rt1 and Rt2 to a memory location from Rn, and returns in
-        a Rs a status value of 0 if the store was successful, or of 1 if no store was performed.
-
-2. Alphabetical list of float-point instructions
-    // TODO
-
-    FMADDD: 64-bit floating-point fused Multiply-Add
-      FMADDD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        adds the product to <Fa>, and writes the result to <Fd>.
-
-    FMADDS: 32-bit floating-point fused Multiply-Add
-      FMADDS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        adds the product to <Fa>, and writes the result to <Fd>.
-
-    FMSUBD: 64-bit floating-point fused Multiply-Subtract
-      FMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        adds the product to <Fa>, and writes the result to <Fd>.
-
-    FMSUBS: 32-bit floating-point fused Multiply-Subtract
-      FMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        adds the product to <Fa>, and writes the result to <Fd>.
-
-    FNMADDD: 64-bit floating-point negated fused Multiply-Add
-      FNMADDD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
-
-    FNMADDS: 32-bit floating-point negated fused Multiply-Add
-      FNMADDS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
-
-    FNMSUBD: 64-bit floating-point negated fused Multiply-Subtract
-      FNMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
-
-    FNMSUBS: 32-bit floating-point negated fused Multiply-Subtract
-      FNMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
-
-3. Alphabetical list of SIMD instructions
-    VADD: Add (scalar)
-      VADD	<Vm>, <Vn>, <Vd>
-        Add corresponding low 64-bit elements in <Vm> and <Vn>,
-        place the result into low 64-bit element of <Vd>.
-
-    VADD: Add (vector).
-      VADD	<Vm>.T, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
-
-    VADDP: Add Pairwise (vector)
-      VADDP	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
-
-    VADDV: Add across Vector.
-      VADDV	<Vn>.<T>, Vd
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S4
-
-    VAND: Bitwise AND (vector)
-      VAND	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
-
-    VCMEQ: Compare bitwise Equal (vector)
-      VCMEQ	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
-
-    VDUP: Duplicate vector element to vector or scalar.
-      VDUP	<Vn>.<Ts>[index], <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S2, S4, D2
-        <Ts> Is an element size specifier and can have the following values:
-        B, H, S, D
-
-    VEOR: Bitwise exclusive OR (vector, register)
-      VEOR	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
-
-    VFMLA: Floating-point fused Multiply-Add to accumulator (vector)
-      VFMLA	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        S2, S4, D2
-
-    VFMLS: Floating-point fused Multiply-Subtract from accumulator (vector)
-      VFMLS	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        S2, S4, D2
-
-    VEXT:  Extracts vector elements from src SIMD registers to dst SIMD register
-      VEXT	$index, <Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> is an arrangment specifier and can be B8, B16
-        $index is the lowest numbered byte element to be exracted.
-
-    VLD1: Load multiple single-element structures
-      VLD1	(Rn), [<Vt>.<T>, <Vt2>.<T> ...]     // no offset
-      VLD1.P	imm(Rn), [<Vt>.<T>, <Vt2>.<T> ...]  // immediate offset variant
-      VLD1.P	(Rn)(Rm), [<Vt>.<T>, <Vt2>.<T> ...] // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-
-    VLD1: Load one single-element structure
-      VLD1	(Rn), <Vt>.<T>[index]     // no offset
-      VLD1.P	imm(Rn), <Vt>.<T>[index]  // immediate offset variant
-      VLD1.P	(Rn)(Rm), <Vt>.<T>[index] // register offset variant
-        <T> is an arrangement specifier and can have the following values:
-        B, H, S D
-
-    VMOV: move
-      VMOV	<Vn>.<T>[index], Rd // Move vector element to general-purpose register.
-        <T> Is a source width specifier and can have the following values:
-        B, H, S (Wd)
-        D (Xd)
-
-      VMOV	Rn, <Vd>.<T> // Duplicate general-purpose register to vector.
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4 (Wn)
-        D2 (Xn)
-
-      VMOV	<Vn>.<T>, <Vd>.<T> // Move vector.
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
-
-      VMOV	Rn, <Vd>.<T>[index] // Move general-purpose register to a vector element.
-        <T> Is a source width specifier and can have the following values:
-        B, H, S (Wd)
-        D (Xd)
-
-      VMOV	<Vn>.<T>[index], Vn  // Move vector element to scalar.
-        <T> Is an element size specifier and can have the following values:
-        B, H, S, D
-
-      VMOV	<Vn>.<T>[index], <Vd>.<T>[index] // Move vector element to another vector element.
-        <T> Is an element size specifier and can have the following values:
-        B, H, S, D
-
-    VMOVI: Move Immediate (vector).
-      VMOVI	$imm8, <Vd>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        8B, 16B
-
-    VMOVS: Load SIMD&FP Register (immediate offset). ARMv8: LDR (immediate, SIMD&FP)
-      Store SIMD&FP register (immediate offset). ARMv8: STR (immediate, SIMD&FP)
-      VMOVS	(Rn), Vn
-      VMOVS.W	imm(Rn), Vn
-      VMOVS.P	imm(Rn), Vn
-      VMOVS	Vn, (Rn)
-      VMOVS.W	Vn, imm(Rn)
-      VMOVS.P	Vn, imm(Rn)
-
-    VORR: Bitwise inclusive OR (vector, register)
-      VORR	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
-
-    VRBIT: Reverse bit order (vector)
-      VRBIT	<Vn>.<T>, <Vd>.<T>
-        <T> is an arrangment specifier and can be B8, B16
-
-    VREV32: Reverse elements in 32-bit words (vector).
-      REV32 <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8
-
-    VREV64: Reverse elements in 64-bit words (vector).
-      REV64 <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4
-
-    VSHL: Shift Left(immediate)
-      VSHL 	$shift, <Vn>.<T>, <Vd>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-        $shift Is the left shift amount
-
-    VST1: Store multiple single-element structures
-      VST1	[<Vt>.<T>, <Vt2>.<T> ...], (Rn)         // no offset
-      VST1.P	[<Vt>.<T>, <Vt2>.<T> ...], imm(Rn)      // immediate offset variant
-      VST1.P	[<Vt>.<T>, <Vt2>.<T> ...], (Rn)(Rm)     // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-
-    VSUB: Sub (scalar)
-      VSUB	<Vm>, <Vn>, <Vd>
-        Subtract low 64-bit element in <Vm> from the corresponding element in <Vn>,
-        place the result into low 64-bit element of <Vd>.
-
-    VUADDLV: Unsigned sum Long across Vector.
-      VUADDLV	<Vn>.<T>, Vd
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S4
-
-    VST1: Store one single-element structure
-      VST1	<Vt>.<T>.<Index>, (Rn)         // no offset
-      VST1.P	<Vt>.<T>.<Index>, imm(Rn)      // immediate offset variant
-      VST1.P	<Vt>.<T>.<Index>, (Rn)(Rm)     // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B, H, S, D
-
-    VUSHR: Unsigned shift right(immediate)
-      VUSHR	$shift, <Vn>.<T>, <Vm>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-        $shift is the right shift amount
-
-
-4. Alphabetical list of cryptographic extension instructions
-
-    VPMULL{2}: Polynomial multiply long.
-      VPMULL{2}	<Vm>.<Tb>, <Vn>.<Tb>, <Vd>.<Ta>
-        VPMULL multiplies corresponding elements in the lower half of the
-        vectors of two source SIMD registers and VPMULL{2} operates in the upper half.
-        <Ta> is an arrangement specifier, it can be H8, Q1
-        <Tb> is an arrangement specifier, it can be B8, B16, D1, D2
-
-    SHA1C, SHA1M, SHA1P: SHA1 hash update.
-      SHA1C	<Vm>.S4, Vn, Vd
-      SHA1M	<Vm>.S4, Vn, Vd
-      SHA1P	<Vm>.S4, Vn, Vd
-
-    SHA1H: SHA1 fixed rotate.
-      SHA1H	Vn, Vd
-
-    SHA1SU0:   SHA1 schedule update 0.
-    SHA256SU1: SHA256 schedule update 1.
-      SHA1SU0	<Vm>.S4, <Vn>.S4, <Vd>.S4
-      SHA256SU1	<Vm>.S4, <Vn>.S4, <Vd>.S4
-
-    SHA1SU1:   SHA1 schedule update 1.
-    SHA256SU0: SHA256 schedule update 0.
-      SHA1SU1	<Vn>.S4, <Vd>.S4
-      SHA256SU0	<Vn>.S4, <Vd>.S4
-
-    SHA256H, SHA256H2: SHA256 hash update.
-      SHA256H	<Vm>.S4, Vn, Vd
-      SHA256H2	<Vm>.S4, Vn, Vd
+Instructions mnemonics mapping rules

+1. Most instructions use width suffixes of instruction names to indicate operand width rather than
+using different register names.
+
+  Examples:
+    ADC R24, R14, R12          <=>     adc x12, x24
+    ADDW R26->24, R21, R15     <=>     add w15, w21, w26, asr #24
+    FCMPS F2, F3               <=>     fcmp s3, s2
+    FCMPD F2, F3               <=>     fcmp d3, d2
+    FCVTDH F2, F3              <=>     fcvt h3, d2
+
+2. Go uses .P and .W suffixes to indicate post-increment and pre-increment.
+
+  Examples:
+    MOVD.P -8(R10), R8         <=>      ldr x8, [x10],#-8
+    MOVB.W 16(R16), R10        <=>      ldr x10, [x16,#16]!
+
+3. Go uses a series of MOV instructions as load and store.
+
+64-bit variant ldr, str, stur => MOVD;
+32-bit variant str, stur, ldrsw => MOVW;
+32-bit variant ldr => MOVWU;
+ldrb => MOVBU; ldrh => MOVHU;
+ldrsb, sturb, strb => MOVB;
+ldrsh, sturh, strh =>  MOVH.
+
+4. Go moves conditions into opcode suffix, like BLT.
+
+5. Go adds a V prefix for most floating-point and SIMD instrutions except cryptographic extension
+instructions and floating-point(scalar) instructions.
+
+  Examples:
+    VADD V5.H8, V18.H8, V9.H8         <=>      add v9.8h, v18.8h, v5.8h
+    VLD1.P (R6)(R11), [V31.D1]        <=>      ld1 {v31.1d}, [x6], x11
+    VFMLA V29.S2, V20.S2, V14.S2      <=>      fmla v14.2s, v20.2s, v29.2s
+    AESD V22.B16, V19.B16             <=>      aesd v19.16b, v22.16b
+    SCVTFWS R3, F16                   <=>      scvtf s17, w6
+
+Special Cases.
+
+(1) umov is written as VMOV.
+
+(2) br is renamed JMP, blr is renamed CALL.
+
+(3) No need to add "W" suffix: LDARB, LDARH, LDAXRB, LDAXRH, LDTRH, LDXRB, LDXRH.
+
+  Examples:
+    VMOV V13.B[1], R20      <=>      mov x20, v13.b[1]
+    VMOV V13.H[1], R20      <=>      mov w20, v13.h[1]
+    JMP (R3)                <=>      br x3
+    CALL (R17)              <=>      blr x17
+    LDAXRB (R19), R16       <=>      ldaxrb w16, [x19]
+
+
+Register mapping rules
+
+1. All basic register names are written as Rn.
+
+2. Go uses ZR as the zero register and RSP as the stack pointer.
+
+3. Bn, Hn, Dn, Sn and Qn instructions are written as Fn in floating-point instructions and as Vn
+in SIMD instructions.
+
+
+Argument mapping rules
+
+1. The operands appear in left-to-right assignment order.
+
+Go reverses the arguments of most instructions.
+
+    Examples:
+      ADD R11.SXTB<<1, RSP, R25      <=>      add x25, sp, w11, sxtb #1
+      VADD V16, V19, V14             <=>      add d14, d19, d16
+
+Special Cases.
+
+(1) Argument order is the same as in the GNU ARM64 syntax: cbz, cbnz and some store instructions,
+such as str, stur, strb, sturb, strh, sturh stlr, stlrb. stlrh, st1.
+
+  Examples:
+    MOVD R29, 384(R19)    <=>    str x29, [x19,#384]
+    MOVB.P R30, 30(R4)    <=>    strb w30, [x4],#30
+    STLRH R21, (R18)      <=>    stlrh w21, [x18]
+
+(2) MADD, MADDW, MSUB, MSUBW, SMADDL, SMSUBL, UMADDL, UMSUBL <Rm>, <Ra>, <Rn>, <Rd>
+
+  Examples:
+    MADD R2, R30, R22, R6       <=>    madd x6, x22, x2, x30
+    SMSUBL R10, R3, R17, R27    <=>    smsubl x27, w17, w10, x3
+
+  Examples:
+    FMADDD F30, F20, F3, F29    <=>    fmadd d29, d3, d30, d20
+    FNMSUBS F7, F25, F7, F22    <=>    fnmsub s22, s7, s7, s25
+
+(4) BFI, BFXIL, SBFIZ, SBFX, UBFIZ, UBFX $<lsb>, <Rn>, $<width>, <Rd>
+
+  Examples:
+    BFIW $16, R20, $6, R0      <=>    bfi w0, w20, #16, #6
+    UBFIZ $34, R26, $5, R20    <=>    ubfiz x20, x26, #34, #5
+
+(5) FCCMPD, FCCMPS, FCCMPED, FCCMPES <cond>, Fm. Fn, $<nzcv>
+
+  Examples:
+    FCCMPD AL, F8, F26, $0     <=>    fccmp d26, d8, #0x0, al
+    FCCMPS VS, F29, F4, $4     <=>    fccmp s4, s29, #0x4, vs
+    FCCMPED LE, F20, F5, $13   <=>    fccmpe d5, d20, #0xd, le
+    FCCMPES NE, F26, F10, $0   <=>    fccmpe s10, s26, #0x0, ne
+
+(6) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, $<imm>, $<nzcv>
+
+  Examples:
+    CCMP MI, R22, $12, $13     <=>    ccmp x22, #0xc, #0xd, mi
+    CCMNW AL, R1, $11, $8      <=>    ccmn w1, #0xb, #0x8, al
+
+(7) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, <Rm>, $<nzcv>
+
+  Examples:
+    CCMN VS, R13, R22, $10     <=>    ccmn x13, x22, #0xa, vs
+    CCMPW HS, R18, R14, $11    <=>    ccmp w18, w14, #0xb, cs
+
+(9) CSEL, CSELW, CSNEG, CSNEGW, CSINC, CSINCW <cond>, <Rn>, <Rm>, <Rd> ;
+FCSELD, FCSELS <cond>, <Fn>, <Fm>, <Fd>
+
+  Examples:
+    CSEL GT, R0, R19, R1        <=>    csel x1, x0, x19, gt
+    CSNEGW GT, R7, R17, R8      <=>    csneg w8, w7, w17, gt
+    FCSELD EQ, F15, F18, F16    <=>    fcsel d16, d15, d18, eq
+
+(10) TBNZ, TBZ $<imm>, <Rt>, <label>
+
+
+(11) STLXR, STLXRW, STXR, STXRW, STLXRB, STLXRH, STXRB, STXRH  <Rf>, (<Rn|RSP>), <Rs>
+
+  Examples:
+    STLXR ZR, (R15), R16    <=>    stlxr w16, xzr, [x15]
+    STXRB R9, (R21), R18    <=>    stxrb w18, w9, [x21]
+
+(12) STLXP, STLXPW, STXP, STXPW (<Rf1>, <Rf2>), (<Rn|RSP>), <Rs>
+
+  Examples:
+    STLXP (R17, R18), (R4), R5      <=>    stlxp w5, x17, x18, [x4]
+    STXPW (R30, R25), (R22), R13    <=>    stxp w13, w30, w25, [x22]
+
+2. Expressions for special arguments.
+
+#<immediate> is written as $<immediate>.
+
+Optionally-shifted immedate.
+
+  Examples:
+    ADD $(3151<<12), R14, R20     <=>    add x20, x14, #0xc4f, lsl #12
+    ADDW $1864, R25, R6           <=>    add w6, w25, #0x748
+
+Optionally-shifted registers are written as <Rm>{<shift><amount>}.
+The <shift> can be <<(lsl), >>(lsr), ->(asr), @>(ror).
+
+  Examples:
+    ADD R19>>30, R10, R24     <=>    add x24, x10, x19, lsr #30
+    ADDW R26->24, R21, R15    <=>    add w15, w21, w26, asr #24
+
+Extended registers are written as <Rm>{.<extend>{<<<amount>}}.
+<extend> can be UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW or SXTX.
+
+  Examples:
+    ADDS R18.UXTB<<4, R9, R26     <=>    adds x26, x9, w18, uxtb #4
+    ADDSW R14.SXTX, R14, R6       <=>    adds w6, w14, w14, sxtx
+
+Memory references: [<Xn|SP>{,#0}] is written as (Rn|RSP), a base register and an immediate
+offset is written as imm(Rn|RSP), a base register and an offset register is written as (Rn|RSP)(Rm).
+
+  Examples:
+    LDAR (R22), R9                  <=>    ldar x9, [x22]
+    LDP 28(R17), (R15, R23)         <=>    ldp x15, x23, [x17,#28]
+    MOVWU (R4)(R12<<2), R8          <=>    ldr w8, [x4, x12, lsl #2]
+    MOVD (R7)(R11.UXTW<<3), R25     <=>    ldr x25, [x7,w11,uxtw #3]
+    MOVBU (R27)(R23), R14           <=>    ldrb w14, [x27,x23]
+
+Register pairs are written as (Rt1, Rt2).
+
+  Examples:
+    LDP.P -240(R11), (R12, R26)    <=>    ldp x12, x26, [x11],#-240
+
+Register with arrangement and register with arrangement and index.
+
+  Examples:
+    VADD V5.H8, V18.H8, V9.H8                     <=>    add v9.8h, v18.8h, v5.8h
+    VLD1 (R2), [V21.B16]                          <=>    ld1 {v21.16b}, [x2]
+    VST1.P V9.S[1], (R16)(R21)                    <=>    st1 {v9.s}[1], [x16], x28
+    VST1.P [V13.H8, V14.H8, V15.H8], (R3)(R14)    <=>    st1 {v13.8h-v15.8h}, [x3], x14
+    VST1.P [V14.D1, V15.D1], (R7)(R23)            <=>    st1 {v14.1d, v15.1d}, [x7], x23
 */
+package arm64