909 lines
31 KiB
C
909 lines
31 KiB
C
/*
|
|
* Copyright © 2012 Raspberry Pi Foundation
|
|
* Copyright © 2012 RISC OS Open Ltd
|
|
*
|
|
* Permission to use, copy, modify, distribute, and sell this software and its
|
|
* documentation for any purpose is hereby granted without fee, provided that
|
|
* the above copyright notice appear in all copies and that both that
|
|
* copyright notice and this permission notice appear in supporting
|
|
* documentation, and that the name of the copyright holders not be used in
|
|
* advertising or publicity pertaining to distribution of the software without
|
|
* specific, written prior permission. The copyright holders make no
|
|
* representations about the suitability of this software for any purpose. It
|
|
* is provided "as is" without express or implied warranty.
|
|
*
|
|
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
|
|
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
|
|
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
|
|
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
|
|
* SOFTWARE.
|
|
*
|
|
* Author: Ben Avison (bavison@riscosopen.org)
|
|
*
|
|
*/
|
|
|
|
/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
|
|
|
|
/*
 * Determine whether we put the arguments on the stack for debugging.
 * With DEBUG_PARAMS defined, generate_composite_function reserves nine
 * extra words on the stack and stores r0-r7 and pc into them.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune generated functions behavior. The flags come in pairs: the
 * zero-valued name documents the default, the non-zero name selects the
 * alternative.
 */
        /* bit 0: destination is also read (dst_r_bpp is set to dst_w_bpp) */
        .set    FLAG_DST_WRITEONLY,             0
        .set    FLAG_DST_READWRITE,             1
        /* bit 1: conditional parts are branched over rather than
         * emitted with a condition code suffix */
        .set    FLAG_COND_EXEC,                 0
        .set    FLAG_BRANCH_OVER,               2
        /* bit 2: flag-setting tests are repeated between conditional parts */
        .set    FLAG_PROCESS_PRESERVES_PSR,     0
        .set    FLAG_PROCESS_CORRUPTS_PSR,      4
        /* bit 3: the framework emits no pixst after process_tail */
        .set    FLAG_PROCESS_DOESNT_STORE,      0
        .set    FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
        /* bits 4-5: LINE_SAVED_REGS are pushed in the wide case (16),
         * in the medium and narrow cases (32), or in all of them (48) */
        .set    FLAG_NO_SPILL_LINE_VARS,        0
        .set    FLAG_SPILL_LINE_VARS_WIDE,      16
        .set    FLAG_SPILL_LINE_VARS_NON_WIDE,  32
        .set    FLAG_SPILL_LINE_VARS,           48
        /* bit 6: SCRATCH may carry the preload offset through the wide inner loop */
        .set    FLAG_PROCESS_CORRUPTS_SCRATCH,  0
        .set    FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 * Nine registers (r4-r11, lr) are pushed on entry; the debug build adds
 * a further nine words below them.
 */
#ifdef DEBUG_PARAMS
        .set    ARGS_STACK_OFFSET,              (9*4+9*4)
#else
        .set    ARGS_STACK_OFFSET,              (9*4)
#endif

/*
 * Constants for selecting preferable prefetch type.
 */
        .set    PREFETCH_TYPE_NONE,             0
        .set    PREFETCH_TYPE_STANDARD,         1
|
|
|
|
/*
|
|
* Definitions of macros for load/store of pixel data.
|
|
*/
|
|
|
|
/*
 * pixldst - emit a post-incrementing load or store of pixel data.
 *
 *   op        - "ld" or "st"; pasted with r/m to build ldr/str or ldm/stm
 *   cond      - condition code suffix (default al)
 *   numbytes  - size of the transfer in bytes: 16, 8, 4, 2 or 1
 *   reg0-reg3 - numeric suffixes selecting which WKn registers take part
 *   base      - address register, advanced by numbytes
 *   unaligned - when 1, the 16- and 8-byte sizes are done as a series of
 *               single-word transfers instead of one ldm/stm
 *
 * Register operands are formed by pasting "WK" with the register number
 * parameter (WK&reg0 and so on). Any other numbytes is an assembly error.
 */
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
        .error "unsupported size: numbytes"
 .endif
.endm
|
|
|
|
/*
 * pixst_baseupdated - store pixel data to the numbytes immediately below
 * base, leaving base itself unchanged (decrement-before store multiple
 * without writeback, or a negative immediate offset).
 *
 *   cond      - condition code suffix
 *   numbytes  - size of the store in bytes: 16, 8, 4, 2 or 1
 *   reg0-reg3 - numeric suffixes selecting which WKn registers are stored
 *   base      - address register pointing just past the destination bytes
 *
 * pixst selects this variant when FLAG_DST_READWRITE is set.
 * NOTE(review): presumably the destination pointer has by then already
 * been advanced by the load of the same pixels - confirm against the
 * process_head implementations.
 */
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
        .error "unsupported size: numbytes"
 .endif
.endm
|
|
|
|
/*
 * pixld - load numbytes of pixel data from [base] into the work registers
 * starting at WK<firstreg>, post-incrementing base. Thin wrapper that
 * expands firstreg into four consecutive register numbers for pixldst.
 */
.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm
|
|
|
|
/*
 * pixst - store numbytes of pixel data from the work registers starting at
 * WK<firstreg>. For read-write destinations the store goes to the bytes
 * just below base without moving it; otherwise a post-incrementing store
 * through pixldst is used.
 */
.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
|
|
|
|
/*
 * PF - prefix for prefetch-related instructions. The instruction "a" with
 * operands "x" is only assembled when the current function was generated
 * with PREFETCH_TYPE_STANDARD; otherwise nothing is emitted.
 */
.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
|
|
|
|
|
|
/*
 * preload_leading_step1 - first stage of the leading preloads.
 *
 *   bpp  - bits per pixel of the channel, 0 if the channel is not read
 *   ptr  - register that receives base rounded down to a 32-byte boundary
 *   base - base address register of the channel
 *
 * Emits prefetch_distance+1 PLDs at consecutive 32-byte offsets from ptr.
 */
.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm
|
|
|
|
/*
 * preload_leading_step2 - second stage of the leading preloads, used only
 * when the destination is not yet 16-byte aligned.
 *
 *   bpp       - bits per pixel of the channel, 0 if the channel is not read
 *   bpp_shift - log2 of bytes per pixel of the channel
 *   ptr       - cacheline-aligned pointer set up by preload_leading_step1
 *   base      - base address register of the channel (SRC, MASK or DST)
 *
 * On entry WK0 holds the number of leading destination bytes. Emits 0, 1
 * or 2 additional PLDs; corrupts SCRATCH and the flags.
 */
.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        /* Shift bit 5 into N and bit 6 into C, so the three ranges can
         * be told apart with two conditional branches */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
|
|
|
|
/*
 * IS_END_OF_GROUP(INDEX,SIZE) is non-zero when SIZE is less than 2, or
 * when bit (SIZE)/2 is set in the mask of trailing one bits of INDEX.
 */
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))

/*
 * preload_middle - inner-loop preload for one channel.
 *
 *   bpp                  - bits per pixel of the channel, 0 if not read
 *   base                 - base address register of the channel
 *   scratch_holds_offset - non-zero if SCRATCH already contains the offset
 *                          from base to the cacheline to preload; otherwise
 *                          SCRATCH is overwritten with the rounded-down base
 *
 * A PLD is only emitted on the sub-blocks (as counted by SUBBLOCK) that
 * IS_END_OF_GROUP selects.
 */
.macro preload_middle bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
|
|
|
|
/*
 * preload_trailing - final preloads for one channel, issued after the wide
 * inner loop has finished.
 *
 *   bpp       - bits per pixel of the channel, 0 if the channel is not read
 *   bpp_shift - log2 of bytes per pixel of the channel
 *   base      - base address register of the channel
 *
 * Reads X (and WK0 in the multi-fetch variant); corrupts SCRATCH, the
 * flags, and WK1 in the multi-fetch variant.
 */
.macro preload_trailing bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was greater and not equal to 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
|
|
|
|
|
|
/*
 * preload_line - preload every cacheline touched by one row of a channel.
 * Reads X; corrupts WK0, WK1 and the flags.
 */
.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, then
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        /* General case: WK0 walks cacheline by cacheline from the first
         * line of the row up to and including the last one (WK1) */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm
|
|
|
|
|
|
/*
 * conditional_process1_helper - emit head, optional X decrement, tail and
 * (unless the process macros store for themselves) the store for one
 * chunk of numbytes output bytes, every part carrying condition "cond".
 *
 *   decrementx - non-zero to subtract the pixel count of the chunk from X
 */
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm
|
|
|
|
/*
 * conditional_process1 - process one chunk only if condition "cond" holds.
 *
 * With FLAG_BRANCH_OVER the chunk is emitted unconditionally behind a
 * branch on the opposite condition (only mi, cs and ne are translated);
 * otherwise every instruction of the chunk is conditionally executed.
 */
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm
|
|
|
|
/*
 * conditional_process2 - process two chunks, each guarded by its own
 * condition, after running the flag-setting macro "test".
 *
 *   test                 - macro that sets the flags consumed by cond1/cond2
 *   cond1, cond2         - condition for the first / second chunk
 *   numbytes1, numbytes2 - output size in bytes of each chunk
 *   firstreg1, firstreg2 - first WK register number of each chunk
 *   decrementx           - non-zero to subtract the processed pixels from X
 */
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        /* The first chunk may have destroyed the flags: set them again */
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm
|
|
|
|
|
|
/*
 * test_bits_1_0_ptr - move bits 1 and 0 of WK0 into the C and N flags.
 * Corrupts SCRATCH.
 */
.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm
|
|
|
|
/*
 * test_bits_3_2_ptr - move bits 3 and 2 of WK0 into the C and N flags.
 * Corrupts SCRATCH.
 */
.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
.endm
|
|
|
|
/*
 * leading_15bytes - process the pixels in front of the first 16-byte
 * aligned destination address, in chunks of 1/2 bytes (as far as the
 * destination pixel size allows) followed by 4/8 bytes. X is decremented
 * by the number of pixels processed.
 */
.macro leading_15bytes process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
|
|
|
|
/*
 * test_bits_3_2_pix - move bits 3 and 2 of the remaining byte count
 * (X scaled by the destination pixel size) into the C and N flags.
 * Corrupts SCRATCH.
 */
.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm
|
|
|
|
/*
 * test_bits_1_0_pix - test the lowest bits of the remaining pixel count.
 * For 8 bpp destinations bits 1 and 0 of X end up in C and N; for other
 * destination sizes bit 0 of X is shifted into C. Corrupts SCRATCH.
 */
.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm
|
|
|
|
/*
 * trailing_15bytes - process the pixels left over after the last full
 * 16-byte block, in chunks of 8/4 bytes followed by 2/1 bytes (as far as
 * the destination pixel size allows). X is not updated.
 */
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm
|
|
|
|
|
|
/*
 * wide_case_inner_loop - default inner loop of the wide case. Each pass
 * handles pix_per_block pixels as a run of 16-byte sub-blocks, interleaving
 * the source/mask/destination preloads with the pixel processing, and
 * repeats while X stays non-negative after the decrement.
 *
 *   dst_alignment - byte amount subtracted from the destination preload
 *                   offset (the caller passes 16 or 0)
 */
.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm
|
|
|
|
/*
 * wide_case_inner_loop_and_trailing_pixels - body of the wide case once
 * the destination is 16-byte aligned: run the preloading inner loop, issue
 * the trailing preloads, then finish the row with the medium-case code.
 *
 * When the destination is read, two copies of the inner loop are emitted
 * and selected on bit 4 of DST, so that the destination preload offset can
 * be fixed at assembly time.
 */
.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm
|
|
|
|
/*
 * medium_case_inner_loop_and_trailing_pixels - process 16-byte blocks
 * without any preloading until X goes negative, then deal with whatever
 * is left via trailing_15bytes (branching straight to exit_label when
 * nothing is left).
 *
 *   unused - placeholder so the argument list matches the wide-case macro
 */
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm
|
|
|
|
/*
 * narrow_case_inner_loop_and_trailing_pixels - process at most one 16-byte
 * chunk (if the corresponding bit of X is set) followed by the trailing
 * chunks.
 *
 *   unused, exit_label - accepted only so the argument list matches the
 *                        wide- and medium-case macros
 */
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm
|
|
|
|
/*
 * switch_on_alignment - instantiate "action" once per combination of
 * source/mask word alignment that can occur, and dispatch to the right
 * copy at run time. Alignment is only tested for 8 and 16 bpp channels.
 * The last two arguments passed to action are the unaligned_src and
 * unaligned_mask indicators (0 = word aligned, 1 = not).
 *
 * Every copy except the last one emitted ends with a branch to
 * exit_label; the last one falls through.
 */
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
        /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        /* source aligned, mask aligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        /* source unaligned, mask aligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        /* source aligned, mask unaligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        /* source unaligned, mask unaligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm
|
|
|
|
|
|
/*
 * end_of_line - per-row epilogue: reload spilled variables, count down Y,
 * step the channel pointers to the next row and loop while rows remain.
 *
 *   restore_x    - non-zero to reload X from ORIG_W
 *   vars_spilled - non-zero if LINE_SAVED_REGS were pushed for this path
 *   loop_label   - label of the start of the per-row code
 *   last_one     - non-empty for the final path in the function, which is
 *                  followed directly by the function exit code
 *
 * The flags set by the SUBS on Y are still intact at the BHS, because none
 * of the instructions in between update them.
 */
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        /* Y is one of the spilled registers: keep its stack copy current */
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        /* Skip the stack adjustment at 197, nothing was pushed on this path */
        b       198f
  .endif
 .endif
.endm
|
|
|
|
|
|
/*
 * generate_composite_function - emit one complete compositing function.
 *
 *   fname              - name of the generated global function
 *   src_bpp_           - source bits per pixel (32, 24, 16, 8, or 0 if unused)
 *   mask_bpp_          - mask bits per pixel (32, 24, 8, or 0 if unused)
 *   dst_w_bpp_         - destination bits per pixel (32, 24, 16 or 8)
 *   flags_             - combination of the FLAG_* bits
 *   prefetch_distance_ - preload distance in cachelines; 0 disables all
 *                        PF-prefixed instructions
 *   init               - macro expanded once after the arguments are loaded
 *   newline            - macro expanded at the start of every row
 *   cleanup            - macro expanded once before the function returns
 *   process_head       - macro emitting the first half of pixel processing
 *   process_tail       - macro emitting the second half of pixel processing
 *   process_inner_loop - optional replacement for wide_case_inner_loop
 *
 * The generated code picks the wide, medium or narrow code path from the
 * row width and then runs that path for every row.
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 /* Shift amounts used to scale pixel counts for each channel */
 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 /* The destination only counts as a read channel if it is read-write */
 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 /* pix_per_block = the largest of: pixels per 16 bytes written, and
  * pixels per 32 bytes read on each channel that is read */
 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

fname:
        push    {r4-r11, lr}        /* save all registers */

        /* Y counts from height-1 downwards; leave at once if height is 0 */
        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        /* Convert each stride to bytes and subtract the row width, so that
         * adding it at the end of a row moves to the start of the next */
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
  .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        /* Single pixels until the destination is word aligned or the row ends */
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        /* At most one pixel is needed to reach word alignment */
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
    .endfunc
.endm
|
|
|
|
/*
 * line_saved_regs - declare which per-line variables are spilled to the
 * stack. Takes a list of symbolic register names and computes
 *   LINE_SAVED_REGS      - register-list bit mask (bit n set for rn)
 *   LINE_SAVED_REG_COUNT - number of recognised names in the list
 * Recognised names: Y (r1), STRIDE_D (r3), STRIDE_S (r5), STRIDE_M (r7)
 * and ORIG_W (r14). Any other name is silently ignored.
 */
.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
|
|
|
|
/*
 * nop_macro - accepts any arguments and expands to nothing.
 */
.macro nop_macro x:vararg
.endm
|