From 57723b83e8654139d3d09a488f15e97e1d3628eb Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Thu, 24 Oct 2019 21:13:05 -0400
Subject: [PATCH] ARM: SIMD assembly optimization for function
 BlitRGBtoRGBPixelAlpha

Much of the heavy lifting of this optimization is lifted from the Pixman
project, which is distributed under an MIT-style license. As far as possible,
these elements have been relicensed to the zlib license.
---
 src/video/SDL_blit_A.c              |   21 +
 src/video/arm/pixman-arm-asm.h      |   36 +
 src/video/arm/pixman-arm-simd-asm.S |  168 +++++
 src/video/arm/pixman-arm-simd-asm.h | 1034 +++++++++++++++++++++++++++
 4 files changed, 1259 insertions(+)
 create mode 100644 src/video/arm/pixman-arm-asm.h
 create mode 100644 src/video/arm/pixman-arm-simd-asm.S
 create mode 100644 src/video/arm/pixman-arm-simd-asm.h

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index a04d1bc8f..07dd98001 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -389,6 +389,23 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
 
 #endif /* __MMX__ */
 
+#if SDL_ARM_SIMD_BLITTERS
+void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+
+static void
+BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
+{
+    int32_t width = info->dst_w;
+    int32_t height = info->dst_h;
+    uint32_t *dstp = (uint32_t *)info->dst;
+    int32_t dststride = width + (info->dst_skip >> 2);
+    uint32_t *srcp = (uint32_t *)info->src;
+    int32_t srcstride = width + (info->src_skip >> 2);
+
+    BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
+#endif
+
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void
 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
@@ -1315,6 +1332,10 @@ SDL_CalculateBlitA(SDL_Surface * surface)
                 }
 #endif /* __MMX__ || __3dNOW__ */
                 if (sf->Amask == 0xff000000) {
+#if SDL_ARM_SIMD_BLITTERS
+                    if (SDL_HasARMSIMD())
+                        return BlitRGBtoRGBPixelAlphaARMSIMD;
+#endif
                     return BlitRGBtoRGBPixelAlpha;
                 }
             }
diff --git a/src/video/arm/pixman-arm-asm.h b/src/video/arm/pixman-arm-asm.h
new file mode 100644
index 000000000..3f13ba049
--- /dev/null
+++ b/src/video/arm/pixman-arm-asm.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ *
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
new file mode 100644
index 000000000..2d65887e5
--- /dev/null
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016 RISC OS Open Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+	.p2align 2
+
+#include "pixman-arm-asm.h"
+#include "pixman-arm-simd-asm.h"
+
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ *   cond           ARM condition code for code block
+ *   numbytes       Number of output bytes that should be generated this time
+ *   firstreg       First WK register in which to place output
+ *   unaligned_src  Whether to use non-wordaligned loads of source image
+ *   unaligned_mask Whether to use non-wordaligned loads of mask image
+ *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+/******************************************************************************/
+
+/* This differs from the over_8888_8888 routine in Pixman in that the destination
+ * alpha component is always left unchanged, and RGB components are not
+ * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
+ * renormalisation is done by multiplying by 257/256 (with rounding) rather than
+ * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
+ */
+
+.macro RGBtoRGBPixelAlpha_init
+        line_saved_regs STRIDE_S, ORIG_W
+        mov     MASK, #0x80
+.endm
+
+.macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
+        uxtb    tmp3, s
+        uxtb    tmp0, d
+        sub     tmp0, tmp3, tmp0
+        uxtb    tmp3, s, ror #16
+        uxtb    tmp1, d, ror #16
+        sub     tmp1, tmp3, tmp1
+        uxtb    tmp3, s, ror #8
+        mov     s, s, lsr #24
+        uxtb    tmp2, d, ror #8
+        sub     tmp2, tmp3, tmp2
+        smlabb  tmp0, tmp0, s, half
+        smlabb  tmp1, tmp1, s, half
+        smlabb  tmp2, tmp2, s, half
+        add     tmp0, tmp0, asr #8
+        add     tmp1, tmp1, asr #8
+        add     tmp2, tmp2, asr #8
+        pkhbt   tmp0, tmp0, tmp1, lsl #16
+        and     tmp2, tmp2, #0xff00
+        uxtb16  tmp0, tmp0, ror #8
+        orr     tmp0, tmp0, tmp2
+        uadd8   d, d, tmp0
+.endm
+
+.macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
+        and     d, d, #0xff000000
+        bic     s, s, #0xff000000
+        orr     d, d, s
+.endm
+
+.macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        ldm     SRC!, {WK0, WK1}
+        ldm     SRC!, {STRIDE_S, STRIDE_M}
+        ldrd    WK2, WK3, [DST], #16
+        orr     SCRATCH, WK0, WK1
+        and     ORIG_W, WK0, WK1
+        orr     SCRATCH, SCRATCH, STRIDE_S
+        and     ORIG_W, ORIG_W, STRIDE_S
+        orr     SCRATCH, SCRATCH, STRIDE_M
+        and     ORIG_W, ORIG_W, STRIDE_M
+        tst     SCRATCH, #0xff000000
+ .elseif numbytes == 8
+        ldm     SRC!, {WK0, WK1}
+        ldm     DST!, {WK2, WK3}
+        orr     SCRATCH, WK0, WK1
+        and     ORIG_W, WK0, WK1
+        tst     SCRATCH, #0xff000000
+ .else // numbytes == 4
+        ldr     WK0, [SRC], #4
+        ldr     WK2, [DST], #4
+        tst     WK0, #0xff000000
+ .endif
+.endm
+
+.macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
+        beq     20f @ all transparent
+ .if numbytes == 16
+        cmp     ORIG_W, #0xff000000
+        bhs     10f @ all opaque
+        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        strd    WK2, WK3, [DST, #-16]
+        ldrd    WK0, WK1, [SRC, #-8]
+        ldrd    WK2, WK3, [DST, #-8]
+        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        b       19f
+10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
+        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
+        strd    WK2, WK3, [DST, #-16]
+        ldrd    WK0, WK1, [SRC, #-8]
+        ldrd    WK2, WK3, [DST, #-8]
+        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
+        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
+19:     strd    WK2, WK3, [DST, #-8]
+ .elseif numbytes == 8
+        cmp     ORIG_W, #0xff000000
+        bhs     10f @ all opaque
+        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        b       19f
+10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
+        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
+19:     strd    WK2, WK3, [DST, #-8]
+ .else // numbytes == 4
+        cmp     WK0, #0xff000000
+        bhs     10f @ opaque
+        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
+        b       19f
+10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
+19:     str     WK2, [DST, #-4]
+ .endif
+20:
+.endm
+
+generate_composite_function \
+    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    RGBtoRGBPixelAlpha_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    RGBtoRGBPixelAlpha_process_head, \
+    RGBtoRGBPixelAlpha_process_tail
+
+/******************************************************************************/
diff --git a/src/video/arm/pixman-arm-simd-asm.h b/src/video/arm/pixman-arm-simd-asm.h
new file mode 100644
index 000000000..067d52c1a
--- /dev/null
+++ b/src/video/arm/pixman-arm-simd-asm.h
@@ -0,0 +1,1034 @@
+/*
+ * Copyright (c) 2012 Raspberry Pi Foundation
+ * Copyright (c) 2012 RISC OS Open Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ *    processed, another cacheline is preloaded (the exact distance ahead is
+ *    determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ *    cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ *    are preloaded to deal with data (if any) that hangs off the end of the
+ *    last iteration of the inner loop, plus any trailing bytes that were not
+ *    enough to make up one whole iteration of the inner loop
+ * 
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,         0
+.set FLAG_DST_READWRITE,         1
+.set FLAG_COND_EXEC,             0
+.set FLAG_BRANCH_OVER,           2
+.set FLAG_PROCESS_PRESERVES_PSR, 0
+.set FLAG_PROCESS_CORRUPTS_PSR,  4
+.set FLAG_PROCESS_DOESNT_STORE,  0
+.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
+.set FLAG_NO_SPILL_LINE_VARS,        0
+.set FLAG_SPILL_LINE_VARS_WIDE,      16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
+.set FLAG_SPILL_LINE_VARS,           48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0,     0
+.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST,               0
+.set FLAG_NO_PRELOAD_DST,            256
+
+/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET,        (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET,        (9*4)
+#endif
+
+/*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET,     0
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0
+.set PREFETCH_TYPE_STANDARD,   1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+        op&r&cond    WK&reg2, [base], #4
+        op&r&cond    WK&reg3, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+  .endif
+ .elseif numbytes == 8
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1}
+  .endif
+ .elseif numbytes == 4
+        op&r&cond    WK&reg0, [base], #4
+ .elseif numbytes == 2
+        op&r&cond&h  WK&reg0, [base], #2
+ .elseif numbytes == 1
+        op&r&cond&b  WK&reg0, [base], #1
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+        stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+        str&cond    WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+        str&cond&h  WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+        str&cond&b  WK&reg0, [base, #-1]
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+        a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1  bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+        PF  bic,    ptr, base, #31
+  .set OFFSET, 0
+  .rept prefetch_distance+1
+        PF  pld,    [ptr, #OFFSET]
+   .set OFFSET, OFFSET+32
+  .endr
+ .endif
+.endm
+
+.macro preload_leading_step2  bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ *     inner_loop_offset = (src+leading_bytes)&31
+ *     extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+ .if bpp > 0
+  .ifc base,DST
+        /* The test can be simplified further when preloading the destination */
+        PF  tst,    base, #16
+        PF  beq,    61f
+  .else
+   .if bpp/dst_w_bpp == 4
+        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  and,    SCRATCH, SCRATCH, #31
+        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
+        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
+        PF  bcs,    61f
+        PF  bpl,    60f
+        PF  pld,    [ptr, #32*(prefetch_distance+2)]
+   .else
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  bls,    61f
+   .endif
+  .endif
+60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
+61:
+ .endif
+.endm
+
+#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
+.macro preload_middle   bpp, base, scratch_holds_offset
+ .if bpp > 0
+        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+   .if scratch_holds_offset
+        PF  pld,    [base, SCRATCH]
+   .else
+        PF  bic,    SCRATCH, base, #31
+        PF  pld,    [SCRATCH, #32*prefetch_distance]
+   .endif
+  .endif
+ .endif
+.endm
+
+.macro preload_trailing  bpp, bpp_shift, base
+ .if bpp > 0
+  .if bpp*pix_per_block > 256
+        /* Calculations are more complex if more than one fetch per block */
+        PF  and,    WK1, base, #31
+        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
+        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+        PF  bic,    SCRATCH, base, #31
+80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+        PF  add,    SCRATCH, SCRATCH, #32
+        PF  subs,   WK1, WK1, #32
+        PF  bhi,    80b
+  .else
+        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
+        PF  adceqs, SCRATCH, SCRATCH, #0
+        /* The instruction above has two effects: ensures Z is only
+         * set if C was clear (so Z indicates that both shifted quantities
+         * were 0), and clears C if Z was set (so C indicates that the sum
+         * of the shifted quantities was greater and not equal to 32) */
+        PF  beq,    82f
+        PF  bic,    SCRATCH, base, #31
+        PF  bcc,    81f
+        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
+81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+82:
+  .endif
+ .endif
+.endm
+
+
+.macro preload_line    narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ *    code path rather than the "medium" one - because in the narrow case,
+ *    the row of pixels is known to output no more than 30 bytes, then
+ *    (assuming the source pixels are no wider than the the destination
+ *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ *    meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ *    destination) that's being preloaded, or 0 if this channel is not used
+ *    for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+  .if narrow_case && (bpp <= dst_w_bpp)
+        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, LSL #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    90f
+        PF  pld,    [WK1]
+90:
+  .else
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, lsl #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    92f
+91:     PF  add,    WK0, WK0, #32
+        PF  cmp,    WK0, WK1
+        PF  pld,    [WK0]
+        PF  bne,    91b
+92:
+  .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+        sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+        process_tail  cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+  .ifc cond,mi
+        bpl     100f
+  .endif
+  .ifc cond,cs
+        bcc     100f
+  .endif
+  .ifc cond,ne
+        beq     100f
+  .endif
+        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+        /* Can't interleave reads and writes */
+        test
+        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+        test
+  .endif
+        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .else
+        /* Can interleave reads and writes for better scheduling */
+        test
+        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+  .if decrementx
+        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
+  .endif
+        process_tail  cond1, numbytes1, firstreg1
+        process_tail  cond2, numbytes2, firstreg2
+        pixst   cond1, numbytes1, firstreg1, DST
+        pixst   cond2, numbytes2, firstreg2, DST
+ .endif
+.endm
+
+
+.macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
+ .else
+        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
+ .endif
+.endm
+
+.macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
+ .else
+        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
+ .endif
+.endm
+
+.macro leading_15bytes  process_head, process_tail
+        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set DECREMENT_X, 0
+        sub     X, X, WK0, lsr #dst_bpp_shift
+        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
+        mov     X, WK0
+ .endif
+        /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ .elseif dst_w_bpp == 16
+        test_bits_1_0_ptr
+        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
+ .endif
+.endm
+
+.macro test_bits_3_2_pix
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
+.endm
+
+.macro test_bits_1_0_pix
+ .if dst_w_bpp == 8
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+        movs    SCRATCH, X, lsr #1
+ .endif
+.endm
+
+.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ .if dst_w_bpp == 16
+        test_bits_1_0_pix
+        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ .endif
+.endm
+
+
+.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
+  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  src_bpp, SRC, 1
+  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  mask_bpp, MASK, 1
+  .else
+        preload_middle  src_bpp, SRC, 0
+        preload_middle  mask_bpp, MASK, 0
+  .endif
+  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
+        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+         * preloads for, to achieve staggered prefetches for multiple channels, because there are
+         * always two STMs per prefetch, so there is always an opposite STM on which to put the
+         * preload. Note, no need to BIC the base register here */
+        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
+  .endif
+        process_tail  , 16, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+  .endif
+  .set SUBBLOCK, SUBBLOCK+1
+ .endr
+        subs    X, X, #pix_per_block
+        bhs     110b
+.endm
+
+.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+        tst     DST, #16
+        bne     111f
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+        b       112f
+111:
+ .endif
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+112:
+        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+        PF  and,    WK0, X, #pix_per_block-1
+ .endif
+        preload_trailing  src_bpp, src_bpp_shift, SRC
+        preload_trailing  mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
+ .endif
+        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+        /* The remainder of the line is handled identically to the medium case */
+        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+.endm
+
+.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+120:
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
+        process_tail  , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+ .endif
+        subs    X, X, #128/dst_w_bpp
+        bhs     120b
+        /* Trailing pixels */
+        tst     X, #128/dst_w_bpp - 1
+        beq     exit_label
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+        tst     X, #16*8/dst_w_bpp
+        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+        /* Trailing pixels */
+        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+        tst     MASK, #3
+        bne     141f
+ .endif
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     140f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+140:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
+  .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+        b       exit_label
+141:
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     142f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+142:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
+  .endif
+ .endif
+.endm
+
+
+.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
+ .if SINGLE_SCANLINE
+  .ifc "last_one",""
+        b       198f
+  .endif
+ .else
+ .if vars_spilled
+        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
+        /* This is ldmia sp,{} */
+        .word   0xE89D0000 | LINE_SAVED_REGS
+ .endif
+        subs    Y, Y, #1
+ .if vars_spilled
+  .if (LINE_SAVED_REGS) & (1<<1)
+        str     Y, [sp]
+  .endif
+ .endif
+        add     DST, DST, STRIDE_D
+ .if src_bpp > 0
+        add     SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+        add     MASK, MASK, STRIDE_M
+ .endif
+ .if restore_x
+        mov     X, ORIG_W
+ .endif
+        bhs     loop_label
+ .ifc "last_one",""
+  .if vars_spilled
+        b       197f
+  .else
+        b       198f
+  .endif
+ .else
+  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+        b       198f
+  .endif
+ .endif
+ .endif
+.endm
+
+
+.macro generate_composite_function_common fname, \
+                                          src_bpp_, \
+                                          mask_bpp_, \
+                                          dst_w_bpp_, \
+                                          flags_, \
+                                          prefetch_distance_, \
+                                          init, \
+                                          newline, \
+                                          cleanup, \
+                                          process_head, \
+                                          process_tail, \
+                                          process_inner_loop
+
+    pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set flags, flags_
+ .set prefetch_distance, prefetch_distance_
+
+/*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+ .endif
+
+ .if src_bpp == 32
+  .set src_bpp_shift, 2
+ .elseif src_bpp == 24
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+  .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 0
+  .set src_bpp_shift, -1
+ .else
+  .error "requested src bpp (src_bpp) is not supported"
+ .endif
+
+ .if mask_bpp == 32
+  .set mask_bpp_shift, 2
+ .elseif mask_bpp == 24
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+  .set mask_bpp_shift, -1
+ .else
+  .error "requested mask bpp (mask_bpp) is not supported"
+ .endif
+
+ .if dst_w_bpp == 32
+  .set dst_bpp_shift, 2
+ .elseif dst_w_bpp == 24
+  .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+  .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+  .set dst_bpp_shift, 0
+ .else
+  .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+ .if (((flags) & FLAG_DST_READWRITE) != 0)
+  .set dst_r_bpp, dst_w_bpp
+ .else
+  .set dst_r_bpp, 0
+ .endif
+
+ .set pix_per_block, 16*8/dst_w_bpp
+ .if src_bpp != 0
+  .if 32*8/src_bpp > pix_per_block
+   .set pix_per_block, 32*8/src_bpp
+  .endif
+ .endif
+ .if mask_bpp != 0
+  .if 32*8/mask_bpp > pix_per_block
+   .set pix_per_block, 32*8/mask_bpp
+  .endif
+ .endif
+ .if dst_r_bpp != 0
+  .if 32*8/dst_r_bpp > pix_per_block
+   .set pix_per_block, 32*8/dst_r_bpp
+  .endif
+ .endif
+
+/* The standard entry conditions set up by pixman-arm-common.h are:
+ * r0 = width (pixels)
+ * r1 = height (rows)
+ * r2 = pointer to top-left pixel of destination
+ * r3 = destination stride (pixels)
+ * [sp] = source pixel value, or pointer to top-left pixel of source
+ * [sp,#4] = 0 or source stride (pixels)
+ * The following arguments are unused for non-mask operations
+ * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
+ * [sp,#12] = 0 or mask stride (pixels)
+ *
+ * or in the single-scanline case:
+ * r0 = width (pixels)
+ * r1 = pointer to top-left pixel of destination
+ * r2 = pointer to top-left pixel of source
+ * The following argument is unused for non-mask operations
+ * r3 = pointer to top-left pixel of mask
+ */
+
+/*
+ * Assign symbolic names to registers
+ */
+    X           .req    r0  /* pixels to go on this line */
+ .if SINGLE_SCANLINE
+    DST         .req    r1  /* destination pixel pointer */
+    SRC         .req    r2  /* source pixel pointer */
+    MASK        .req    r3  /* mask pixel pointer (if applicable) */
+    Y           .req    r4  /* temporary */
+    STRIDE_D    .req    r5  /* temporary */
+    STRIDE_S    .req    r6  /* temporary */
+    STRIDE_M    .req    r7  /* temporary */
+ .else
+    Y           .req    r1  /* lines to go */
+    DST         .req    r2  /* destination pixel pointer */
+    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
+    SRC         .req    r4  /* source pixel pointer */
+    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
+    MASK        .req    r6  /* mask pixel pointer (if applicable) */
+    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
+ .endif
+    WK0         .req    r8  /* pixel data registers */
+    WK1         .req    r9
+    WK2         .req    r10
+    WK3         .req    r11
+    SCRATCH     .req    r12
+    ORIG_W      .req    r14 /* width (pixels) */
+
+        push    {r4-r11, lr}        /* save all registers */
+
+ .if !SINGLE_SCANLINE
+        subs    Y, Y, #1
+        blo     199f
+ .endif
+
+#ifdef DEBUG_PARAMS
+        sub     sp, sp, #9*4
+#endif
+
+ .if !SINGLE_SCANLINE
+ .if src_bpp > 0
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
+ .endif
+ .if mask_bpp > 0
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
+ .endif
+ .endif
+        
+#ifdef DEBUG_PARAMS
+        add     Y, Y, #1
+        stmia   sp, {r0-r7,pc}
+        sub     Y, Y, #1
+#endif
+
+        init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        /* Reserve a word in which to store X during leading pixels */
+        sub     sp, sp, #4
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
+        
+ .if !SINGLE_SCANLINE
+        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
+        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
+ .if src_bpp > 0
+        lsl     STRIDE_S, #src_bpp_shift
+        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
+ .endif
+ .if mask_bpp > 0
+        lsl     STRIDE_M, #mask_bpp_shift
+        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
+ .endif
+ .endif
+ 
+        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
+        cmp     X, #2*16*8/dst_w_bpp - 1
+        blo     170f
+ .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
+        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
+        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
+        blo     160f
+
+        /* Wide case */
+        /* Adjust X so that the decrement instruction can also test for
+         * inner loop termination. We want it to stop when there are
+         * (prefetch_distance+1) complete blocks to go. */
+        sub     X, X, #(prefetch_distance+2)*pix_per_block
+  .if !SINGLE_SCANLINE
+        mov     ORIG_W, X
+  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+  .endif
+  .endif
+151:    /* New line */
+        newline
+        preload_leading_step1  src_bpp, WK1, SRC
+        preload_leading_step1  mask_bpp, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+        preload_leading_step1  dst_r_bpp, WK3, DST
+  .endif
+        
+        ands    WK0, DST, #15
+        beq     154f
+        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
+
+        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
+        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
+  .endif
+
+        leading_15bytes  process_head, process_tail
+        
+154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, SRC, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, MASK, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+  .endif
+  .ifc "process_inner_loop",""
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+  .else
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+  .endif
+
+157:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+  .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .endif
+ .endif
+
+ .ltorg
+
+160:    /* Medium case */
+ .if !SINGLE_SCANLINE
+        mov     ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .endif
+ .endif
+161:    /* New line */
+        newline
+        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+        
+        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
+        ands    WK0, DST, #15
+        beq     164f
+        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
+        
+        leading_15bytes  process_head, process_tail
+        
+164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+        
+167:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if !SINGLE_SCANLINE
+ .if dst_w_bpp < 32
+        mov     ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+ .endif
+ .endif
+171:    /* New line */
+        newline
+        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+        
+ .if dst_w_bpp == 8
+        tst     DST, #3
+        beq     174f
+172:    subs    X, X, #1
+        blo     177f
+        process_head  , 1, 0, 1, 1, 0
+        process_tail  , 1, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 1, 0, DST
+  .endif
+        tst     DST, #3
+        bne     172b
+ .elseif dst_w_bpp == 16
+        tst     DST, #2
+        beq     174f
+        subs    X, X, #1
+        blo     177f
+        process_head  , 2, 0, 1, 1, 0
+        process_tail  , 2, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 2, 0, DST
+  .endif
+ .endif
+
+174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+
+177:    /* Check for another line */
+        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
+
+197:
+ .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
+        add     sp, sp, #LINE_SAVED_REG_COUNT*4
+ .endif
+198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+        add     sp, sp, #4
+ .endif
+
+        cleanup
+
+#ifdef DEBUG_PARAMS
+        add     sp, sp, #9*4 /* junk the debug copy of arguments */
+#endif
+199:
+        pop     {r4-r11, pc}  /* exit */
+
+ .ltorg
+
+    .unreq  X
+    .unreq  Y
+    .unreq  DST
+    .unreq  STRIDE_D
+    .unreq  SRC
+    .unreq  STRIDE_S
+    .unreq  MASK
+    .unreq  STRIDE_M
+    .unreq  WK0
+    .unreq  WK1
+    .unreq  WK2
+    .unreq  WK3
+    .unreq  SCRATCH
+    .unreq  ORIG_W
+    .endfunc
+.endm
+
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags_, \
+                                   prefetch_distance_, \
+                                   init, \
+                                   newline, \
+                                   cleanup, \
+                                   process_head, \
+                                   process_tail, \
+                                   process_inner_loop
+ .set SINGLE_SCANLINE, 0
+generate_composite_function_common \
+    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
+    init, newline, cleanup, process_head, process_tail, process_inner_loop
+.endm
+
+.macro generate_composite_function_single_scanline fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags_, \
+                                                   prefetch_distance_, \
+                                                   init, \
+                                                   newline, \
+                                                   cleanup, \
+                                                   process_head, \
+                                                   process_tail, \
+                                                   process_inner_loop
+ .set SINGLE_SCANLINE, 1
+generate_composite_function_common \
+    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
+    init, newline, cleanup, process_head, process_tail, process_inner_loop
+.endm
+
+.macro line_saved_regs  x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+ .irp SAVED_REG,x
+  .ifc "SAVED_REG","Y"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_D"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_S"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_M"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","ORIG_W"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+ .endr
+ .if SINGLE_SCANLINE
+  .set LINE_SAVED_REG_COUNT, 0
+ .endif
+.endm
+
+.macro nop_macro x:vararg
+.endm