mirror of https://github.com/encounter/SDL.git
ARM: NEON assembly optimization for function BlitRGBtoRGBPixelAlpha
This commit is contained in:
parent
a6bfdd103f
commit
2dfe060564
|
@ -421,6 +421,23 @@ BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
|
|||
}
|
||||
#endif
|
||||
|
||||
#if SDL_ARM_NEON_BLITTERS
|
||||
/*
 * Prototype of the NEON blitter entry point. No C body is provided for it in
 * this file; the same symbol name is passed to generate_composite_function in
 * the accompanying NEON assembly source.
 * NOTE(review): the strides appear to be counted in 32-bit pixels rather than
 * bytes (the C wrapper passes width + (skip >> 2)) - confirm against the
 * assembly framework.
 */
void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
|
||||
|
||||
static void
|
||||
BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
|
||||
{
|
||||
int32_t width = info->dst_w;
|
||||
int32_t height = info->dst_h;
|
||||
uint32_t *dstp = (uint32_t *)info->dst;
|
||||
int32_t dststride = width + (info->dst_skip >> 2);
|
||||
uint32_t *srcp = (uint32_t *)info->src;
|
||||
int32_t srcstride = width + (info->src_skip >> 2);
|
||||
|
||||
BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
|
||||
static void
|
||||
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
|
||||
|
@ -1356,6 +1373,10 @@ SDL_CalculateBlitA(SDL_Surface * surface)
|
|||
}
|
||||
#endif /* __MMX__ || __3dNOW__ */
|
||||
if (sf->Amask == 0xff000000) {
|
||||
#if SDL_ARM_NEON_BLITTERS
|
||||
if (SDL_HasNEON())
|
||||
return BlitRGBtoRGBPixelAlphaARMNEON;
|
||||
#endif
|
||||
#if SDL_ARM_SIMD_BLITTERS
|
||||
if (SDL_HasARMSIMD())
|
||||
return BlitRGBtoRGBPixelAlphaARMSIMD;
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
/*
|
||||
* Copyright © 2009 Nokia Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2018 RISC OS Open Ltd
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* Prevent the stack from becoming executable for no reason... */
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
|
||||
/*
 * Assembler set-up for this file.
 *
 * The code is assembled for ARMv7-A with NEON enabled (.arch / .fpu), but
 * .object_arch overrides the architecture recorded in the object file to
 * ARMv4, and the two .eabi_attribute lines suppress the FP / Advanced SIMD
 * build attributes. The C caller only selects the NEON blitter after
 * SDL_HasNEON() succeeds.
 *
 * .arm selects the ARM (not Thumb) instruction set, .altmacro enables the
 * assembler's alternate macro syntax, and .p2align 2 aligns to 4 bytes.
 */
.text
|
||||
.fpu neon
|
||||
.arch armv7a
|
||||
.object_arch armv4
|
||||
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
|
||||
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
|
||||
.arm
|
||||
.altmacro
|
||||
.p2align 2
|
||||
|
||||
#include "pixman-arm-asm.h"
|
||||
#include "pixman-arm-neon-asm.h"
|
||||
|
||||
/* Global configuration options and preferences */
|
||||
|
||||
/*
|
||||
* The code can optionally make use of unaligned memory accesses to improve
|
||||
* performance of handling leading/trailing pixels for each scanline.
|
||||
* Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
|
||||
* example in linux if unaligned memory accesses are not configured to
|
||||
* generate exceptions.
|
||||
*/
|
||||
.set RESPECT_STRICT_ALIGNMENT, 1
|
||||
|
||||
/*
|
||||
* Set default prefetch type. There is a choice between the following options:
|
||||
*
|
||||
* PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
|
||||
* as NOP to workaround some HW bugs or for whatever other reason)
|
||||
*
|
||||
* PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
|
||||
* advanced prefetch introduces heavy overhead)
|
||||
*
|
||||
* PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
|
||||
* which can run ARM and NEON instructions simultaneously so that extra ARM
|
||||
* instructions do not add (many) extra cycles, but improve prefetch efficiency)
|
||||
*
|
||||
* Note: some types of function can't support advanced prefetch and fallback
|
||||
* to simple one (those which handle 24bpp pixels)
|
||||
*/
|
||||
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
|
||||
|
||||
/* Prefetch distance in pixels for simple prefetch */
|
||||
.set PREFETCH_DISTANCE_SIMPLE, 64
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * Blend one block of pixels held in registers.
 *
 * Inputs:  d0-d2 = three source colour planes, d3 = source alpha,
 *          d4-d6 = three destination colour planes, d7 = destination alpha.
 *          NOTE(review): which plane is R/G/B depends on the pixel layout and
 *          on how the surrounding framework loads the registers - confirm.
 * Outputs: d28-d30 = blended colour planes, d31 = destination alpha.
 *
 * For every colour byte: x = src * alpha + dst * (255 - alpha), computed in
 * 16 bits, then narrowed with (x + ((x + 128) >> 8) + 128) >> 8, i.e. a
 * rounded division by 255.
 *
 * The instruction order matters: q0, q1, q2 and q3 alias d0-d7, so each input
 * plane is consumed before the register pair containing it is overwritten.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_head
|
||||
vmvn d30, d3 /* get inverted source alpha */
|
||||
vmov d31, d7 /* dest alpha is always unchanged */
|
||||
/* plane 0: widening multiply by alpha, then accumulate dst * inverted alpha */
vmull.u8 q14, d0, d3
|
||||
vmlal.u8 q14, d4, d30
|
||||
/* plane 1: result goes to q0 (d0/d1), d0 is no longer needed at this point */
vmull.u8 q0, d1, d3
|
||||
vmlal.u8 q0, d5, d30
|
||||
/* plane 2: result goes to q1 (d2/d3), this is the last read of d3 */
vmull.u8 q1, d2, d3
|
||||
vmlal.u8 q1, d6, d30
|
||||
/* rounded division by 255: rounding shift, then rounding add + narrow */
vrshr.u16 q2, q14, #8
|
||||
vrshr.u16 q3, q0, #8
|
||||
vraddhn.u16 d28, q14, q2
|
||||
vrshr.u16 q2, q1, #8
|
||||
vraddhn.u16 d29, q0, q3
|
||||
/* d30 (inverted alpha) is dead by now and is reused for the third plane */
vraddhn.u16 d30, q1, q2
|
||||
.endm
|
||||
|
||||
/*
 * Intentionally empty: the head macro already leaves the finished result in
 * d28-d31, so there is no deferred work to complete here.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail
|
||||
/* nothing */
|
||||
.endm
|
||||
|
||||
/*
 * Combined "finish previous block / start next block" step.
 *
 * Stores the previously blended block (d28-d31) to DST_W, loads the next
 * source block into d0-d3 and the next destination block into d4-d7, and then
 * repeats the same arithmetic as the head macro.
 *
 * The store is issued before d30/d31 are overwritten by the new vmvn/vmov, so
 * the previous result is not lost.
 *
 * Lines prefixed with PF are interleaved between the NEON instructions and
 * operate on the PF_* prefetch registers.
 * NOTE(review): PF, PF_X, PF_CTL, ORIG_W, DUMMY and the *_bpp_shift symbols
 * are not defined in this file (presumably they come from
 * pixman-arm-neon-asm.h) - see that header for the exact prefetch logic.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
|
||||
/* load next source block, split into four byte planes */
vld4.8 {d0-d3}, [SRC]!
|
||||
PF add PF_X, PF_X, #8
|
||||
/* write out the result of the previous block */
vst4.8 {d28-d31}, [DST_W :128]!
|
||||
PF tst PF_CTL, #0xF
|
||||
/* load next destination block */
vld4.8 {d4-d7}, [DST_R :128]!
|
||||
PF addne PF_X, PF_X, #8
|
||||
vmvn d30, d3 /* get inverted source alpha */
|
||||
vmov d31, d7 /* dest alpha is always unchanged */
|
||||
vmull.u8 q14, d0, d3
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
vmlal.u8 q14, d4, d30
|
||||
PF cmp PF_X, ORIG_W
|
||||
vmull.u8 q0, d1, d3
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
vmlal.u8 q0, d5, d30
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vmull.u8 q1, d2, d3
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
vmlal.u8 q1, d6, d30
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
/* rounded division by 255, same sequence as in the head macro */
vrshr.u16 q2, q14, #8
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
vrshr.u16 q3, q0, #8
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vraddhn.u16 d28, q14, q2
|
||||
vrshr.u16 q2, q1, #8
|
||||
vraddhn.u16 d29, q0, q3
|
||||
vraddhn.u16 d30, q1, q2
|
||||
.endm
|
||||
|
||||
/*
 * Instantiate the exported function BlitRGBtoRGBPixelAlphaARMNEONAsm from the
 * three pixel-block macros defined in this file.
 *
 * NOTE(review): generate_composite_function is not defined in this file
 * (presumably it comes from pixman-arm-neon-asm.h). The "32, 0, 32" arguments
 * look like source / mask / destination bits per pixel, with 0 meaning no
 * mask - confirm against that header. The flags are consistent with the
 * macros, which both read (DST_R) and write (DST_W) the destination and use
 * de-interleaving vld4.8 / vst4.8 accesses.
 */
generate_composite_function \
|
||||
BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
|
||||
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
|
||||
8, /* number of pixels, processed in a single block */ \
|
||||
5, /* prefetch distance */ \
|
||||
default_init, \
|
||||
default_cleanup, \
|
||||
RGBtoRGBPixelAlpha_process_pixblock_head, \
|
||||
RGBtoRGBPixelAlpha_process_pixblock_tail, \
|
||||
RGBtoRGBPixelAlpha_process_pixblock_tail_head
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue