diff --git a/include/SDL_surface.h b/include/SDL_surface.h
index d3f8c814e..a9975d790 100644
--- a/include/SDL_surface.h
+++ b/include/SDL_surface.h
@@ -519,6 +519,17 @@ extern DECLSPEC int SDLCALL SDL_SoftStretch(SDL_Surface * src,
                                             SDL_Surface * dst,
                                             const SDL_Rect * dstrect);
 
+/**
+ *  \brief Perform a bilinear scaling between two surfaces of the
+ *         same 32-bit pixel format (32BPP).
+ *  \return 0 on success or a negative error code on failure.
+ */
+extern DECLSPEC int SDLCALL SDL_SoftStretchLinear(SDL_Surface * src,
+                                                  const SDL_Rect * srcrect,
+                                                  SDL_Surface * dst,
+                                                  const SDL_Rect * dstrect);
+
+
 #define SDL_BlitScaled SDL_UpperBlitScaled
 
 /**
diff --git a/src/video/SDL_stretch.c b/src/video/SDL_stretch.c
index bb988e9aa..5fdb317de 100644
--- a/src/video/SDL_stretch.c
+++ b/src/video/SDL_stretch.c
@@ -198,6 +198,7 @@ copy_row3(Uint8 * src, int src_w, Uint8 * dst, int dst_w)
 }
 
 static int SDL_SoftStretchLowerNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
+static int SDL_SoftStretchLowerLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
 static int SDL_UpperSoftStretch(SDL_Surface * src, const SDL_Rect * srcrect, SDL_Surface * dst, const SDL_Rect * dstrect, SDL_ScaleMode scaleMode);
 
 /* Perform a stretch blit between two surfaces of the same format.
@@ -210,6 +211,13 @@ SDL_SoftStretch(SDL_Surface *src, const SDL_Rect *srcrect,
     return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeNearest);
 }
 
+int
+SDL_SoftStretchLinear(SDL_Surface *src, const SDL_Rect *srcrect,
+                      SDL_Surface *dst, const SDL_Rect *dstrect)
+{
+    return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeLinear);
+}
+
 static int
 SDL_UpperSoftStretch(SDL_Surface * src, const SDL_Rect * srcrect,
                      SDL_Surface * dst, const SDL_Rect * dstrect, SDL_ScaleMode scaleMode)
@@ -273,7 +281,9 @@ SDL_UpperSoftStretch(SDL_Surface * src, const SDL_Rect * srcrect,
     }
 
     if (scaleMode == SDL_ScaleModeNearest) {
-        ret = SDL_SoftStretchLowerNearest(src, srcrect, dst, dstrect);
+        ret = SDL_SoftStretchLowerNearest(src, srcrect, dst, dstrect);
+    } else {
+        ret = SDL_SoftStretchLowerLinear(src, srcrect, dst, dstrect);
     }
 
     /* We need to unlock the surfaces if they're locked */
@@ -376,4 +386,713 @@ SDL_SoftStretchLowerNearest(SDL_Surface *src, const SDL_Rect *srcrect,
     return 0;
 }
 
+
+/* Bilinear interpolation precision must be < 8,
+   because with SSE the add-multiply _mm_madd_epi16 works on signed 16-bit ints:
+   intermediate values such as 0xb1.. would be read as negative and falsify the result.
+   The same probably applies to NEON. */
+#define PRECISION      7
+
+#define FIXED_POINT(i) ((uint32_t)(i)  << 16)
+#define SRC_INDEX(fp)  ((uint32_t)(fp) >> 16)
+#define INTEGER(fp)    ((uint32_t)(fp) >> PRECISION)
+#define FRAC(fp)       ((uint32_t)(fp >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
+#define FRAC_ZERO      0
+#define FRAC_ONE       (1 << PRECISION)
+#define FP_ONE         FIXED_POINT(1)
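+
+/* Illustrative example (editorial note, not part of the algorithm): source
+   positions are kept in 16.16 fixed point and the blend weight uses
+   PRECISION (7) bits. For instance the source position 2.5 is stored as
+   FIXED_POINT(5) / 2 == 0x00028000, so that
+     SRC_INDEX(0x00028000) == 2    (left source pixel)
+     FRAC(0x00028000)      == 64   (blend weight, out of FRAC_ONE == 128)
+   i.e. the sample lies half-way between source pixels 2 and 3. */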
+
+
+#define BILINEAR___START                                                                 \
+    int i;                                                                               \
+    int fp_sum_h, fp_step_h, left_pad_h, right_pad_h;                                    \
+    int fp_sum_w, fp_step_w, left_pad_w, right_pad_w;                                    \
+    int fp_sum_w_init, left_pad_w_init, right_pad_w_init, dst_gap, middle_init;          \
+    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h);    \
+    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w);    \
+    fp_sum_w_init    = fp_sum_w + left_pad_w * fp_step_w;                                \
+    left_pad_w_init  = left_pad_w;                                                       \
+    right_pad_w_init = right_pad_w;                                                      \
+    dst_gap          = dst_pitch - 4 * dst_w;                                            \
+    middle_init      = dst_w - left_pad_w - right_pad_w;                                 \
+
+#define BILINEAR___HEIGHT                                                                \
+    int index_h, frac_h0, frac_h1, middle;                                               \
+    const Uint32 *src_h0, *src_h1;                                                       \
+    int no_padding, incr_h0, incr_h1;                                                    \
+                                                                                         \
+    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h);                       \
+    index_h = SRC_INDEX(fp_sum_h);                                                       \
+    frac_h0 = FRAC(fp_sum_h);                                                            \
+                                                                                         \
+    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1);                   \
+    frac_h0 = no_padding ? frac_h0 : 0;                                                  \
+    incr_h1 = no_padding ? src_pitch : 0;                                                \
+    incr_h0 = index_h * src_pitch;                                                       \
+                                                                                         \
+    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0);                             \
+    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1);                          \
+                                                                                         \
+    fp_sum_h += fp_step_h;                                                               \
+                                                                                         \
+    frac_h1 = FRAC_ONE - frac_h0;                                                        \
+    fp_sum_w = fp_sum_w_init;                                                            \
+    right_pad_w = right_pad_w_init;                                                      \
+    left_pad_w = left_pad_w_init;                                                        \
+    middle = middle_init;                                                                \
+
+
+
+static void
+#if defined(__clang__)
+// Remove inlining of this function
+// Crash with clang 9.0.8 / android-ndk-r21d
+// Ok with clang 11.0.5 / android-ndk-22
+# if __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 8
+__attribute__((noinline))
+# endif
+#endif
+get_scaler_datas(int src_nb, int dst_nb, int *fp_start, int *fp_step, int *left_pad, int *right_pad)
+{
+
+    int step = FIXED_POINT(src_nb) / (dst_nb); /* source step in fixed point */
+    int x0 = FP_ONE / 2;                       /* dst first pixel center at 0.5 in fixed point */
+    int fp_sum;
+    int i;
+#if 0
+    /* scale to source coordinates */
+    x0 *= src_nb;
+    x0 /= dst_nb; /* x0 == step / 2 */
+#else
+    /* Use this code for a perfect match with pixman */
+    Sint64 tmp[2];
+    tmp[0] = (Sint64)step * (x0 >> 16);
+    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
+    x0 = tmp[0] + ((tmp[1] + 0x8000) >> 16); /* x0 == (step + 1) / 2 */
+#endif
+    /* -= 0.5, to get back to the pixel origin, in source coordinates */
+    x0 -= FP_ONE / 2;
+
+    *fp_start = x0;
+    *fp_step = step;
+    *left_pad = 0;
+    *right_pad = 0;
+
+    fp_sum = x0;
+    for (i = 0; i < dst_nb; i++) {
+        if (fp_sum < 0) {
+            *left_pad += 1;
+        } else {
+            int index = SRC_INDEX(fp_sum);
+            if (index > src_nb - 2) {
+                *right_pad += 1;
+            }
+        }
+        fp_sum += step;
+    }
+// SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
+}
+
+typedef struct color_t {
+    Uint8 a;
+    Uint8 b;
+    Uint8 c;
+    Uint8 d;
+} color_t;
+
+#if 0
+static void
+printf_64(const char *str, void *var)
+{
+    uint8_t *val = (uint8_t*) var;
+    printf(" *  %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
+           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
+}
+#endif
+
+/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
+
+static inline void
+INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
+{
+    const color_t *c0 = (const color_t *)src_x0;
+    const color_t *c1 = (const color_t *)src_x1;
+    color_t       *cx = (color_t *)dst;
+#if 0
+    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
+    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
+    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
+    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
+#else
+    cx->a = INTEGER(frac1 * c0->a + frac0 * c1->a);
+    cx->b = INTEGER(frac1 * c0->b + frac0 * c1->b);
+    cx->c = INTEGER(frac1 * c0->c + frac0 * c1->c);
+    cx->d = INTEGER(frac1 * c0->d + frac0 * c1->d);
+#endif
+}
+
+static inline void
+INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
+{
+    Uint32 tmp[2];
+    unsigned int frac_w1 = FRAC_ONE - frac_w0;
+
+    /* Vertical first, store to 'tmp' */
+    INTERPOL(s0,     s1,     frac_h0, frac_h1, tmp);
+    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);
+
+    /* Horizontal, store to 'dst' */
+    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
+}
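+
+/* Worked example of INTERPOL (illustrative only), with PRECISION == 7:
+   for one channel with c0 == 100, c1 == 200, frac0 == 32 (~0.25), frac1 == 96:
+     INTEGER(96 * 100 + 32 * 200) == INTEGER(16000) == 16000 >> 7 == 125,
+   which matches the exact value 0.75 * 100 + 0.25 * 200 == 125.
+   INTERPOL_BILINEAR applies this twice vertically, then once horizontally. */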
+
+static int
+scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
+          Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
+{
+    BILINEAR___START
+
+    for (i = 0; i < dst_h; i++) {
+
+        BILINEAR___HEIGHT
+
+        while (left_pad_w--) {
+            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
+            dst += 1;
+        }
+
+        while (middle--) {
+            const Uint32 *s_00_01;
+            const Uint32 *s_10_11;
+            int index_w = 4 * SRC_INDEX(fp_sum_w);
+            int frac_w = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+/*
+            x00 ... x0_ ..... x01
+            .        .          .
+            .        x          .
+            .        .          .
+            .        .          .
+            x10 ... x1_ ..... x11
+*/
+            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+
+            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);
+
+            dst += 1;
+        }
+
+        while (right_pad_w--) {
+            int index_w = 4 * (src_w - 2);
+            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
+            dst += 1;
+        }
+        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
+    }
+    return 0;
+}
+
+#if defined(__SSE2__)
+# define HAVE_SSE2_INTRINSICS 1
+#endif
+
+#if defined(__ARM_NEON)
+# define HAVE_NEON_INTRINSICS 1
+#endif
+
+/* TODO: this didn't compile for the Windows 10 universal package the last time it was tried. */
+#if defined(__WINRT__)
+# if defined(HAVE_NEON_INTRINSICS)
+#  undef HAVE_NEON_INTRINSICS
+# endif
+#endif
+
+#if defined(HAVE_SSE2_INTRINSICS)
+
+#if 0
+static void
+printf_128(const char *str, __m128i var)
+{
+    uint16_t *val = (uint16_t*) &var;
+    printf(" *  %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
+           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
+}
+#endif
+
+static inline int
+hasSSE2(void)
+{
+    static int val = -1;
+    if (val != -1) {
+        return val;
+    }
+    val = SDL_HasSSE2();
+    return val;
+}
+
+static inline void
+INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
+{
+    __m128i x_00_01, x_10_11;   /* Pixels in 4*uint8 in row */
+    __m128i v_frac_w0, k0, l0, d0, e0;
+
+    int f, f2;
+    f  = frac_w;
+    f2 = FRAC_ONE - frac_w;
+    v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);
+
+    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); /* Load x00 and x01 */
+    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);
+
+    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
+
+    /* Vertical interpolation */
+    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
+    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
+    k0 = _mm_add_epi16(k0, l0);
+
+    /* For a perfect match, the fractional part could be cleared here if needed: */
+    /*
+    k0 = _mm_srli_epi16(k0, PRECISION);
+    k0 = _mm_slli_epi16(k0, PRECISION);
+    */
+
+    /* Horizontal interpolation */
+    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
+    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
+
+    /* Store 1 pixel */
+    d0 = _mm_srli_epi32(k0, PRECISION * 2);
+    e0 = _mm_packs_epi32(d0, d0);
+    e0 = _mm_packus_epi16(e0, e0);
+    *dst = _mm_cvtsi128_si32(e0);
+}
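+
+/* scale_mat_SSE below computes two destination pixels per loop iteration;
+   a single trailing pixel (when 'middle' is odd) falls back to
+   INTERPOL_BILINEAR_SSE above. */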
+
+static int
+scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
+{
+    BILINEAR___START
+
+    for (i = 0; i < dst_h; i++) {
+        int nb_block2;
+        __m128i v_frac_h0;
+        __m128i v_frac_h1;
+        __m128i zero;
+
+        BILINEAR___HEIGHT
+
+        nb_block2 = middle / 2;
+
+        v_frac_h0 = _mm_set_epi16(frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0);
+        v_frac_h1 = _mm_set_epi16(frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1);
+        zero = _mm_setzero_si128();
+
+        while (left_pad_w--) {
+            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
+            dst += 1;
+        }
+
+        while (nb_block2--) {
+            int index_w_0, frac_w_0;
+            int index_w_1, frac_w_1;
+
+            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;
+
+            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
+            __m128i v_frac_w0, k0, l0, d0, e0;
+            __m128i v_frac_w1, k1, l1, d1, e1;
+
+            int f, f2;
+            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_0  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_1  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+/*
+            x00............ x01   x02...........x03
+            .   .       .   .     .   .      .  .
+            j0  f0      j1        j2  f1        j3
+            .   .       .   .     .   .      .  .
+            .   .       .   .     .   .      .  .
+            .   .       .   .     .   .      .  .
+            x10............ x11   x12...........x13
+*/
+            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
+            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
+            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
+            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
+
+            f  = frac_w_0;
+            f2 = FRAC_ONE - frac_w_0;
+            v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);
+
+            f  = frac_w_1;
+            f2 = FRAC_ONE - frac_w_1;
+            v_frac_w1 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);
+
+            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); /* Load x00 and x01 */
+            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
+            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
+            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);
+
+            /* Interpolation vertical */
+            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
+            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
+            k0 = _mm_add_epi16(k0, l0);
+            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
+            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
+            k1 = _mm_add_epi16(k1, l1);
+
+            /* Interpolation horizontal */
+            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
+            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
+            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
+            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);
+
+            /* Store 1 pixel */
+            d0 = _mm_srli_epi32(k0, PRECISION * 2);
+            e0 = _mm_packs_epi32(d0, d0);
+            e0 = _mm_packus_epi16(e0, e0);
+            *dst++ = _mm_cvtsi128_si32(e0);
+
+            /* Store 1 pixel */
+            d1 = _mm_srli_epi32(k1, PRECISION * 2);
+            e1 = _mm_packs_epi32(d1, d1);
+            e1 = _mm_packus_epi16(e1, e1);
+            *dst++ = _mm_cvtsi128_si32(e1);
+        }
+
+        /* Last point */
+        if (middle & 0x1) {
+            const Uint32 *s_00_01;
+            const Uint32 *s_10_11;
+            int index_w = 4 * SRC_INDEX(fp_sum_w);
+            int frac_w = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
+            dst += 1;
+        }
+
+        while (right_pad_w--) {
+            int index_w = 4 * (src_w - 2);
+            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
+            dst += 1;
+        }
+        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
+    }
+    return 0;
+}
+#endif
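+
+/* Editorial note on the "PRECISION must be < 8" requirement (illustrative
+   arithmetic): after the vertical pass each 16-bit lane holds at most
+   255 * FRAC_ONE. With PRECISION == 7 that is 255 * 128 == 32640, which still
+   fits the signed 16-bit range read by _mm_madd_epi16; with PRECISION == 8 it
+   could reach 255 * 256 == 65280, which _mm_madd_epi16 would interpret as
+   -256, corrupting the horizontal pass. */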
+
+#if defined(HAVE_NEON_INTRINSICS)
+
+static inline int
+hasNEON(void)
+{
+    static int val = -1;
+    if (val != -1) {
+        return val;
+    }
+    val = SDL_HasNEON();
+    return val;
+}
+
+static inline void
+INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
+{
+    uint8x8_t  x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
+    uint16x8_t k0;
+    uint32x4_t l0;
+    uint16x8_t d0;
+    uint8x8_t  e0;
+
+    x_00_01 = (uint8x8_t)vld1_u32(s0); /* Load 2 pixels */
+    x_10_11 = (uint8x8_t)vld1_u32(s1);
+
+    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
+    k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
+    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */
+
+    /* k0 now contains 2 interpolated pixels { j0, j1 } */
+    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
+    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
+    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);
+
+    /* Shift and narrow */
+    d0 = vcombine_u16(
+            /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
+            /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION)
+         );
+
+    /* Narrow again */
+    e0 = vmovn_u16(d0);
+
+    /* Store 1 pixel */
+    *dst = vget_lane_u32((uint32x2_t)e0, 0);
+}
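+
+/* scale_mat_NEON below computes four destination pixels per loop iteration,
+   then handles a remaining pair and a single trailing pixel, mirroring the
+   SSE2 version. */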
+
+static int
+scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
+{
+    BILINEAR___START
+
+    for (i = 0; i < dst_h; i++) {
+        int nb_block4;
+        uint8x8_t v_frac_h0, v_frac_h1;
+
+        BILINEAR___HEIGHT
+
+        nb_block4 = middle / 4;
+
+        v_frac_h0 = vmov_n_u8(frac_h0);
+        v_frac_h1 = vmov_n_u8(frac_h1);
+
+        while (left_pad_w--) {
+            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
+            dst += 1;
+        }
+
+        while (nb_block4--) {
+            int index_w_0, frac_w_0;
+            int index_w_1, frac_w_1;
+            int index_w_2, frac_w_2;
+            int index_w_3, frac_w_3;
+
+            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
+            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;
+
+            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
+            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;
+
+            uint16x8_t k0, k1, k2, k3;
+            uint32x4_t l0, l1, l2, l3;
+            uint16x8_t d0, d1;
+            uint8x8_t  e0, e1;
+            uint32x4_t f0;
+
+            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_0  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_1  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_2  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_3  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+
+            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
+            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
+            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
+            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
+            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
+            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
+            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
+            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);
+
+            /* Interpolation vertical */
+            x_00_01 = (uint8x8_t)vld1_u32(s_00_01); /* Load 2 pixels */
+            x_02_03 = (uint8x8_t)vld1_u32(s_02_03);
+            x_04_05 = (uint8x8_t)vld1_u32(s_04_05);
+            x_06_07 = (uint8x8_t)vld1_u32(s_06_07);
+            x_10_11 = (uint8x8_t)vld1_u32(s_10_11);
+            x_12_13 = (uint8x8_t)vld1_u32(s_12_13);
+            x_14_15 = (uint8x8_t)vld1_u32(s_14_15);
+            x_16_17 = (uint8x8_t)vld1_u32(s_16_17);
+
+            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
+            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
+            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */
+
+            k1 = vmull_u8(x_02_03, v_frac_h1);
+            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);
+
+            k2 = vmull_u8(x_04_05, v_frac_h1);
+            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);
+
+            k3 = vmull_u8(x_06_07, v_frac_h1);
+            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);
+
+            /* k0 now contains 2 interpolated pixels { j0, j1 } */
+            /* k1 now contains 2 interpolated pixels { j2, j3 } */
+            /* k2 now contains 2 interpolated pixels { j4, j5 } */
+            /* k3 now contains 2 interpolated pixels { j6, j7 } */
+
+            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
+            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
+            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);
+
+            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
+            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
+            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);
+
+            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
+            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
+            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);
+
+            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
+            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
+            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);
+
+            /* Shift and narrow */
+            d0 = vcombine_u16(
+                    /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
+                    /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
+                 );
+            /* Narrow again */
+            e0 = vmovn_u16(d0);
+
+            /* Shift and narrow */
+            d1 = vcombine_u16(
+                    /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
+                    /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION)
+                 );
+            /* Narrow again */
+            e1 = vmovn_u16(d1);
+
+            f0 = vcombine_u32((uint32x2_t)e0, (uint32x2_t)e1);
+            /* Store 4 pixels */
+            vst1q_u32(dst, f0);
+
+            dst += 4;
+        }
+
+        if (middle & 0x2) {
+            int index_w_0, frac_w_0;
+            int index_w_1, frac_w_1;
+            const Uint32 *s_00_01, *s_02_03;
+            const Uint32 *s_10_11, *s_12_13;
+            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
+            uint16x8_t k0, k1;
+            uint32x4_t l0, l1;
+            uint16x8_t d0;
+            uint8x8_t  e0;
+
+            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_0  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
+            frac_w_1  = FRAC(fp_sum_w);
+            fp_sum_w += fp_step_w;
+/*
+            x00............ x01   x02...........x03
+            .   .       .   .     .   .      .  .
+            j0  dest0   j1        j2  dest1     j3
+            .   .       .   .     .   .      .  .
+            .   .       .   .     .   .      .  .
+            .   .       .   .     .   .      .  .
+            x10............ x11   x12...........x13
+*/
+            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
+            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
+            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
+            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
+
+            /* Interpolation vertical */
+            x_00_01 = (uint8x8_t)vld1_u32(s_00_01); /* Load 2 pixels */
+            x_02_03 = (uint8x8_t)vld1_u32(s_02_03);
+            x_10_11 = (uint8x8_t)vld1_u32(s_10_11);
+            x_12_13 = (uint8x8_t)vld1_u32(s_12_13);
+
+            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
+            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
+            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */
+
+            k1 = vmull_u8(x_02_03, v_frac_h1);
+            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);
+
+            /* k0 now contains 2 interpolated pixels { j0, j1 } */
+            /* k1 now contains 2 interpolated pixels { j2, j3 } */
+
+            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
+            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
+            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);
+
+            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
+            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
+            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);
+
+            /* Shift and narrow */
+            d0 = vcombine_u16(
+                    /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
+                    /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
+                 );
+
+            /* Narrow again */
+            e0 = vmovn_u16(d0);
+
+            /* Store 2 pixels */
+            vst1_u32(dst, (uint32x2_t)e0);
+            dst += 2;
+        }
+
+        /* Last point */
+        if (middle & 0x1) {
+            int index_w = 4 * SRC_INDEX(fp_sum_w);
+            int frac_w = FRAC(fp_sum_w);
+            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
+            dst += 1;
+        }
+
+        while (right_pad_w--) {
+            int index_w = 4 * (src_w - 2);
+            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
+            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
+            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
+            dst += 1;
+        }
+
+        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
+    }
+    return 0;
+}
+#endif
+
+static int
+SDL_SoftStretchLowerLinear(SDL_Surface *s, const SDL_Rect *srcrect,
+                           SDL_Surface *d, const SDL_Rect *dstrect)
+{
+    int ret = -1;
+    int src_w = srcrect->w;
+    int src_h = srcrect->h;
+    int dst_w = dstrect->w;
+    int dst_h = dstrect->h;
+    int src_pitch = s->pitch;
+    int dst_pitch = d->pitch;
+    Uint32 *src = (Uint32 *) ((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
+    Uint32 *dst = (Uint32 *) ((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);
+
+#if defined(HAVE_NEON_INTRINSICS)
+    if (ret == -1 && hasNEON()) {
+        ret = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
+    }
+#endif
+
+#if defined(HAVE_SSE2_INTRINSICS)
+    if (ret == -1 && hasSSE2()) {
+        ret = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
+    }
+#endif
+
+    if (ret == -1) {
+        /* Scalar fallback; record its result so success is reported correctly */
+        ret = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
+    }
+
+    return ret;
+}
+
 /* vi: set ts=4 sw=4 expandtab: */
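
Usage sketch (editorial addition, not part of the patch): once this change is applied, an application can call the new entry point directly. The surface sizes and the ARGB8888 format below are arbitrary choices for the example; the only requirement is that both surfaces share the same 32-bit pixel format.

#include "SDL.h"

static int
scale_example(void)
{
    /* Hypothetical surfaces; any matching 32BPP format works */
    SDL_Surface *src = SDL_CreateRGBSurfaceWithFormat(0, 320, 200, 32, SDL_PIXELFORMAT_ARGB8888);
    SDL_Surface *dst = SDL_CreateRGBSurfaceWithFormat(0, 640, 400, 32, SDL_PIXELFORMAT_ARGB8888);
    SDL_Rect srcrect = { 0, 0, 320, 200 };
    SDL_Rect dstrect = { 0, 0, 640, 400 };
    int ret = -1;

    if (src && dst) {
        /* ... draw into src here ... */
        ret = SDL_SoftStretchLinear(src, &srcrect, dst, &dstrect);
        if (ret != 0) {
            SDL_Log("SDL_SoftStretchLinear failed: %s", SDL_GetError());
        }
    }
    SDL_FreeSurface(src); /* SDL_FreeSurface(NULL) is a safe no-op */
    SDL_FreeSurface(dst);
    return ret;
}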