Norman Feske 3909f9b6e4 blit: lift 8x8 restriction from back2front
This patch allows the back2front operation to be used with textures
whose dimensions are not a multiple of 8x8 pixels, which makes the
utility compatible with screen resolutions like 1366x768. In such
cases, the implementation falls back to the non-SIMD variant, as
sketched below.

Issue #5428
Issue #5501
2025-04-10 14:26:47 +02:00
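
A minimal sketch of the dispatch described in the commit message, assuming
the hypothetical helpers 'copy_simd' and 'copy_scalar' (the actual Genode
code integrates the check differently):

#include <cstdint>

/* hypothetical scalar fallback without size restrictions */
static void copy_scalar(uint32_t *dst, uint32_t const *src, unsigned n)
{
	for (unsigned i = 0; i < n; i++)
		dst[i] = src[i];
}

/* hypothetical SIMD path, valid only for 8x8-multiple dimensions */
static void copy_simd(uint32_t *dst, uint32_t const *src, unsigned n)
{
	copy_scalar(dst, src, n);  /* stand-in for the SSE4 implementation below */
}

/* use the SIMD variant only if both dimensions are multiples of 8 */
static void back2front(uint32_t *dst, uint32_t const *src, unsigned w, unsigned h)
{
	if ((w % 8) == 0 && (h % 8) == 0)
		copy_simd(dst, src, w*h);
	else
		copy_scalar(dst, src, w*h);  /* e.g., 1366x768 (1366 % 8 != 0) */
}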


/*
 * \brief  2D memory copy using SSE4
 * \author Norman Feske
 * \date   2025-01-21
 */

/*
 * Copyright (C) 2025 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__BLIT__INTERNAL__SSE4_H_
#define _INCLUDE__BLIT__INTERNAL__SSE4_H_

#include <blit/types.h>

/* compiler intrinsics */
#ifndef _MM_MALLOC_H_INCLUDED /* discharge dependency from stdlib.h */
#define _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED_PREVENTED
#endif
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#include <immintrin.h>
#pragma GCC diagnostic pop
#ifdef _MM_MALLOC_H_INCLUDED_PREVENTED
#undef _MM_MALLOC_H_INCLUDED
#undef _MM_MALLOC_H_INCLUDED_PREVENTED
#endif

namespace Blit { struct Sse4; };

struct Blit::Sse4
{
	/**
	 * Padded hex output utility
	 */
	template <typename T>
	struct Phex : Hex { explicit Phex(T v) : Hex(v, OMIT_PREFIX, PAD) { } };

	/**
	 * Vector output utility
	 */
	template <typename T>
	union Vec_as
	{
		__m128i v;

		static constexpr unsigned N = 128/(8*sizeof(T));

		T u[N];

		Vec_as(__m128i v) : v(v) { }

		void print(Output &out) const
		{
			for (unsigned i = 0; i < N; i++)
				Genode::print(out, Phex(u[i]), i < (N-1) ? "." : "");
		}
	};
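
	/**
	 * 4x4-pixel tile, accessible as integer or float vectors
	 *
	 * The float view enables the use of '_MM_TRANSPOSE4_PS' for
	 * transposing the tile during rotation.
	 */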
	union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; };
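
	/**
	 * Read pointers into four source lines, spaced by 'step' 16-byte words
	 */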
	struct Src_ptr4
	{
		__m128i const *p0, *p1, *p2, *p3;

		inline Src_ptr4(__m128i const *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void prefetch() const
		{
			_mm_prefetch(p0, _MM_HINT_T0); _mm_prefetch(p1, _MM_HINT_T0);
			_mm_prefetch(p2, _MM_HINT_T0); _mm_prefetch(p3, _MM_HINT_T0);
		}

		void load(Tile_4x4 &tile) const
		{
			tile.pi[0] = _mm_load_si128(p0); tile.pi[1] = _mm_load_si128(p1);
			tile.pi[2] = _mm_load_si128(p2); tile.pi[3] = _mm_load_si128(p3);
		}
	};
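
	/**
	 * Write pointers into four destination lines, using non-temporal stores
	 */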
	struct Dst_ptr4
	{
		__m128i *p0, *p1, *p2, *p3;

		Dst_ptr4(__m128i *p, int const step_4)
		:
			p0(p), p1(p0 + step_4), p2(p1 + step_4), p3(p2 + step_4)
		{ }

		void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void store(Tile_4x4 const &tile) const
		{
			_mm_stream_si128(p0, tile.pi[0]); _mm_stream_si128(p1, tile.pi[1]);
			_mm_stream_si128(p2, tile.pi[2]); _mm_stream_si128(p3, tile.pi[3]);
		}
	};
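
	/**
	 * Signed line strides of source and destination in 4-pixel (16-byte) units
	 */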
	struct Steps { int src_y_4, dst_y_4; };
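
	/**
	 * Copy a line of 'len_8' 8-pixel chunks while reversing the pixel order
	 */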
	static inline void _reverse_line(__m128i const *s, __m128i *d, unsigned len_8)
	{
		static constexpr int reversed = (0 << 6) | (1 << 4) | (2 << 2) | 3;

		d += 2*len_8;   /* move 'dst' from end towards begin */

		while (len_8--) {
			__m128i const v0 = _mm_load_si128(s++);
			__m128i const v1 = _mm_load_si128(s++);
			_mm_stream_si128(--d, _mm_shuffle_epi32(v0, reversed));
			_mm_stream_si128(--d, _mm_shuffle_epi32(v1, reversed));
		}
	}
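
	/**
	 * Copy a line of 'len_8' 8-pixel chunks using non-temporal stores
	 */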
	static inline void _copy_line(__m128i const *s, __m128i *d, unsigned len_8)
	{
		while (len_8--) {
			__m128i const v0 = _mm_load_si128(s++);
			__m128i const v1 = _mm_load_si128(s++);
			_mm_stream_si128(d++, v0);
			_mm_stream_si128(d++, v1);
		}
	}
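
	/**
	 * Transpose a strip of four source lines into a 4-pixel-wide
	 * destination column, one 4x4 tile at a time
	 */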
	static inline void _rotate_4_lines(Src_ptr4 src, Dst_ptr4 dst,
	                                   unsigned len_4, auto const dst_4_step)
	{
		Tile_4x4 t;
		while (len_4--) {
			src.load(t);
			src.incr_4(1);
			src.prefetch();
			_MM_TRANSPOSE4_PS(t.ps[0], t.ps[1], t.ps[2], t.ps[3]);
			dst.store(t);
			dst.incr_4(dst_4_step);
		}
	}
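
	/**
	 * Rotate an area of 'w'*'h' 8x8-pixel tiles, with the rotation
	 * direction encoded in the signs of 'steps'
	 */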
	static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
	                           Steps const steps, unsigned w, unsigned h)
	{
		for (unsigned i = 2*h; i; i--) {
			_rotate_4_lines(src, dst, 2*w, 4*steps.dst_y_4);
			src.incr_4(4*steps.src_y_4);
			dst.incr_4(1);
		}
	}

	struct B2f;
	struct B2f_flip;
	struct Blend;
};
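

/**
 * Back-to-front copy for the four rotation cases
 */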
struct Blit::Sse4::B2f
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
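
/*
 * The 'line_w', 'dst_w', 'src_w', 'w', and 'h' arguments are given in
 * pixels and assumed to be multiples of 8 (hence the '>>= 3' conversions
 * to 8-pixel units). Other sizes are served by the non-SIMD fallback
 * mentioned in the commit message.
 */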
void Blit::Sse4::B2f::r0(uint32_t *dst, unsigned line_w,
                         uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_copy_line(s, d, w);
		s += 2*line_w;
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w,
                          uint32_t const *src, unsigned src_w,
                          unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), 2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src + 2*src_w*(8*h - 1), steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

void Blit::Sse4::B2f::r180(uint32_t *dst, unsigned line_w,
                           uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i       *d = (__m128i *)dst;
	__m128i const *s = (__m128i const *)src + 2*line_w*8*h;

	for (unsigned i = h*8; i; i--) {
		s -= 2*line_w;
		_reverse_line(s, d, w);
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f::r270(uint32_t *dst, unsigned dst_w,
                           uint32_t const *src, unsigned src_w,
                           unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), -2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(8*w - 1), steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
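

/**
 * Back-to-front copy with mirrored output for the four rotation cases
 */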
struct Blit::Sse4::B2f_flip
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};

void Blit::Sse4::B2f_flip::r0(uint32_t *dst, unsigned line_w,
                              uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_reverse_line(s, d, w);
		s += 2*line_w;
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
                               uint32_t const *src, unsigned src_w,
                               unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), 2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

void Blit::Sse4::B2f_flip::r180(uint32_t *dst, unsigned line_w,
                                uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		s -= 2*line_w;
		_copy_line(s, d, w);
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
                                uint32_t const *src, unsigned src_w,
                                unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), -2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src + 2*int(src_w)*(h*8 - 1), steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(w*8 - 1), steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
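

/**
 * Alpha blending of XRGB pixels
 */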
struct Blit::Sse4::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
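
	/*
	 * Scale the r, g, and b channels of 'xrgb' by 'alpha' (0..256)
	 *
	 * The g channel and the r/b channel pair are multiplied separately
	 * so that each 16-bit product stays clear of its neighboring
	 * channels before being shifted back into place.
	 */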
	__attribute__((optimize("-O3")))
	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
	{
		return (alpha * ((xrgb & 0xff00) >> 8) & 0xff00)
		     | (((alpha * (xrgb & 0xff00ff)) >> 8) & 0xff00ff);
	}
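
	/*
	 * Mix 'bg' and 'fg' according to 'alpha', weighting the background
	 * with (256 - alpha) and the foreground with (alpha + 1) so that
	 * alpha values 0 and 255 yield (nearly) pure background and
	 * foreground without a division by 255
	 */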
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
	{
		return (__builtin_expect(alpha == 0, false))
		     ? bg : _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
	}

	struct Mix_masks
	{
		/* masks for distributing alpha values to 16-bit r, g, b lanes */
		__m128i const a01 = _mm_set_epi32(0x03020302, 0x03020302, 0x01000100, 0x01000100);
		__m128i const a23 = _mm_set_epi32(0x07060706, 0x07060706, 0x05040504, 0x05040504);
	};

	__attribute__((optimize("-O3")))
	static inline void _mix_4(uint32_t *, uint32_t const *, uint8_t const *, Mix_masks const);
};
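
/*
 * SIMD variant of '_mix', blending four pixels at once: the 8-bit channels
 * are zero-extended to 16 bits, weighted by the per-pixel alpha values, and
 * packed back with unsigned saturation.
 */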
__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::_mix_4(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha, Mix_masks const masks)
{
	uint32_t const a_u8_x4 = *(uint32_t const *)alpha;

	if (__builtin_expect(a_u8_x4 == 0, false))
		return;

	auto upper_half = [&] (__m128i const v) { return _mm_shuffle_epi32(v, 2 + (3<<2)); };

	__m128i const

		/* load four foreground pixel, background pixel, and alpha values */
		fg_u8_4x4 = _mm_loadu_si128((__m128i const *)fg),
		bg_u8_4x4 = _mm_loadu_si128((__m128i const *)bg),

		/* extract first and second pair of pixel values */
		fg01_u16_4x2 = _mm_cvtepu8_epi16(fg_u8_4x4),
		fg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(fg_u8_4x4)),
		bg01_u16_4x2 = _mm_cvtepu8_epi16(bg_u8_4x4),
		bg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(bg_u8_4x4)),

		/* prepare 4 destination and source alpha values */
		a_u16_x4  = _mm_cvtepu8_epi16(_mm_set1_epi32(a_u8_x4)),
		da_u16_x4 = _mm_sub_epi16(_mm_set1_epi16(256), a_u16_x4),
		sa_u16_x4 = _mm_add_epi16(a_u16_x4, _mm_set1_epi16(1)),

		/* mix first pixel pair */
		da01_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a01),
		sa01_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a01),
		mixed01      = _mm_add_epi16(_mm_mullo_epi16(fg01_u16_4x2, sa01_u16_4x2),
		                             _mm_mullo_epi16(bg01_u16_4x2, da01_u16_4x2)),

		/* mix second pixel pair */
		da23_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a23),
		sa23_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a23),
		mixed23      = _mm_add_epi16(_mm_mullo_epi16(fg23_u16_4x2, sa23_u16_4x2),
		                             _mm_mullo_epi16(bg23_u16_4x2, da23_u16_4x2)),

		result_4x4 = _mm_packus_epi16(_mm_srli_epi16(mixed01, 8),
		                              _mm_srli_epi16(mixed23, 8));

	_mm_storeu_si128((__m128i *)bg, result_4x4);
}

__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	Mix_masks const mix_masks { };

	for (; n > 3; n -= 4, dst += 4, pixel += 4, alpha += 4)
		_mix_4(dst, pixel, alpha, mix_masks);

	for (; n--; dst++, pixel++, alpha++)
		*dst = _mix(*dst, *pixel, *alpha);
}

#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */