blit: lift 8x8 restriction from back2front

This patch allows the back2front operation to be used with textures
whose size is not a multiple of 8x8 pixels, which makes the utility
compatible with screen resolutions like 1366x768. In such cases, the
implementation falls back to the non-SIMD variant.

Issue #5428
Issue #5501
This commit is contained in:
Norman Feske 2025-04-02 11:54:27 +02:00
parent 3ba0e6fda3
commit 3909f9b6e4
6 changed files with 96 additions and 51 deletions

View File

@ -186,9 +186,11 @@ struct Blit::Neon::B2f
};
void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
void Blit::Neon::B2f::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;
@ -200,10 +202,12 @@ void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w,
}
void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
void Blit::Neon::B2f::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { -2*int(src_w), 8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
@ -213,9 +217,11 @@ void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w,
}
void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
void Blit::Neon::B2f::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
uint32x4_t *d = (uint32x4_t *)dst;
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
@ -227,10 +233,12 @@ void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w,
}
void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
void Blit::Neon::B2f::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { 2*int(src_w), -8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
@ -249,9 +257,11 @@ struct Blit::Neon::B2f_flip
};
void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;
@ -263,10 +273,12 @@ void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w,
}
void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { 2*int(src_w), 8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
@ -276,9 +288,11 @@ void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w,
}
void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
uint32x4_t *d = (uint32x4_t *)dst;
@ -290,10 +304,12 @@ void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w,
}
void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { -2*int(src_w), -8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);

View File

@ -31,9 +31,9 @@ namespace Blit {
uint32_t *dst,
unsigned w, unsigned h, int dx, int dy)
{
for (unsigned lines = h*8; lines; lines--) {
_write_line(src, dst, 8*w, dx);
src += 8*src_w;
for (unsigned lines = h; lines; lines--) {
_write_line(src, dst, w, dx);
src += src_w;
dst += dy;
}
};
@ -60,7 +60,7 @@ struct Blit::Slow::B2f
/*
 * Non-rotated transfer of a 'w' x 'h' window from 'src' to 'dst'
 *
 * The work is delegated to '_write_lines' with dx = 1 and a destination
 * step of one 'line_w' per source line. 'line_w' is used as the line width
 * of both buffers.
 *
 * NOTE(review): the two '_write_lines' statements look like the removed and
 * the added line of this patch (arguments in 8x8 units vs. plain pixel
 * units) shown next to each other by the diff view, not two consecutive
 * calls - confirm against the actual file.
 */
void Blit::Slow::B2f::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
_write_lines(src, line_w, dst, w, h, 1, 8*line_w);
_write_lines(src, line_w, dst, w, h, 1, line_w);
}
@ -68,15 +68,15 @@ void Blit::Slow::B2f::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
_write_lines(src, src_w, dst + 8*h - 1, w, h, 8*dst_w, -1);
_write_lines(src, src_w, dst + h - 1, w, h, dst_w, -1);
}
/*
 * Transfer of a 'w' x 'h' window from 'src' to 'dst', rotated by 180 degrees
 *
 * 'dst' is first moved to the last pixel of the last line of the window
 * (w - 1 pixels to the right, h - 1 lines down). '_write_lines' is then
 * called with dx = -1 and a destination step of minus one 'line_w' per
 * source line, so the destination is walked backwards.
 *
 * NOTE(review): each statement appears twice because the diff view shows
 * the removed variant (8x8 units) directly above the added variant (pixel
 * units) - confirm against the actual file.
 */
void Blit::Slow::B2f::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
dst += 8*w - 1 + (8*h - 1)*8*line_w;
_write_lines(src, line_w, dst, w, h, -1, -8*line_w);
dst += w - 1 + (h - 1)*line_w;
_write_lines(src, line_w, dst, w, h, -1, -line_w);
}
@ -84,8 +84,8 @@ void Blit::Slow::B2f::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst += 8*dst_w*(8*w - 1);
_write_lines(src, src_w, dst, w, h, -8*dst_w, 1);
dst += dst_w*(w - 1);
_write_lines(src, src_w, dst, w, h, -dst_w, 1);
}
@ -101,7 +101,7 @@ struct Blit::Slow::B2f_flip
/*
 * Flipped, non-rotated transfer of a 'w' x 'h' window from 'src' to 'dst'
 *
 * Writing starts at the last pixel of the first destination line
 * (dst + w - 1) with dx = -1, while the destination advances by one
 * 'line_w' per source line. Presumably this writes each source line
 * right-to-left, i.e., mirrored horizontally.
 *
 * NOTE(review): the two '_write_lines' statements look like the removed
 * (8x8 units) and the added (pixel units) line of this patch shown next to
 * each other by the diff view - confirm against the actual file.
 */
void Blit::Slow::B2f_flip::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
_write_lines(src, line_w, dst + 8*w - 1, w, h, -1, 8*line_w);
_write_lines(src, line_w, dst + w - 1, w, h, -1, line_w);
}
@ -109,15 +109,15 @@ void Blit::Slow::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
_write_lines(src, src_w, dst, w, h, 8*dst_w, 1);
_write_lines(src, src_w, dst, w, h, dst_w, 1);
}
/*
 * Flipped transfer of a 'w' x 'h' window from 'src' to 'dst', rotated by
 * 180 degrees
 *
 * 'dst' is first moved to the start of the last line of the window
 * (h - 1 lines down). '_write_lines' is then called with dx = 1 and a
 * destination step of minus one 'line_w' per source line, so the source
 * lines end up in reverse vertical order.
 *
 * NOTE(review): each statement appears twice because the diff view shows
 * the removed variant (8x8 units) directly above the added variant (pixel
 * units) - confirm against the actual file.
 */
void Blit::Slow::B2f_flip::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
dst += (8*h - 1)*8*line_w;
_write_lines(src, line_w, dst, w, h, 1, -8*line_w);
dst += (h - 1)*line_w;
_write_lines(src, line_w, dst, w, h, 1, -line_w);
}
@ -125,8 +125,8 @@ void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst += 8*h - 1 + 8*dst_w*(8*w - 1);
_write_lines(src, src_w, dst, w, h, -8*dst_w, -1);
dst += h - 1 + dst_w*(w - 1);
_write_lines(src, src_w, dst, w, h, -dst_w, -1);
}

View File

@ -173,6 +173,8 @@ struct Blit::Sse4::B2f
void Blit::Sse4::B2f::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
__m128i const *s = (__m128i const *)src;
__m128i *d = (__m128i *)dst;
@ -188,6 +190,8 @@ void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { -2*int(src_w), 2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src + 2*src_w*(8*h - 1), steps.src_y_4);
@ -200,6 +204,8 @@ void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w,
void Blit::Sse4::B2f::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
__m128i *d = (__m128i *)dst;
__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
@ -215,6 +221,8 @@ void Blit::Sse4::B2f::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { 2*int(src_w), -2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
@ -236,6 +244,8 @@ struct Blit::Sse4::B2f_flip
void Blit::Sse4::B2f_flip::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
__m128i const *s = (__m128i const *)src;
__m128i *d = (__m128i *)dst;
@ -251,6 +261,8 @@ void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { 2*int(src_w), 2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
@ -263,6 +275,8 @@ void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
void Blit::Sse4::B2f_flip::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
line_w >>= 3, w >>= 3, h >>= 3;
__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
__m128i *d = (__m128i *)dst;
@ -278,6 +292,8 @@ void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;
Steps const steps { -2*int(src_w), -2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src + 2*int(src_w)*(h*8 - 1), steps.src_y_4);

View File

@ -74,6 +74,8 @@ namespace Blit {
.y = ((r.y2() + 8) & ~0x7) - 1 });
}
/**
 * Return true if both the width and the height of 'a' are multiples of 8
 */
static inline bool divisable_by_8x8(Area a)
{
	/* a value is a multiple of 8 exactly if its three lowest bits are zero */
	bool const w_aligned = (a.w & 0x7) == 0;
	bool const h_aligned = (a.h & 0x7) == 0;

	return w_aligned && h_aligned;
}
template <typename B2F>
static inline void _b2f(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
@ -92,23 +94,17 @@ namespace Blit {
Texture<Pixel_rgb888> const &texture,
Rect rect, Rotate rotate, Flip flip)
{
/* surface size must be divisible by 8 */
if (!aligned(surface.size().w, 2) || !aligned(surface.size().h, 2)) {
warning("surface size ", surface.size(), " not divisible by 8");
return;
}
/* check compatibility of surface with texture */
if (transformed(surface.size(), rotate) != texture.size()) {
warning("surface ", surface.size(), " mismatches texture ", texture.size());
return;
}
/* restrict rect to texture size */
rect = Rect::intersect(rect, Rect { { }, texture.size() });
/* snap src coordinates to multiple of px, restrict to texture size */
Rect const src_rect = Rect::intersect(snapped_to_8x8_grid(rect),
Rect { { }, texture.size() });
/* compute base addresses of affected pixel window */
Rect const src_rect = snapped_to_8x8_grid(rect);
Rect const dst_rect = transformed(src_rect, texture.size(), rotate, flip);
uint32_t const * const src = (uint32_t const *)texture.pixel()
@ -119,11 +115,10 @@ namespace Blit {
+ dst_rect.y1()*surface.size().w
+ dst_rect.x1();
/* coordinates converted to 8x8 units */
unsigned const src_w = texture.size().w >> 3,
dst_w = surface.size().w >> 3,
w = src_rect.area.w >> 3,
h = src_rect.area.h >> 3;
unsigned const src_w = texture.size().w,
dst_w = surface.size().w,
w = src_rect.area.w,
h = src_rect.area.h;
if (w && h) {
if (flip.enabled)

View File

@ -20,7 +20,16 @@
namespace Blit {
/*
 * Variadic wrapper that hands all arguments to the 'Neon' instantiation
 * of '_b2f'
 *
 * NOTE(review): in this diff view, this one-liner looks like the removed
 * pre-patch definition that the explicit 'back2front' function following
 * it replaces - confirm against the actual file.
 */
static inline void back2front (auto &&... args) { _b2f<Neon>(args...); }
/**
 * Transfer the 'rect' portion of 'texture' to 'surface', applying 'rotate'
 * and 'flip'
 *
 * The 'Neon' variant of '_b2f' is used only if the width and the height of
 * the texture are both multiples of 8. Any other texture size is handled
 * by the 'Slow' variant.
 */
static inline void back2front(Surface<Pixel_rgb888>       &surface,
                              Texture<Pixel_rgb888> const &texture,
                              Rect rect, Rotate rotate, Flip flip)
{
	if (!divisable_by_8x8(texture.size())) {
		_b2f<Slow>(surface, texture, rect, rotate, flip);
		return;
	}

	_b2f<Neon>(surface, texture, rect, rotate, flip);
}
/**
 * Pass all arguments on to 'Neon::Blend::xrgb_a'
 */
static inline void blend_xrgb_a(auto &&... args)
{
	Neon::Blend::xrgb_a(args...);
}
}

View File

@ -20,7 +20,16 @@
namespace Blit {
/*
 * Variadic wrapper that hands all arguments to the 'Sse4' instantiation
 * of '_b2f'
 *
 * NOTE(review): in this diff view, this one-liner looks like the removed
 * pre-patch definition that the explicit 'back2front' function following
 * it replaces - confirm against the actual file.
 */
static inline void back2front (auto &&... args) { _b2f<Sse4>(args...); }
/**
 * Transfer the 'rect' portion of 'texture' to 'surface', applying 'rotate'
 * and 'flip'
 *
 * The 'Sse4' variant of '_b2f' is used only if the width and the height of
 * the texture are both multiples of 8. Any other texture size is handled
 * by the 'Slow' variant.
 */
static inline void back2front(Surface<Pixel_rgb888>       &surface,
                              Texture<Pixel_rgb888> const &texture,
                              Rect rect, Rotate rotate, Flip flip)
{
	if (!divisable_by_8x8(texture.size())) {
		_b2f<Slow>(surface, texture, rect, rotate, flip);
		return;
	}

	_b2f<Sse4>(surface, texture, rect, rotate, flip);
}
/**
 * Pass all arguments on to 'Sse4::Blend::xrgb_a'
 */
static inline void blend_xrgb_a(auto &&... args)
{
	Sse4::Blend::xrgb_a(args...);
}
}