blit: SIMD-based back2front copy

Issue #5428
This commit is contained in:
Norman Feske 2025-01-16 19:35:01 +01:00 committed by Christian Helmuth
parent 10a45b78d0
commit dc9ff4b3e3
16 changed files with 1266 additions and 22 deletions

View File

@ -1,34 +1,62 @@
/*
* \brief Interface of 2D-copy library
* \brief Blit API
* \author Norman Feske
* \date 2007-10-10
* \date 2025-01-16
*/
/*
* Copyright (C) 2007-2017 Genode Labs GmbH
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__BLIT__BLIT_H_
#define _INCLUDE__BLIT__BLIT_H_
#ifndef _INCLUDE__BLIT_H_
#define _INCLUDE__BLIT_H_
/**
* Blit memory from source buffer to destination buffer
*
* \param src address of source buffer
* \param src_w line length of source buffer in bytes
* \param dst address of destination buffer
* \param dst_w line length of destination buffer in bytes
* \param w number of bytes per line to copy
* \param h number of lines to copy
*
* This function works at a granularity of 16bit.
* If the source and destination overlap, the result
* of the copy operation is not defined.
*/
extern "C" void blit(void const *src, unsigned src_w,
void *dst, unsigned dst_w, int w, int h);
#include <blit/types.h>
#include <blit/internal/slow.h>
#endif /* _INCLUDE__BLIT__BLIT_H_ */
namespace Blit {
/**
* Back-to-front copy
*
* Copy a rectangular part of a texture to a surface while optionally
* applying rotation and flipping. The width and height of the texture
* must be divisible by 8. Surface and texture must perfectly line up.
* E.g., when rotating by 90 degrees, the texture width must equal the
* surface height and vice versa. The clipping area of the surface is
* ignored.
*
* The combination of rotate and flip arguments works as follows:
*
* normal flipped
*
* rotated 0 0 1 2 3 3 2 1 0
* 4 5 6 7 7 6 5 4
* 8 9 10 11 11 10 9 8
*
* rotated 90 8 4 0 0 4 8
* 9 5 1 1 5 9
* 10 6 2 2 6 10
* 11 7 3 3 7 11
*
* rotated 180 11 10 9 8 8 9 10 11
* 7 6 5 4 4 5 6 7
* 3 2 1 0 0 1 2 3
*
* rotated 270 3 7 11 11 7 3
* 2 6 10 10 6 2
* 1 5 9 9 5 1
* 0 4 8 8 4 0
*/
static inline void back2front(Surface<Pixel_rgb888> &surface,
Texture<Pixel_rgb888> const &texture,
Rect rect, Rotate rotate, Flip flip)
{
_b2f<Slow>(surface, texture, rect, rotate, flip);
}
}
#endif /* _INCLUDE__BLIT_H_ */

View File

@ -0,0 +1,294 @@
/*
* \brief 2D memory copy using ARM NEON
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_
#define _INCLUDE__BLIT__INTERNAL__NEON_H_
#include <blit/types.h>
/* compiler intrinsics */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnarrowing"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wfloat-conversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <arm_neon.h>
#pragma GCC diagnostic pop
namespace Blit { struct Neon; }
struct Blit::Neon
{
static inline uint32x4_t _reversed(uint32x4_t const v)
{
return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
}
static inline void _reverse_line(uint32x4_t const *src, uint32x4_t *dst, unsigned len)
{
src += len; /* move 'src' from end of line towards begin */
while (len--)
*dst++ = _reversed(*--src);
};
static inline void _copy_line(uint32x4_t const *s, uint32x4_t *d, unsigned len)
{
while (len--)
*d++ = *s++;
};
struct Src_ptr4
{
uint32x4_t const *p0, *p1, *p2, *p3;
inline Src_ptr4(uint32x4_t const *p, int const step)
:
p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
{ }
void incr_4(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }
void prefetch() const
{
__builtin_prefetch(p0); __builtin_prefetch(p1);
__builtin_prefetch(p2); __builtin_prefetch(p3);
}
void load(uint32x4x4_t &tile) const { tile = { *p0, *p1, *p2, *p3 }; }
};
struct Dst_ptr4
{
uint32_t *p0, *p1, *p2, *p3;
Dst_ptr4(uint32_t *p, int const step)
:
p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
{ }
void incr(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }
void store(uint32x4x4_t const &tile)
{
vst4q_lane_u32(p0, tile, 0);
vst4q_lane_u32(p1, tile, 1);
vst4q_lane_u32(p2, tile, 2);
vst4q_lane_u32(p3, tile, 3);
}
};
struct Steps
{
int const src_y, dst_y;
void incr_x_4(Src_ptr4 &p) const { p.incr_4(1); };
void incr_x_8(Src_ptr4 &p) const { p.incr_4(2); };
void incr_y_4(Src_ptr4 &p) const { p.incr_4(src_y << 2); };
void incr_y_8(Src_ptr4 &p) const { p.incr_4(src_y << 3); };
void incr_x_4(Dst_ptr4 &p) const { p.incr(4); };
void incr_x_8(Dst_ptr4 &p) const { p.incr(8); };
void incr_y_4(Dst_ptr4 &p) const { p.incr(dst_y << 2); };
void incr_y_8(Dst_ptr4 &p) const { p.incr(dst_y << 3); };
};
__attribute__((optimize("-O3")))
static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst, Steps const steps)
{
uint32x4x4_t tile;
src.load(tile);
steps.incr_y_4(src);
src.prefetch();
dst.store(tile);
steps.incr_x_4(dst);
}
__attribute__((optimize("-O3")))
static inline void _rotate_8x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
for (unsigned i = 0; i < 2; i++)
_load_prefetch_store(src, dst, steps);
}
__attribute__((optimize("-O3")))
static inline void _rotate_8x4_last(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
_load_prefetch_store(src, dst, steps);
uint32x4x4_t tile;
src.load(tile);
dst.store(tile);
}
__attribute__((optimize("-O3")))
static inline void _rotate_8x8(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
_rotate_8x4(src, dst, steps);
steps.incr_y_4(dst);
steps.incr_x_4(src);
_rotate_8x4_last(src, dst, steps);
}
__attribute__((optimize("-O3")))
static inline void _rotate_8_lines(Src_ptr4 src, Dst_ptr4 dst,
Steps const steps, unsigned n)
{
for (; n; n--) {
_rotate_8x8(src, dst, steps);
steps.incr_y_8(dst);
steps.incr_x_8(src);
}
};
static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
Steps const steps, unsigned w, unsigned h)
{
for (unsigned i = h; i; i--) {
_rotate_8_lines(src, dst, steps, w);
steps.incr_y_8(src);
steps.incr_x_8(dst);
}
}
struct B2f;
struct B2f_flip;
};
struct Blit::Neon::B2f
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;
for (unsigned lines = h*8; lines; lines--) {
_copy_line(s, d, 2*w);
s += 2*line_w;
d += 2*line_w;
}
}
void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
{
Steps const steps { -2*int(src_w), 8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
Dst_ptr4 dst_ptr4 (dst, steps.dst_y);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t *d = (uint32x4_t *)dst;
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
for (unsigned i = h*8; i; i--) {
s -= 2*line_w;
_reverse_line(s, d, 2*w);
d += 2*line_w;
}
}
void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
{
Steps const steps { 2*int(src_w), -8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
struct Blit::Neon::B2f_flip
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;
for (unsigned lines = h*8; lines; lines--) {
_reverse_line(s, d, 2*w);
s += 2*line_w;
d += 2*line_w;
}
}
void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
{
Steps const steps { 2*int(src_w), 8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
Dst_ptr4 dst_ptr4 (dst, steps.dst_y);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
uint32x4_t *d = (uint32x4_t *)dst;
for (unsigned lines = h*8; lines; lines--) {
s -= 2*line_w;
_copy_line(s, d, 2*w);
d += 2*line_w;
}
}
void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
{
Steps const steps { -2*int(src_w), -8*int(dst_w) };
Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */

View File

@ -0,0 +1,131 @@
/*
* \brief Fallback 2D memory copy
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__BLIT__INTERNAL__SLOW_H_
#define _INCLUDE__BLIT__INTERNAL__SLOW_H_
#include <blit/types.h>
namespace Blit {
struct Slow;
static inline void _write_line(uint32_t const *src, uint32_t *dst,
unsigned len, int dst_step)
{
for (; len--; dst += dst_step)
*dst = *src++;
}
static inline void _write_lines(uint32_t const *src, unsigned src_w,
uint32_t *dst,
unsigned w, unsigned h, int dx, int dy)
{
for (unsigned lines = h*8; lines; lines--) {
_write_line(src, dst, 8*w, dx);
src += 8*src_w;
dst += dy;
}
};
}
struct Blit::Slow
{
struct B2f;
struct B2f_flip;
};
struct Blit::Slow::B2f
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Slow::B2f::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
_write_lines(src, line_w, dst, w, h, 1, 8*line_w);
}
void Blit::Slow::B2f::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
_write_lines(src, src_w, dst + 8*h - 1, w, h, 8*dst_w, -1);
}
void Blit::Slow::B2f::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
dst += 8*w - 1 + (8*h - 1)*8*line_w;
_write_lines(src, line_w, dst, w, h, -1, -8*line_w);
}
void Blit::Slow::B2f::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst += 8*dst_w*(8*w - 1);
_write_lines(src, src_w, dst, w, h, -8*dst_w, 1);
}
struct Blit::Slow::B2f_flip
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Slow::B2f_flip::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
_write_lines(src, line_w, dst + 8*w - 1, w, h, -1, 8*line_w);
}
void Blit::Slow::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
_write_lines(src, src_w, dst, w, h, 8*dst_w, 1);
}
void Blit::Slow::B2f_flip::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
dst += (8*h - 1)*8*line_w;
_write_lines(src, line_w, dst, w, h, 1, -8*line_w);
}
void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
dst += 8*h - 1 + 8*dst_w*(8*w - 1);
_write_lines(src, src_w, dst, w, h, -8*dst_w, -1);
}
#endif /* _INCLUDE__BLIT__INTERNAL__SLOW_H_ */

View File

@ -0,0 +1,263 @@
/*
* \brief 2D memory copy using SSE4
* \author Norman Feske
* \date 2025-01-21
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__BLIT__INTERNAL__SSE4_H_
#define _INCLUDE__BLIT__INTERNAL__SSE4_H_
#include <blit/types.h>
/* compiler intrinsics */
#ifndef _MM_MALLOC_H_INCLUDED /* discharge dependency from stdlib.h */
#define _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED_PREVENTED
#endif
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#include <immintrin.h>
#pragma GCC diagnostic pop
#ifdef _MM_MALLOC_H_INCLUDED_PREVENTED
#undef _MM_MALLOC_H_INCLUDED
#undef _MM_MALLOC_H_INCLUDED_PREVENTED
#endif
namespace Blit { struct Sse4; };
struct Blit::Sse4
{
union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; };
struct Src_ptr4
{
__m128i const *p0, *p1, *p2, *p3;
inline Src_ptr4(__m128i const *p, int const step)
:
p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
{ }
void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
void prefetch() const
{
_mm_prefetch(p0, _MM_HINT_T0); _mm_prefetch(p1, _MM_HINT_T0);
_mm_prefetch(p2, _MM_HINT_T0); _mm_prefetch(p3, _MM_HINT_T0);
}
void load(Tile_4x4 &tile) const
{
tile.pi[0] = _mm_load_si128(p0); tile.pi[1] = _mm_load_si128(p1);
tile.pi[2] = _mm_load_si128(p2); tile.pi[3] = _mm_load_si128(p3);
}
};
struct Dst_ptr4
{
__m128i *p0, *p1, *p2, *p3;
Dst_ptr4(__m128i *p, int const step_4)
:
p0(p), p1(p0 + step_4), p2(p1 + step_4), p3(p2 + step_4)
{ }
void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
void store(Tile_4x4 const &tile) const
{
_mm_stream_si128(p0, tile.pi[0]); _mm_stream_si128(p1, tile.pi[1]);
_mm_stream_si128(p2, tile.pi[2]); _mm_stream_si128(p3, tile.pi[3]);
}
};
struct Steps { int src_y_4, dst_y_4; };
static inline void _reverse_line(__m128i const *s, __m128i *d, unsigned len_8)
{
static constexpr int reversed = (0 << 6) | (1 << 4) | (2 << 2) | 3;
d += 2*len_8; /* move 'dst' from end towards begin */
while (len_8--) {
__m128i const v0 = _mm_load_si128(s++);
__m128i const v1 = _mm_load_si128(s++);
_mm_stream_si128(--d, _mm_shuffle_epi32(v0, reversed));
_mm_stream_si128(--d, _mm_shuffle_epi32(v1, reversed));
}
};
static inline void _copy_line(__m128i const *s, __m128i *d, unsigned len_8)
{
while (len_8--) {
__m128i const v0 = _mm_load_si128(s++);
__m128i const v1 = _mm_load_si128(s++);
_mm_stream_si128(d++, v0);
_mm_stream_si128(d++, v1);
}
};
static inline void _rotate_4_lines(Src_ptr4 src, Dst_ptr4 dst,
unsigned len_4, auto const dst_4_step)
{
Tile_4x4 t;
while (len_4--) {
src.load(t);
src.incr_4(1);
src.prefetch();
_MM_TRANSPOSE4_PS(t.ps[0], t.ps[1], t.ps[2], t.ps[3]);
dst.store(t);
dst.incr_4(dst_4_step);
};
};
static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
Steps const steps, unsigned w, unsigned h)
{
for (unsigned i = 2*h; i; i--) {
_rotate_4_lines(src, dst, 2*w, 4*steps.dst_y_4);
src.incr_4(4*steps.src_y_4);
dst.incr_4(1);
}
}
struct B2f;
struct B2f_flip;
};
struct Blit::Sse4::B2f
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Sse4::B2f::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
__m128i const *s = (__m128i const *)src;
__m128i *d = (__m128i *)dst;
for (unsigned lines = h*8; lines; lines--) {
_copy_line(s, d, w);
s += 2*line_w;
d += 2*line_w;
}
}
void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
Steps const steps { -2*int(src_w), 2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src + 2*src_w*(8*h - 1), steps.src_y_4);
Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
void Blit::Sse4::B2f::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
__m128i *d = (__m128i *)dst;
__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
for (unsigned i = h*8; i; i--) {
s -= 2*line_w;
_reverse_line(s, d, w);
d += 2*line_w;
}
}
void Blit::Sse4::B2f::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
Steps const steps { 2*int(src_w), -2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(8*w - 1), steps.dst_y_4);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
struct Blit::Sse4::B2f_flip
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
void Blit::Sse4::B2f_flip::r0(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
__m128i const *s = (__m128i const *)src;
__m128i *d = (__m128i *)dst;
for (unsigned lines = h*8; lines; lines--) {
_reverse_line(s, d, w);
s += 2*line_w;
d += 2*line_w;
}
}
void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
Steps const steps { 2*int(src_w), 2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
void Blit::Sse4::B2f_flip::r180(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
__m128i *d = (__m128i *)dst;
for (unsigned lines = h*8; lines; lines--) {
s -= 2*line_w;
_copy_line(s, d, w);
d += 2*line_w;
}
}
void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h)
{
Steps const steps { -2*int(src_w), -2*int(dst_w) };
Src_ptr4 src_ptr4 ((__m128i *)src + 2*int(src_w)*(h*8 - 1), steps.src_y_4);
Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(w*8 - 1), steps.dst_y_4);
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */

View File

@ -0,0 +1,161 @@
/*
* \brief Types and utilities used for 2D memory copy
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__BLIT__TYPES_H_
#define _INCLUDE__BLIT__TYPES_H_
/* Genode includes */
#include <os/texture.h>
#include <os/surface.h>
#include <os/pixel_rgb888.h>
namespace Blit {
using namespace Genode;
using Rect = Surface_base::Rect;
using Area = Surface_base::Area;
using Point = Surface_base::Point;
enum class Rotate { R0, R90, R180, R270 };
struct Flip { bool enabled; };
static bool swap_w_h(Rotate r) { return r == Rotate::R90 || r == Rotate::R270; }
static Area transformed(Area a, Rotate rotate)
{
return swap_w_h(rotate) ? Area { a.h, a.w } : a;
}
static Point transformed(Point p, Area area, Rotate rotate, Flip flip)
{
int const w = area.w, h = area.h;
switch (rotate) {
case Rotate::R0: break;
case Rotate::R90: p = { .x = h - p.y - 1, .y = p.x }; break;
case Rotate::R180: p = { .x = w - p.x - 1, .y = h - p.y - 1 }; break;
case Rotate::R270: p = { .x = p.y, .y = w - p.x - 1 }; break;
}
if (flip.enabled)
p = { int(transformed(area, rotate).w) - p.x - 1, p.y };
return p;
}
static Rect transformed(Rect r, Area area, Rotate rotate, Flip flip)
{
auto rect_from_points = [&] (Point p1, Point p2)
{
return Rect::compound(Point { min(p1.x, p2.x), min(p1.y, p2.y) },
Point { max(p1.x, p2.x), max(p1.y, p2.y) });
};
return rect_from_points(transformed(r.p1(), area, rotate, flip),
transformed(r.p2(), area, rotate, flip));
}
static Rect snapped_to_8x8_grid(Rect r)
{
return Rect::compound(Point { .x = r.x1() & ~0x7,
.y = r.y1() & ~0x7 },
Point { .x = ((r.x2() + 8) & ~0x7) - 1,
.y = ((r.y2() + 8) & ~0x7) - 1 });
}
template <typename B2F>
static inline void _b2f(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w,
unsigned w, unsigned h, Rotate rotate)
{
switch (rotate) {
case Rotate::R0: B2F::r0 (dst, dst_w, src, w, h); break;
case Rotate::R90: B2F::r90 (dst, dst_w, src, src_w, w, h); break;
case Rotate::R180: B2F::r180(dst, dst_w, src, w, h); break;
case Rotate::R270: B2F::r270(dst, dst_w, src, src_w, w, h); break;
}
}
template <typename OP>
static inline void _b2f(Surface<Pixel_rgb888> &surface,
Texture<Pixel_rgb888> const &texture,
Rect rect, Rotate rotate, Flip flip)
{
/* surface size must be divisible by 8 */
if (!aligned(surface.size().w, 2) || !aligned(surface.size().h, 2)) {
warning("surface size ", surface.size(), " not divisible by 8");
return;
}
/* check compatibility of surface with texture */
if (transformed(surface.size(), rotate) != texture.size()) {
warning("surface ", surface.size(), " mismatches texture ", texture.size());
return;
}
/* restrict rect to texture size */
rect = Rect::intersect(rect, Rect { { }, texture.size() });
/* compute base addresses of affected pixel window */
Rect const src_rect = snapped_to_8x8_grid(rect);
Rect const dst_rect = transformed(src_rect, texture.size(), rotate, flip);
uint32_t const * const src = (uint32_t const *)texture.pixel()
+ src_rect.y1()*texture.size().w
+ src_rect.x1();
uint32_t * const dst = (uint32_t *)surface.addr()
+ dst_rect.y1()*surface.size().w
+ dst_rect.x1();
/* coordinates converted to 8x8 units */
unsigned const src_w = texture.size().w >> 3,
dst_w = surface.size().w >> 3,
w = src_rect.area.w >> 3,
h = src_rect.area.h >> 3;
if (w && h) {
if (flip.enabled)
_b2f<typename OP::B2f_flip>(dst, dst_w, src, src_w, w, h, rotate);
else
_b2f<typename OP::B2f> (dst, dst_w, src, src_w, w, h, rotate);
}
surface.flush_pixels(dst_rect);
}
}
/****************
** Legacy API **
****************/
/**
* Blit memory from source buffer to destination buffer
*
* \param src address of source buffer
* \param src_w line length of source buffer in bytes
* \param dst address of destination buffer
* \param dst_w line length of destination buffer in bytes
* \param w number of bytes per line to copy
* \param h number of lines to copy
*
* This function works at a granularity of 16bit.
* If the source and destination overlap, the result
* of the copy operation is not defined.
*/
extern "C" void blit(void const *src, unsigned src_w,
void *dst, unsigned dst_w, int w, int h);
#endif /* _INCLUDE__BLIT__TYPES_H_ */

View File

@ -0,0 +1,25 @@
/*
* \brief Blit API
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__SPEC__ARM_64__BLIT_H_
#define _INCLUDE__SPEC__ARM_64__BLIT_H_
#include <blit/types.h>
#include <blit/internal/neon.h>
namespace Blit {
static inline void back2front(auto &&... args) { _b2f<Neon>(args...); }
}
#endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */

View File

@ -0,0 +1,25 @@
/*
* \brief Blit API
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__SPEC__X86_64__BLIT_H_
#define _INCLUDE__SPEC__X86_64__BLIT_H_
#include <blit/types.h>
#include <blit/internal/sse4.h>
namespace Blit {
static inline void back2front(auto &&... args) { _b2f<Sse4>(args...); }
}
#endif /* _INCLUDE__SPEC__X86_64__BLIT_H_ */

View File

@ -1,4 +1,6 @@
MIRROR_FROM_REP_DIR := include/blit \
include/spec/x86_64/blit \
include/spec/arm_64/blit \
src/lib/blit \
lib/mk/blit.mk \
lib/mk/spec/arm_64/blit.mk \

View File

@ -0,0 +1 @@
Scenario for testing 2D blitting operations

View File

@ -0,0 +1 @@
_/src/test-blit

View File

@ -0,0 +1,13 @@
<runtime ram="32M" caps="1000" binary="test-blit">
<fail after_seconds="20"/>
<succeed>[init] --- blit test finished ---</succeed>
<content>
<rom label="ld.lib.so"/>
<rom label="test-blit"/>
</content>
<config/>
</runtime>

View File

@ -0,0 +1,2 @@
SRC_DIR = src/test/blit
include $(GENODE_DIR)/repos/base/recipes/src/content.inc

View File

@ -0,0 +1 @@
2024-12-10 67b1a1ad0dddcdc22dd6e266309f8221ff30173f

View File

@ -0,0 +1,3 @@
base
blit
os

View File

@ -0,0 +1,291 @@
/*
* \brief Blitting test
* \author Norman Feske
* \date 2025-01-16
*/
/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#include <base/component.h>
#include <base/log.h>
#include <blit/blit.h>
#include <blit/internal/slow.h>
using namespace Blit;
/*******************************
** Low-level SIMD operations **
*******************************/
template <unsigned W, unsigned H>
struct Image
{
static constexpr unsigned w = W, h = H;
uint32_t pixels[W*H];
void print(Output &out) const
{
using Genode::print;
for (unsigned y = 0; y < H; y++) {
for (unsigned x = 0; x < min(25u, W); x++) {
uint32_t v = pixels[y*W+x];
if (v)
print(out, " ", Char('A' + (v&63)), Char(char('A' + ((v>>16)&63))));
else
print(out, " .");
}
if (y < H-1) print(out, "\n");
}
}
bool operator != (Image const &other)
{
for (unsigned i = 0; i < W*H; i++)
if (other.pixels[i] != pixels[i])
return true;
return false;
}
static Image pattern()
{
Image image { };
for (unsigned y = 0; y < H; y++)
for (unsigned x = 0; x < W; x++)
image.pixels[y*W + x] = (y << 16) | x;
return image;
}
};
#define TEST_LANDSCAPE(SIMD, FN, DST_W, DST_H, W, H) \
{ \
Image<DST_W, DST_H> dst { }, ref { }; \
Slow:: FN(ref.pixels, ref.w/8, src.pixels, W, H); \
log(#FN, " ref:\n", ref); \
SIMD:: FN(dst.pixels, dst.w/8, src.pixels, W, H); \
log(#FN, " got:\n", dst); \
if (dst != ref) { \
error("", #FN, " failed"); \
throw 1; \
} \
}
#define TEST_PORTRAIT(SIMD, FN, DST_W, DST_H, W, H) \
{ \
Image<DST_W, DST_H> dst { }, ref { }; \
Slow:: FN(ref.pixels, ref.w/8, src.pixels, src.w/8, W, H); \
log(#FN, " ref:\n", ref); \
SIMD:: FN(dst.pixels, dst.w/8, src.pixels, src.w/8, W, H); \
log(#FN, " got:\n", dst); \
if (dst != ref) { \
error("", #FN, " failed"); \
throw 1; \
} \
}
template <typename SIMD>
static void test_simd_b2f()
{
static Image<48,32> const src = Image<48,32>::pattern();
log("source image:\n", src);
TEST_LANDSCAPE ( SIMD, B2f ::r0, 48, 32, 2, 4 );
TEST_LANDSCAPE ( SIMD, B2f_flip ::r0, 48, 32, 2, 4 );
TEST_PORTRAIT ( SIMD, B2f ::r90, 32, 48, 4, 2 );
TEST_PORTRAIT ( SIMD, B2f_flip ::r90, 32, 48, 4, 2 );
TEST_LANDSCAPE ( SIMD, B2f ::r180, 48, 32, 2, 4 );
TEST_LANDSCAPE ( SIMD, B2f_flip::r180, 48, 32, 2, 4 );
TEST_PORTRAIT ( SIMD, B2f ::r270, 32, 48, 4, 2 );
TEST_PORTRAIT ( SIMD, B2f_flip::r270, 32, 48, 4, 2 );
}
/****************************************
** Back-to-front argument dispatching **
****************************************/
struct Recorded
{
struct Args
{
uint32_t *dst;
unsigned dst_w;
uint32_t const *src;
unsigned src_w;
unsigned w, h;
bool operator != (Args const &other) const
{
return dst != other.dst
|| dst_w != other.dst_w
|| src != other.src
|| src_w != other.src_w
|| w != other.w
|| h != other.h;
}
void print(Output &out) const
{
bool const valid = (*this != Args { });
if (!valid) {
Genode::print(out, "invalid");
return;
}
/* print src and dst pointer values in units of uint32_t words */
Genode::print(out, "dst=", Hex(addr_t(dst)/4), " dst_w=", dst_w,
" src=", Hex(addr_t(src)/4), " src_w=", src_w, " w=", w, " h=", h);
}
};
static Args recorded;
static void _record(uint32_t *dst, unsigned line_w,
uint32_t const *src, unsigned w, unsigned h)
{
recorded = { dst, line_w, src, line_w, w, h };
}
static void _record(uint32_t *dst, unsigned dst_w,
uint32_t const *src, unsigned src_w, unsigned w, unsigned h)
{
recorded = { dst, dst_w, src, src_w, w, h };
}
struct B2f
{
static inline void r0 (auto &&... args) { _record(args...); }
static inline void r90 (auto &&... args) { _record(args...); }
static inline void r180 (auto &&... args) { _record(args...); }
static inline void r270 (auto &&... args) { _record(args...); }
};
struct B2f_flip
{
static inline void r0 (auto &&... args) { _record(args...); }
static inline void r90 (auto &&... args) { _record(args...); }
static inline void r180(auto &&... args) { _record(args...); }
static inline void r270(auto &&... args) { _record(args...); }
};
};
Recorded::Args Recorded::recorded { };
namespace Blit {
static inline const char *name(Rotate r)
{
switch (r) {
case Rotate::R0: return "R0";
case Rotate::R90: return "R90";
case Rotate::R180: return "R180";
case Rotate::R270: return "R270";
}
return "invalid";
}
}
static void test_b2f_dispatch()
{
Texture<Pixel_rgb888> texture_landscape { nullptr, nullptr, { 640, 480 } };
Texture<Pixel_rgb888> texture_portrait { nullptr, nullptr, { 480, 640 } };
Surface<Pixel_rgb888> surface { nullptr, { 640, 480 } };
struct Expected : Recorded::Args { };
auto expected = [&] (addr_t dst, unsigned dst_w, addr_t src, unsigned src_w,
unsigned w, unsigned h)
{
return Expected { (uint32_t *)(4*dst), dst_w,
(uint32_t *)(4*src), src_w, w, h };
};
using Rect = Blit::Rect;
auto test = [&] (Texture<Pixel_rgb888> const &texture,
Rect rect, Rotate rotate, Flip flip,
Expected const &expected)
{
Recorded::recorded = { };
_b2f<Recorded>(surface, texture, rect, rotate, flip);
log("b2f: ", rect, " ", name(rotate), flip.enabled ? " flip" : "",
" -> ", Recorded::recorded);
if (Recorded::recorded != expected) {
error("test_b2f_dispatch failed, expected: ", expected);
throw 1;
}
};
log("offset calculation of destination window");
{
unsigned const x = 32, y = 16, w = 64, h = 48;
addr_t const src_landscape_ptr = y*640 + x,
src_portrait_ptr = y*480 + x;
Rect const rect { { x, y }, { w, h } };
test(texture_landscape, rect, Rotate::R0, Flip { },
expected(y*640 + x, 80, src_landscape_ptr, 80, 8, 6));
test(texture_landscape, rect, Rotate::R0, Flip { true },
expected(y*640 + 640 - w - x, 80, src_landscape_ptr, 80, 8, 6));
test(texture_portrait, rect, Rotate::R90, Flip { },
expected(x*640 + 640 - h - y, 80, src_portrait_ptr, 60, 8, 6));
test(texture_portrait, rect, Rotate::R90, Flip { true },
expected(x*640 + y, 80, src_portrait_ptr, 60, 8, 6));
test(texture_landscape, rect, Rotate::R180, Flip { },
expected((480 - y - h)*640 + 640 - x - w, 80, src_landscape_ptr, 80, 8, 6));
test(texture_landscape, rect, Rotate::R180, Flip { true },
expected((480 - y - h)*640 + x, 80, src_landscape_ptr, 80, 8, 6));
test(texture_portrait, rect, Rotate::R270, Flip { },
expected((480 - x - w)*640 + y, 80, src_portrait_ptr, 60, 8, 6));
test(texture_portrait, rect, Rotate::R270, Flip { true },
expected((480 - x - w)*640 + 640 - y - h, 80, src_portrait_ptr, 60, 8, 6));
}
log("check for compatibility of surface and texture");
test(texture_portrait, { { }, { 16, 16 } }, Rotate::R0, Flip { },
expected(0, 0, 0, 0, 0, 0));
log("clamp rect to texture size");
test(texture_landscape, { { -99, -99 }, { 999, 999 } }, Rotate::R0, Flip { },
expected(0, 80, 0, 80, 80, 60));
log("ignore out-of-bounds rect");
test(texture_landscape, { { 1000, 0 }, { 16, 16 } }, Rotate::R0, Flip { },
expected(0, 0, 0, 0, 0, 0));
/* snap to grid */
log("snap rect argument to 8x8 grid");
test(texture_landscape, { { 31, 63 }, { 2, 2 } }, Rotate::R0, Flip { },
expected(56*640 + 24, 80, 56*640 + 24, 80, 2, 2));
}
void Component::construct(Genode::Env &)
{
#ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_
log("-- ARM Neon --");
test_simd_b2f<Neon>();
#endif
#ifdef _INCLUDE__BLIT__INTERNAL__SSE4_H_
log("-- SSE4 --");
test_simd_b2f<Sse4>();
#endif
test_b2f_dispatch();
log("--- blit test finished ---");
}

View File

@ -0,0 +1,3 @@
TARGET = test-blit
SRC_CC = main.cc
LIBS = base