/*
 * \brief  2D memory copy using ARM NEON
 * \author Norman Feske
 * \date   2025-01-16
 */

/*
 * Copyright (C) 2025 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_
#define _INCLUDE__BLIT__INTERNAL__NEON_H_

#include <blit/types.h>

/* compiler intrinsics */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnarrowing"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wfloat-conversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <arm_neon.h>
#pragma GCC diagnostic pop


namespace Blit { struct Neon; }


struct Blit::Neon
{
	/**
	 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
	 */
	template <typename T> union Printable
	{
		Genode::uint64_t u64; T vec;
		Printable(T vec) : vec(vec) { }
		void print(Output &out) const { Genode::print(out, Hex(u64)); }
	};
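	/*
	 * Usage sketch (illustrative, not part of the original file): wrap a
	 * NEON vector to print its lower 64 bits via Genode's Output framework
	 *
	 *   uint32x4_t const v = vdupq_n_u32(0x11223344);
	 *   Genode::log("v=", Printable<uint32x4_t>(v));
	 */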
	static inline uint32x4_t _reversed(uint32x4_t const v)
	{
		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
	}
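	/*
	 * Note (added): for input lanes { a, b, c, d }, swapping the two 64-bit
	 * halves and applying 'vrev64q_u32' yields { d, c, b, a }, i.e. the
	 * four pixels in reverse order.
	 */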
	static inline void _reverse_line(uint32x4_t const *src, uint32x4_t *dst, unsigned len)
	{
		src += len; /* move 'src' from end of line towards begin */
		while (len--)
			*dst++ = _reversed(*--src);
	};

	static inline void _copy_line(uint32x4_t const *s, uint32x4_t *d, unsigned len)
	{
		while (len--)
			*d++ = *s++;
	};

	struct Src_ptr4
	{
		uint32x4_t const *p0, *p1, *p2, *p3;

		inline Src_ptr4(uint32x4_t const *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr_4(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void prefetch() const
		{
			__builtin_prefetch(p0); __builtin_prefetch(p1);
			__builtin_prefetch(p2); __builtin_prefetch(p3);
		}

		void load(uint32x4x4_t &tile) const { tile = { *p0, *p1, *p2, *p3 }; }
	};

	struct Dst_ptr4
	{
		uint32_t *p0, *p1, *p2, *p3;

		Dst_ptr4(uint32_t *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void store(uint32x4x4_t const &tile)
		{
			vst4q_lane_u32(p0, tile, 0);
			vst4q_lane_u32(p1, tile, 1);
			vst4q_lane_u32(p2, tile, 2);
			vst4q_lane_u32(p3, tile, 3);
		}
	};
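	/*
	 * Note (added): 'Src_ptr4' loads one 'uint32x4_t' from each of four
	 * source lines, and 'Dst_ptr4::store' writes lane i of all four vectors
	 * as four consecutive pixels at 'p_i'. Load and store together thereby
	 * transpose a 4x4-pixel tile, which is the building block of the
	 * rotation routines below.
	 */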
	struct Steps
	{
		int const src_y, dst_y;

		void incr_x_4(Src_ptr4 &p) const { p.incr_4(1); };
		void incr_x_8(Src_ptr4 &p) const { p.incr_4(2); };
		void incr_y_4(Src_ptr4 &p) const { p.incr_4(src_y << 2); };
		void incr_y_8(Src_ptr4 &p) const { p.incr_4(src_y << 3); };

		void incr_x_4(Dst_ptr4 &p) const { p.incr(4); };
		void incr_x_8(Dst_ptr4 &p) const { p.incr(8); };
		void incr_y_4(Dst_ptr4 &p) const { p.incr(dst_y << 2); };
		void incr_y_8(Dst_ptr4 &p) const { p.incr(dst_y << 3); };
	};
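	/*
	 * Note (added): source steps are counted in 'uint32x4_t' units
	 * (4 pixels) whereas destination steps are counted in single pixels,
	 * which explains the different scale factors above. 'src_y' and 'dst_y'
	 * can be negative, depending on the rotation direction.
	 */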
	__attribute__((optimize("-O3")))
	static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst, Steps const steps)
	{
		uint32x4x4_t tile;
		src.load(tile);
		steps.incr_y_4(src);
		src.prefetch();
		dst.store(tile);
		steps.incr_x_4(dst);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
	{
		for (unsigned i = 0; i < 2; i++)
			_load_prefetch_store(src, dst, steps);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x4_last(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
	{
		_load_prefetch_store(src, dst, steps);
		uint32x4x4_t tile;
		src.load(tile);
		dst.store(tile);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x8(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
	{
		_rotate_8x4(src, dst, steps);
		steps.incr_y_4(dst);
		steps.incr_x_4(src);
		_rotate_8x4_last(src, dst, steps);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8_lines(Src_ptr4 src, Dst_ptr4 dst,
	                                   Steps const steps, unsigned n)
	{
		for (; n; n--) {
			_rotate_8x8(src, dst, steps);
			steps.incr_y_8(dst);
			steps.incr_x_8(src);
		}
	};

	static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
	                           Steps const steps, unsigned w, unsigned h)
	{
		for (unsigned i = h; i; i--) {
			_rotate_8_lines(src, dst, steps, w);
			steps.incr_y_8(src);
			steps.incr_x_8(dst);
		}
	}
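	/*
	 * Note (added): '_rotate' walks the image in blocks of 8x8 pixels,
	 * handling each block as four transposed 4x4 tiles ('w' and 'h' are
	 * given in units of 8 pixels here). This is why the SIMD path expects
	 * texture dimensions that are multiples of 8.
	 */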
	struct B2f;
	struct B2f_flip;
	struct Blend;
};


struct Blit::Neon::B2f
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
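/*
 * Usage sketch (illustrative, not from the original file): 'B2f' copies a
 * texture back to front, optionally rotated by 90, 180, or 270 degrees.
 * Assuming line widths and dimensions are given in pixels and are multiples
 * of 8, an unrotated copy of a 1024x768 area between buffers with a line
 * width of 1024 pixels would look like
 *
 *   Blit::Neon::B2f::r0(dst, 1024, src, 1024, 768);
 */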
void Blit::Neon::B2f::r0(uint32_t *dst, unsigned line_w,
                         uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_copy_line(s, d, 2*w);
		s += 2*line_w;
		d += 2*line_w;
	}
}


void Blit::Neon::B2f::r90(uint32_t *dst, unsigned dst_w,
                          uint32_t const *src, unsigned src_w,
                          unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), 8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


void Blit::Neon::B2f::r180(uint32_t *dst, unsigned line_w,
                           uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t       *d = (uint32x4_t *)dst;
	uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;

	for (unsigned i = h*8; i; i--) {
		s -= 2*line_w;
		_reverse_line(s, d, 2*w);
		d += 2*line_w;
	}
}


void Blit::Neon::B2f::r270(uint32_t *dst, unsigned dst_w,
                           uint32_t const *src, unsigned src_w,
                           unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), -8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
	Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::B2f_flip
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
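/*
 * Note (added): the 'B2f_flip' variants combine the copy with a mirror
 * operation. For example, 'r0' reverses the pixels within each line
 * (horizontal mirror) whereas 'r180' reverses only the order of the lines.
 */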
void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned line_w,
                              uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_reverse_line(s, d, 2*w);
		s += 2*line_w;
		d += 2*line_w;
	}
}


void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
                               uint32_t const *src, unsigned src_w,
                               unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), 8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned line_w,
                                uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		s -= 2*line_w;
		_copy_line(s, d, 2*w);
		d += 2*line_w;
	}
}


void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
                                uint32_t const *src, unsigned src_w,
                                unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), -8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
	Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);

	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
	{
		if (__builtin_expect(alpha == 0, false))
			return bg;

		/*
		 * Compute r, g, b in the lower 3 16-bit lanes.
		 * The upper 5 lanes are unused.
		 */
		uint16x8_t const
			a   = vmovl_u8(vdup_n_u8(alpha)),
			s   = vmovl_u8(vcreate_u8(fg)),
			d   = vmovl_u8(vcreate_u8(bg)),
			ar  = vaddq_u16(vdupq_n_u16(1),   a),  /* for rounding up */
			nar = vsubq_u16(vdupq_n_u16(256), a),  /* 1.0 - alpha */
			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));

		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
	}
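	/*
	 * Note (added): per channel, '_mix' computes
	 *
	 *   result = (fg*(alpha + 1) + bg*(256 - alpha)) >> 8
	 *
	 * The intermediate sum never exceeds 255*257 = 65535 and therefore
	 * fits the 16-bit lanes.
	 */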
	__attribute__((optimize("-O3")))
	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
	{
		/* fetch 8 alpha values */
		uint16x8_t const a = vmovl_u8(*(uint8x8_t *)alpha);

		/* skip block if entirely transparent */
		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
			return;

		/* load 8 source and destination pixels */
		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
		uint8x8x4_t       d = vld4_u8((uint8_t const *)bg);

		/* extend r, g, b components from uint8_t to uint16_t */
		uint16x8x4_t const
			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };

		/* load 8 alpha values, prepare as factors for source and destination */
		uint16x8_t const
			sa = vaddq_u16(vdupq_n_u16(1),   a),
			da = vsubq_u16(vdupq_n_u16(256), a);  /* 1.0 - alpha */

		/* mix components, keeping only their upper 8 bits */
		for (unsigned i = 0; i < 3; i++)
			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
			                                 vmulq_u16(s_rgb.val[i], sa)), 8);
		/* write 8 pixels */
		vst4_u8((uint8_t *)bg, d);
	}
};


__attribute__((optimize("-O3")))
void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	int const prefetch_distance = 16; /* cache line / 32-bit pixel size */
	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
		__builtin_prefetch(dst   + prefetch_distance);
		__builtin_prefetch(pixel + prefetch_distance);
		__builtin_prefetch(alpha + prefetch_distance);
		_mix_8(dst, pixel, alpha);
	}

	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
		_mix_8(dst, pixel, alpha);

	for (; n--; dst++, pixel++, alpha++)
		*dst = _mix(*dst, *pixel, *alpha);
}
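/*
 * Usage sketch (illustrative, buffer names are placeholders): blend 'n'
 * foreground pixels with per-pixel alpha values over the existing content
 * of 'dst', e.g. for one span of 320 pixels
 *
 *   Blit::Neon::Blend::xrgb_a(dst, 320, fg_pixels, fg_alpha);
 */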
#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */