/*
 * \brief  2D memory copy using ARM NEON
 * \author Norman Feske
 * \date   2025-01-16
 */

/*
 * Copyright (C) 2025 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_
#define _INCLUDE__BLIT__INTERNAL__NEON_H_

#include <blit/types.h>

/* compiler intrinsics */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnarrowing"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wfloat-conversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <arm_neon.h>
#pragma GCC diagnostic pop

namespace Blit { struct Neon; }


struct Blit::Neon
{
	/**
	 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
	 */
	template <typename T>
	union Printable
	{
		Genode::uint64_t u64;
		T vec;

		Printable(T vec) : vec(vec) { }

		void print(Output &out) const { Genode::print(out, Hex(u64)); }
	};

	static inline uint32x4_t _reversed(uint32x4_t const v)
	{
		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
	}

	static inline void _reverse_line(uint32x4_t const *src, uint32x4_t *dst,
	                                 unsigned len)
	{
		src += len;   /* move 'src' from end of line towards begin */
		while (len--)
			*dst++ = _reversed(*--src);
	};

	static inline void _copy_line(uint32x4_t const *s, uint32x4_t *d,
	                              unsigned len)
	{
		while (len--)
			*d++ = *s++;
	};

	struct Src_ptr4
	{
		uint32x4_t const *p0, *p1, *p2, *p3;

		inline Src_ptr4(uint32x4_t const *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr_4(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void prefetch() const
		{
			__builtin_prefetch(p0); __builtin_prefetch(p1);
			__builtin_prefetch(p2); __builtin_prefetch(p3);
		}

		void load(uint32x4x4_t &tile) const
		{
			tile = { *p0, *p1, *p2, *p3 };
		}
	};

	struct Dst_ptr4
	{
		uint32_t *p0, *p1, *p2, *p3;

		Dst_ptr4(uint32_t *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void store(uint32x4x4_t const &tile)
		{
			vst4q_lane_u32(p0, tile, 0);
			vst4q_lane_u32(p1, tile, 1);
			vst4q_lane_u32(p2, tile, 2);
			vst4q_lane_u32(p3, tile, 3);
		}
	};

	struct Steps
	{
		int const src_y, dst_y;

		void incr_x_4(Src_ptr4 &p) const { p.incr_4(1); };
		void incr_x_8(Src_ptr4 &p) const { p.incr_4(2); };
		void incr_y_4(Src_ptr4 &p) const { p.incr_4(src_y << 2); };
		void incr_y_8(Src_ptr4 &p) const { p.incr_4(src_y << 3); };

		void incr_x_4(Dst_ptr4 &p) const { p.incr(4); };
		void incr_x_8(Dst_ptr4 &p) const { p.incr(8); };
		void incr_y_4(Dst_ptr4 &p) const { p.incr(dst_y << 2); };
		void incr_y_8(Dst_ptr4 &p) const { p.incr(dst_y << 3); };
	};

	__attribute__((optimize("-O3")))
	static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst,
	                                        Steps const steps)
	{
		uint32x4x4_t tile;
		src.load(tile);
		steps.incr_y_4(src);
		src.prefetch();
		dst.store(tile);
		steps.incr_x_4(dst);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
	{
		for (unsigned i = 0; i < 2; i++)
			_load_prefetch_store(src, dst, steps);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x4_last(Src_ptr4 src, Dst_ptr4 dst,
	                                    Steps const steps)
	{
		_load_prefetch_store(src, dst, steps);

		uint32x4x4_t tile;
		src.load(tile);
		dst.store(tile);
	}

	__attribute__((optimize("-O3")))
	static inline void _rotate_8x8(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
	{
		_rotate_8x4(src, dst, steps);
		steps.incr_y_4(dst);
		steps.incr_x_4(src);
		_rotate_8x4_last(src, dst, steps);
	}
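	/*
	 * The rotation works as follows: 'Src_ptr4::load' fetches a 4x4-pixel
	 * tile as four row vectors from four source lines. 'Dst_ptr4::store'
	 * scatters lane i of all four vectors to destination line i via
	 * 'vst4q_lane_u32', which amounts to a 4x4 transpose. The signs of
	 * 'Steps::src_y' and 'Steps::dst_y' select the traversal direction
	 * and thereby turn the transpose into the desired rotation. An 8x8
	 * tile is processed as four such 4x4 steps ('_rotate_8x4' followed by
	 * '_rotate_8x4_last'), interleaving prefetches with stores to hide
	 * memory latency.
	 */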
	__attribute__((optimize("-O3")))
	static inline void _rotate_8_lines(Src_ptr4 src, Dst_ptr4 dst,
	                                   Steps const steps, unsigned n)
	{
		for (; n; n--) {
			_rotate_8x8(src, dst, steps);
			steps.incr_y_8(dst);
			steps.incr_x_8(src);
		}
	};

	static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
	                           Steps const steps, unsigned w, unsigned h)
	{
		for (unsigned i = h; i; i--) {
			_rotate_8_lines(src, dst, steps, w);
			steps.incr_y_8(src);
			steps.incr_x_8(dst);
		}
	}

	struct B2f;
	struct B2f_flip;
	struct Blend;
};


struct Blit::Neon::B2f
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};


void Blit::Neon::B2f::r0(uint32_t *dst, unsigned line_w,
                         uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_copy_line(s, d, 2*w);
		s += 2*line_w;
		d += 2*line_w;
	}
}


void Blit::Neon::B2f::r90(uint32_t *dst, unsigned dst_w,
                          uint32_t const *src, unsigned src_w,
                          unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), 8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


void Blit::Neon::B2f::r180(uint32_t *dst, unsigned line_w,
                           uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t *d = (uint32x4_t *)dst;
	uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;

	for (unsigned i = h*8; i; i--) {
		s -= 2*line_w;
		_reverse_line(s, d, 2*w);
		d += 2*line_w;
	}
}


void Blit::Neon::B2f::r270(uint32_t *dst, unsigned dst_w,
                           uint32_t const *src, unsigned src_w,
                           unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), -8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
	Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::B2f_flip
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};


void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned line_w,
                              uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_reverse_line(s, d, 2*w);
		s += 2*line_w;
		d += 2*line_w;
	}
}


void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
                               uint32_t const *src, unsigned src_w,
                               unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), 8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
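/*
 * The 'B2f_flip' variants combine the rotation with a mirror operation:
 * 'r0' reverses the pixels within each line whereas 'r180' (below)
 * reverses the order of lines.
 *
 * Illustrative call (hypothetical buffers, assuming 16-byte alignment
 * suitable for the 'uint32x4_t' accesses):
 *
 *   uint32_t src[64*64], dst[64*64];
 *   Blit::Neon::B2f_flip::r90(dst, 64, src, 64, 64, 64);
 *
 * All width and height arguments must be multiples of 8 because the
 * routines shift them right by 3 to operate on 8x8-pixel blocks.
 */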
void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned line_w,
                                uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
	uint32x4_t       *d = (uint32x4_t *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		s -= 2*line_w;
		_copy_line(s, d, 2*w);
		d += 2*line_w;
	}
}


void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
                                uint32_t const *src, unsigned src_w,
                                unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), -8*int(dst_w) };

	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
	Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);

	/*
	 * Blend a single pixel: result = (fg*(alpha + 1) + bg*(256 - alpha)) >> 8,
	 * a fixed-point approximation of fg*alpha + bg*(1 - alpha) for alpha
	 * values between 0 and 255.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
	{
		if (__builtin_expect(alpha == 0, false))
			return bg;

		/*
		 * Compute r, g, b in the lower 3 16-bit lanes.
		 * The upper 5 lanes are unused.
		 */
		uint16x8_t const
			a   = vmovl_u8(vdup_n_u8(alpha)),
			s   = vmovl_u8(vcreate_u8(fg)),
			d   = vmovl_u8(vcreate_u8(bg)),
			ar  = vaddq_u16(vdupq_n_u16(1),   a),  /* for rounding up */
			nar = vsubq_u16(vdupq_n_u16(256), a),  /* 1.0 - alpha */
			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));

		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
	}

	__attribute__((optimize("-O3")))
	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
	{
		/* fetch 8 alpha values */
		uint16x8_t const a = vmovl_u8(*(uint8x8_t *)alpha);

		/* skip block if entirely transparent */
		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
			return;

		/* load 8 source and destination pixels */
		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
		uint8x8x4_t       d = vld4_u8((uint8_t const *)bg);

		/* extend r, g, b components from uint8_t to uint16_t */
		uint16x8x4_t const
			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };

		/* prepare the 8 alpha values as factors for source and destination */
		uint16x8_t const sa = vaddq_u16(vdupq_n_u16(1),   a),
		                 da = vsubq_u16(vdupq_n_u16(256), a);  /* 1.0 - alpha */

		/* mix components, keeping only their upper 8 bits */
		for (unsigned i = 0; i < 3; i++)
			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
			                                 vmulq_u16(s_rgb.val[i], sa)), 8);

		/* write 8 pixels */
		vst4_u8((uint8_t *)bg, d);
	}
};


__attribute__((optimize("-O3")))
void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	int const prefetch_distance = 16;  /* cache line / 32-bit pixel size */

	/* blocks of 8 pixels, prefetching the next cache lines */
	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
		__builtin_prefetch(dst   + prefetch_distance);
		__builtin_prefetch(pixel + prefetch_distance);
		__builtin_prefetch(alpha + prefetch_distance);
		_mix_8(dst, pixel, alpha);
	}

	/* remaining blocks of 8 pixels without prefetching */
	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
		_mix_8(dst, pixel, alpha);

	/* remaining individual pixels */
	for (; n--; dst++, pixel++, alpha++)
		*dst = _mix(*dst, *pixel, *alpha);
}

#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */