Norman Feske 3909f9b6e4 blit: lift 8x8 restriction from back2front
This patch allows the back2front operation to be used with textures
whose dimensions are not a multiple of 8x8 pixels, which makes the
utility compatible with screen resolutions like 1366x768. In such
cases, the implementation falls back to the non-SIMD variant, as
sketched below.

Issue #5428
Issue #5501
2025-04-10 14:26:47 +02:00
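
A minimal sketch of the dispatch described in the commit message, assuming
the hypothetical helpers 'copy_simd' and 'copy_scalar' (the actual Genode
code integrates the check differently):

#include <cstdint>

/* hypothetical scalar fallback without size restrictions */
static void copy_scalar(uint32_t *dst, uint32_t const *src, unsigned n)
{
	for (unsigned i = 0; i < n; i++)
		dst[i] = src[i];
}

/* hypothetical SIMD path, valid only for 8x8-multiple dimensions */
static void copy_simd(uint32_t *dst, uint32_t const *src, unsigned n)
{
	copy_scalar(dst, src, n);  /* stand-in for the SSE4 implementation below */
}

/* use the SIMD variant only if both dimensions are multiples of 8 */
static void back2front(uint32_t *dst, uint32_t const *src, unsigned w, unsigned h)
{
	if ((w % 8) == 0 && (h % 8) == 0)
		copy_simd(dst, src, w*h);
	else
		copy_scalar(dst, src, w*h);  /* e.g., 1366x768 (1366 % 8 != 0) */
}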


/*
 * \brief  2D memory copy using SSE4
 * \author Norman Feske
 * \date   2025-01-21
 */

/*
 * Copyright (C) 2025 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__BLIT__INTERNAL__SSE4_H_
#define _INCLUDE__BLIT__INTERNAL__SSE4_H_

#include <blit/types.h>

/* compiler intrinsics */
#ifndef _MM_MALLOC_H_INCLUDED /* discharge dependency from stdlib.h */
#define _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED_PREVENTED
#endif
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#include <immintrin.h>
#pragma GCC diagnostic pop
#ifdef _MM_MALLOC_H_INCLUDED_PREVENTED
#undef _MM_MALLOC_H_INCLUDED
#undef _MM_MALLOC_H_INCLUDED_PREVENTED
#endif

namespace Blit { struct Sse4; };

struct Blit::Sse4
{
	/**
	 * Padded hex output utility
	 */
	template <typename T>
	struct Phex : Hex { explicit Phex(T v) : Hex(v, OMIT_PREFIX, PAD) { } };

	/**
	 * Vector output utility
	 */
	template <typename T>
	union Vec_as
	{
		__m128i v;

		static constexpr unsigned N = 128/(8*sizeof(T));

		T u[N];

		Vec_as(__m128i v) : v(v) { }

		void print(Output &out) const
		{
			for (unsigned i = 0; i < N; i++)
				Genode::print(out, Phex(u[i]), i < (N-1) ? "." : "");
		}
	};
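
	/**
	 * 4x4-pixel tile, accessible as integer or float vectors
	 *
	 * The float view enables the use of '_MM_TRANSPOSE4_PS' for
	 * transposing the tile during rotation.
	 */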
	union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; };
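
	/**
	 * Read pointers into four source lines, spaced by 'step' 16-byte words
	 */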
	struct Src_ptr4
	{
		__m128i const *p0, *p1, *p2, *p3;

		inline Src_ptr4(__m128i const *p, int const step)
		:
			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
		{ }

		void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void prefetch() const
		{
			_mm_prefetch(p0, _MM_HINT_T0); _mm_prefetch(p1, _MM_HINT_T0);
			_mm_prefetch(p2, _MM_HINT_T0); _mm_prefetch(p3, _MM_HINT_T0);
		}

		void load(Tile_4x4 &tile) const
		{
			tile.pi[0] = _mm_load_si128(p0); tile.pi[1] = _mm_load_si128(p1);
			tile.pi[2] = _mm_load_si128(p2); tile.pi[3] = _mm_load_si128(p3);
		}
	};
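
	/**
	 * Write pointers into four destination lines, using non-temporal stores
	 */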
	struct Dst_ptr4
	{
		__m128i *p0, *p1, *p2, *p3;

		Dst_ptr4(__m128i *p, int const step_4)
		:
			p0(p), p1(p0 + step_4), p2(p1 + step_4), p3(p2 + step_4)
		{ }

		void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }

		void store(Tile_4x4 const &tile) const
		{
			_mm_stream_si128(p0, tile.pi[0]); _mm_stream_si128(p1, tile.pi[1]);
			_mm_stream_si128(p2, tile.pi[2]); _mm_stream_si128(p3, tile.pi[3]);
		}
	};
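
	/**
	 * Signed line strides of source and destination in 4-pixel (16-byte) units
	 */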
	struct Steps { int src_y_4, dst_y_4; };
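
	/**
	 * Copy a line of 'len_8' 8-pixel chunks while reversing the pixel order
	 */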
	static inline void _reverse_line(__m128i const *s, __m128i *d, unsigned len_8)
	{
		static constexpr int reversed = (0 << 6) | (1 << 4) | (2 << 2) | 3;

		d += 2*len_8;   /* move 'dst' from end towards begin */

		while (len_8--) {
			__m128i const v0 = _mm_load_si128(s++);
			__m128i const v1 = _mm_load_si128(s++);
			_mm_stream_si128(--d, _mm_shuffle_epi32(v0, reversed));
			_mm_stream_si128(--d, _mm_shuffle_epi32(v1, reversed));
		}
	}
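
	/**
	 * Copy a line of 'len_8' 8-pixel chunks using non-temporal stores
	 */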
	static inline void _copy_line(__m128i const *s, __m128i *d, unsigned len_8)
	{
		while (len_8--) {
			__m128i const v0 = _mm_load_si128(s++);
			__m128i const v1 = _mm_load_si128(s++);
			_mm_stream_si128(d++, v0);
			_mm_stream_si128(d++, v1);
		}
	}
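
	/**
	 * Transpose a strip of four source lines into a 4-pixel-wide
	 * destination column, one 4x4 tile at a time
	 */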
	static inline void _rotate_4_lines(Src_ptr4 src, Dst_ptr4 dst,
	                                   unsigned len_4, auto const dst_4_step)
	{
		Tile_4x4 t;
		while (len_4--) {
			src.load(t);
			src.incr_4(1);
			src.prefetch();
			_MM_TRANSPOSE4_PS(t.ps[0], t.ps[1], t.ps[2], t.ps[3]);
			dst.store(t);
			dst.incr_4(dst_4_step);
		}
	}
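
	/**
	 * Rotate an area of 'w'*'h' 8x8-pixel tiles, with the rotation
	 * direction encoded in the signs of 'steps'
	 */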
	static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
	                           Steps const steps, unsigned w, unsigned h)
	{
		for (unsigned i = 2*h; i; i--) {
			_rotate_4_lines(src, dst, 2*w, 4*steps.dst_y_4);
			src.incr_4(4*steps.src_y_4);
			dst.incr_4(1);
		}
	}

	struct B2f;
	struct B2f_flip;
	struct Blend;
};
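

/**
 * Back-to-front copy for the four rotation cases
 */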
struct Blit::Sse4::B2f
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};
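
/*
 * The 'line_w', 'dst_w', 'src_w', 'w', and 'h' arguments are given in
 * pixels and assumed to be multiples of 8 (hence the '>>= 3' conversions
 * to 8-pixel units). Other sizes are served by the non-SIMD fallback
 * mentioned in the commit message.
 */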
void Blit::Sse4::B2f::r0(uint32_t *dst, unsigned line_w,
                         uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_copy_line(s, d, w);
		s += 2*line_w;
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w,
                          uint32_t const *src, unsigned src_w,
                          unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), 2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src + 2*src_w*(8*h - 1), steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

void Blit::Sse4::B2f::r180(uint32_t *dst, unsigned line_w,
                           uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i       *d = (__m128i *)dst;
	__m128i const *s = (__m128i const *)src + 2*line_w*8*h;

	for (unsigned i = h*8; i; i--) {
		s -= 2*line_w;
		_reverse_line(s, d, w);
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f::r270(uint32_t *dst, unsigned dst_w,
                           uint32_t const *src, unsigned src_w,
                           unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), -2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(8*w - 1), steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
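

/**
 * Back-to-front copy with mirrored output for the four rotation cases
 */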
struct Blit::Sse4::B2f_flip
{
	static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
	static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
	static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};

void Blit::Sse4::B2f_flip::r0(uint32_t *dst, unsigned line_w,
                              uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		_reverse_line(s, d, w);
		s += 2*line_w;
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w,
                               uint32_t const *src, unsigned src_w,
                               unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { 2*int(src_w), 2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

void Blit::Sse4::B2f_flip::r180(uint32_t *dst, unsigned line_w,
                                uint32_t const *src, unsigned w, unsigned h)
{
	line_w >>= 3, w >>= 3, h >>= 3;

	__m128i const *s = (__m128i const *)src + 2*line_w*8*h;
	__m128i       *d = (__m128i *)dst;

	for (unsigned lines = h*8; lines; lines--) {
		s -= 2*line_w;
		_copy_line(s, d, w);
		d += 2*line_w;
	}
}

void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
                                uint32_t const *src, unsigned src_w,
                                unsigned w, unsigned h)
{
	dst_w >>= 3, src_w >>= 3, w >>= 3, h >>= 3;

	Steps const steps { -2*int(src_w), -2*int(dst_w) };

	Src_ptr4 src_ptr4 ((__m128i *)src + 2*int(src_w)*(h*8 - 1), steps.src_y_4);
	Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(w*8 - 1), steps.dst_y_4);

	_rotate(src_ptr4, dst_ptr4, steps, w, h);
}
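

/**
 * Alpha blending of XRGB pixels
 */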
struct Blit::Sse4::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
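
	/*
	 * Scale the r, g, and b channels of 'xrgb' by 'alpha' (0..256)
	 *
	 * The g channel and the r/b channel pair are multiplied separately
	 * so that each 16-bit product stays clear of its neighboring
	 * channels before being shifted back into place.
	 */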
	__attribute__((optimize("-O3")))
	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
	{
		return (alpha * ((xrgb & 0xff00) >> 8) & 0xff00)
		     | (((alpha * (xrgb & 0xff00ff)) >> 8) & 0xff00ff);
	}
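
	/*
	 * Mix 'bg' and 'fg' according to 'alpha', weighting the background
	 * with (256 - alpha) and the foreground with (alpha + 1) so that
	 * alpha values 0 and 255 yield (nearly) pure background and
	 * foreground without a division by 255
	 */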
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
	{
		return (__builtin_expect(alpha == 0, false))
		     ? bg : _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
	}

	struct Mix_masks
	{
		/* masks for distributing alpha values to 16-bit r, g, b lanes */
		__m128i const a01 = _mm_set_epi32(0x03020302, 0x03020302, 0x01000100, 0x01000100);
		__m128i const a23 = _mm_set_epi32(0x07060706, 0x07060706, 0x05040504, 0x05040504);
	};

	__attribute__((optimize("-O3")))
	static inline void _mix_4(uint32_t *, uint32_t const *, uint8_t const *, Mix_masks const);
};
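
/*
 * SIMD variant of '_mix', blending four pixels at once: the 8-bit channels
 * are zero-extended to 16 bits, weighted by the per-pixel alpha values, and
 * packed back with unsigned saturation.
 */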
__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::_mix_4(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha, Mix_masks const masks)
{
	uint32_t const a_u8_x4 = *(uint32_t const *)alpha;

	if (__builtin_expect(a_u8_x4 == 0, false))
		return;

	auto upper_half = [&] (__m128i const v) { return _mm_shuffle_epi32(v, 2 + (3<<2)); };

	__m128i const

		/* load four foreground pixel, background pixel, and alpha values */
		fg_u8_4x4 = _mm_loadu_si128((__m128i const *)fg),
		bg_u8_4x4 = _mm_loadu_si128((__m128i const *)bg),

		/* extract first and second pair of pixel values */
		fg01_u16_4x2 = _mm_cvtepu8_epi16(fg_u8_4x4),
		fg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(fg_u8_4x4)),
		bg01_u16_4x2 = _mm_cvtepu8_epi16(bg_u8_4x4),
		bg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(bg_u8_4x4)),

		/* prepare 4 destination and source alpha values */
		a_u16_x4  = _mm_cvtepu8_epi16(_mm_set1_epi32(a_u8_x4)),
		da_u16_x4 = _mm_sub_epi16(_mm_set1_epi16(256), a_u16_x4),
		sa_u16_x4 = _mm_add_epi16(a_u16_x4, _mm_set1_epi16(1)),

		/* mix first pixel pair */
		da01_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a01),
		sa01_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a01),
		mixed01      = _mm_add_epi16(_mm_mullo_epi16(fg01_u16_4x2, sa01_u16_4x2),
		                             _mm_mullo_epi16(bg01_u16_4x2, da01_u16_4x2)),

		/* mix second pixel pair */
		da23_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a23),
		sa23_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a23),
		mixed23      = _mm_add_epi16(_mm_mullo_epi16(fg23_u16_4x2, sa23_u16_4x2),
		                             _mm_mullo_epi16(bg23_u16_4x2, da23_u16_4x2)),

		result_4x4 = _mm_packus_epi16(_mm_srli_epi16(mixed01, 8),
		                              _mm_srli_epi16(mixed23, 8));

	_mm_storeu_si128((__m128i *)bg, result_4x4);
}

__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	Mix_masks const mix_masks { };

	for (; n > 3; n -= 4, dst += 4, pixel += 4, alpha += 4)
		_mix_4(dst, pixel, alpha, mix_masks);

	for (; n--; dst++, pixel++, alpha++)
		*dst = _mix(*dst, *pixel, *alpha);
}

#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */