memcpy (arm): cache align and use pld for speedup

Preloading a few cache lines ahead brings a significant speedup in
memcpy throughput. Note that the particular (optimal) prefetch distance
was empirically determined on a Cortex-A9 (Zynq-7000) SoC @ 666 MHz. It
is best combined with L2 prefetching enabled (including double linefills
and a prefetch offset of 7). Yet, even without L2 prefetching this seems
to be the sweet spot.

genodelabs/genode#4456
Authored by Johannes Schlatow, 2022-03-25 10:12:46 +01:00; committed by Christian Helmuth
parent 4dcc095e5e
commit 0104a74028
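
For illustration, here is a minimal, portable sketch of the same idea in
plain C. The function name copy_32 and the use of GCC's __builtin_prefetch
(which emits a pld instruction on ARM targets that support it) are
assumptions of this sketch, not part of the commit; the actual change to
Genode's memcpy_cpu is the diff below. The sketch assumes source and
destination already share the same word alignment.

    #include <stddef.h>

    /*
     * Illustrative sketch only, not the Genode implementation: copy
     * 32-byte chunks while prefetching ahead of the load pointer,
     * mirroring the "pld [%0, #160]" added in the diff below.
     */
    static inline void copy_32(unsigned char *d, unsigned char const *s,
                               size_t size)
    {
        /* touch the first cache line early, like the initial "pld [%0, #0]" */
        __builtin_prefetch(s);

        /* copy 32-byte chunks, requesting data a few cache lines ahead */
        for (; size >= 32; size -= 32, s += 32, d += 32) {
            __builtin_prefetch(s + 160);  /* 160 bytes = five 32-byte lines */
            for (unsigned i = 0; i < 32; i++)
                d[i] = s[i];
        }

        /* byte-wise tail copy */
        for (; size > 0; size--)
            *d++ = *s++;
    }

The prefetch distance is the tuning knob the commit message refers to:
with 32-byte cache lines on the Cortex-A9, an offset of 160 bytes past the
already-advanced load pointer keeps roughly five line fills in flight
while the current chunk is copied.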


@@ -30,21 +30,25 @@ namespace Genode {
 	{
 		unsigned char *d = (unsigned char *)dst, *s = (unsigned char *)src;
 
-		/* check 4-byte alignment */
-		size_t d_align = (size_t)d & 0x3;
-		size_t s_align = (size_t)s & 0x3;
+		/* fetch the first cache line */
+		asm volatile ("pld [%0, #0]\n\t" : "+r" (s));
 
-		/* only same alignments work for the following LDM/STM loop */
-		if (d_align != s_align)
+		/* check 32-byte (cache line) alignment */
+		size_t d_align = (size_t)d & 0x1f;
+		size_t s_align = (size_t)s & 0x1f;
+
+		/* only same word-alignments work for the following LDM/STM loop */
+		if ((d_align & 0x3) != (s_align & 0x3))
 			return size;
 
-		/* copy to 4-byte alignment */
-		for (; (size > 0) && (s_align > 0) && (s_align < 4);
+		/* copy to 32-byte alignment */
+		for (; (size > 0) && (s_align > 0) && (s_align < 32);
 		     s_align++, *d++ = *s++, size--);
 
 		/* copy 32 byte chunks */
 		for (; size >= 32; size -= 32) {
 			asm volatile ("ldmia %0!, {r3 - r10} \n\t"
+			              "pld  [%0, #160]\n\t"
 			              "stmia %1!, {r3 - r10} \n\t"
 			              : "+r" (s), "+r" (d)
 			              :: "r3","r4","r5","r6","r7","r8","r9","r10");