memcpy (arm): remove unused vfp implementation

The implementation is not in use any more. Furthermore, on typical ARM cores such as the Cortex-A9, the cached read appears to be the bottleneck rather than instruction density. On a Zynq-7000 SoC, the vfp implementation performed significantly worse than the standard load/store multiple implementation with preloading. genodelabs/genode#4456
2025-04-09 04:15:52 +00:00 · 2022-03-25 10:29:37 +01:00 · 2022-03-25 10:29:37 +01:00 · 4dcc095e5e
commit 4dcc095e5e
parent 052c33fc8c
1 changed files with 0 additions and 69 deletions
--- a/repos/base/include/spec/arm/vfp/cpu/string.h
+++ b/repos/base/include/spec/arm/vfp/cpu/string.h
@ -1,69 +0,0 @@
-/*
- * \brief  ARM-specific memcpy using VFP
- * \author Sebastian Sumpf
- * \date   2013-06-19
- *
- * Should work for VFPv2, VFPv3, and Advanced SIMD.
- */
-
-/*
- * Copyright (C) 2012-2017 Genode Labs GmbH
- *
- * This file is part of the Genode OS framework, which is distributed
- * under the terms of the GNU Affero General Public License version 3.
- */
-
-#ifndef _INCLUDE__SPEC__ARM__VFP__CPU__STRING_H_
-#define _INCLUDE__SPEC__ARM__VFP__CPU__STRING_H_
-
-namespace Genode {
-
-	/**
-	 * Copy memory block using VFP and ARM load/store-multiple instructions
-	 *
-	 * The copy proceeds in stages: single bytes until the pointers are
-	 * 4-byte aligned, 64-byte chunks via the VFP registers d0-d7, 32-byte
-	 * chunks via the core registers r3-r10, and finally single 32-bit words.
-	 *
-	 * If 'dst' and 'src' differ in their low two address bits, nothing is
-	 * copied and 'size' is returned unchanged. A trailing remainder of fewer
-	 * than 4 bytes is never copied by this function and is reported through
-	 * the return value.
-	 *
-	 * \param dst   destination memory block
-	 * \param src   source memory block
-	 * \param size  number of bytes to copy
-	 *
-	 * \return      Number of bytes not copied
-	 */
-	inline size_t memcpy_cpu(void *dst, const void *src, size_t size)
-	{
-		unsigned char *d = (unsigned char *)dst, *s = (unsigned char *)src; /* NOTE(review): the cast drops 'const' from 'src'; 's' is only read through below */
-		/* offset of each pointer from the previous 4-byte boundary (low two address bits) */
-		size_t d_align = (size_t)d & 0x3;
-		size_t s_align = (size_t)s & 0x3;
-
-		/* only same alignments work for the following loops */
-		if (d_align != s_align)
-			return size;
-
-		/* copy to 4 byte alignment
-		 *
-		 * The loop has an empty body; the byte copy happens in the increment
-		 * expression. 's_align' counts up to 4, so at most three bytes are
-		 * copied here. As both offsets are equal at this point, 'd' becomes
-		 * aligned together with 's'. */
-		for (; (size > 0) && (s_align > 0) && (s_align < 4);
-		     s_align++, *d++ = *s++, size--);
-
-		/* copy 64 byte chunks using FPU
-		 *
-		 * 'pld' issues a preload hint for the address 0xc0 (192) bytes past
-		 * the current source pointer. 'vldm'/'vstm' move eight 64-bit
-		 * registers (d0-d7) and write the advanced address back to 's'/'d'.
-		 *
-		 * NOTE(review): the clobber list names d0-d7 but no "memory" clobber,
-		 * although the statement reads and writes memory - confirm that this
-		 * is safe with respect to compiler reordering/caching of memory
-		 * accesses around the asm statement. */
-		for (; size >= 64; size -= 64)
-			asm volatile ("pld [%0, #0xc0]  \n\t"
-			              "vldm %0!,{d0-d7} \n\t"
-			              "vstm %1!,{d0-d7} \n\t"
-			              : "+r"(s), "+r" (d)
-                    :: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
-
-		/* copy left over 32 byte chunk
-		 *
-		 * Eight 32-bit core registers (r3-r10) are transferred per iteration
-		 * with address write-back. After the preceding loop 'size' is below
-		 * 64, so this loop body runs at most once.
-		 *
-		 * NOTE(review): no "memory" clobber here either - see above. */
-		for (; size >= 32; size -= 32)
-			asm volatile ("ldmia %0!, {r3 - r10} \n\t"
-			              "stmia %1!, {r3 - r10} \n\t"
-			              : "+r" (s), "+r" (d)
-			              :: "r3","r4","r5","r6","r7","r8","r9","r10");
-
-		for(; size >= 4; size -= 4) /* copy remaining whole 32-bit words via r3, post-incrementing both pointers by 4 */
-			asm volatile ("ldr r3, [%0], #4 \n\t"
-			              "str r3, [%1], #4 \n\t"
-			              : "+r" (s), "+r" (d)
-			              :: "r3");
-		return size; /* 0..3 bytes are left uncopied at this point */
-	}
-}
-
-#endif /* _INCLUDE__SPEC__ARM__VFP__CPU__STRING_H_ */