openwrt/target/linux/starfive/patches-6.6/0092-riscv-Optimize-memcpy-with-aligned-version.patch

From dedbbfd891e4d0cdb825454eddfbf8386d0025b3 Mon Sep 17 00:00:00 2001
From: Mason Huo <mason.huo@starfivetech.com>
Date: Tue, 20 Jun 2023 13:37:52 +0800
Subject: [PATCH 092/116] riscv: Optimize memcpy with aligned version

Optimize copies of at least 128 bytes between word-aligned buffers; this
improves the performance of large block memcpy.

The new memcpy combines the C implementation from glibc with the kernel's assembly copy loop.

Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
Signed-off-by: Hal Feng <hal.feng@starfivetech.com>
---
 arch/riscv/lib/Makefile                       |   3 +-
 arch/riscv/lib/{memcpy.S => memcpy_aligned.S} |  36 +--
 arch/riscv/lib/string.c                       | 266 ++++++++++++++++++
 3 files changed, 273 insertions(+), 32 deletions(-)
 rename arch/riscv/lib/{memcpy.S => memcpy_aligned.S} (67%)
 create mode 100644 arch/riscv/lib/string.c

--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
-lib-y			+= memcpy.o
 lib-y			+= memset.o
 lib-y			+= memmove.o
 lib-y			+= strcmp.o
@@ -9,5 +8,7 @@ lib-y			+= strncmp.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
+lib-y			+= string.o
+lib-y			+= memcpy_aligned.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
--- a/arch/riscv/lib/memcpy.S
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY(__memcpy)
-WEAK(memcpy)
-	move t6, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented copy for small sizes */
-	sltiu a3, a2, 128
-	bnez a3, 4f
-	/* Use word-oriented copy only if low-order bits match */
-	andi a3, t6, SZREG-1
-	andi a4, a1, SZREG-1
-	bne a3, a4, 4f
-
-	beqz a3, 2f  /* Skip if already aligned */
-	/*
-	 * Round to nearest double word-aligned address
-	 * greater than or equal to start address
-	 */
-	andi a3, a1, ~(SZREG-1)
-	addi a3, a3, SZREG
-	/* Handle initial misalignment */
-	sub a4, a3, a1
-1:
-	lb a5, 0(a1)
-	addi a1, a1, 1
-	sb a5, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2:
-	andi a4, a2, ~((16*SZREG)-1)
-	beqz a4, 4f
-	add a3, a1, a4
-3:
-	REG_L a4,       0(a1)
-	REG_L a5,   SZREG(a1)
-	REG_L a6, 2*SZREG(a1)
-	REG_L a7, 3*SZREG(a1)
-	REG_L t0, 4*SZREG(a1)
-	REG_L t1, 5*SZREG(a1)
-	REG_L t2, 6*SZREG(a1)
-	REG_L t3, 7*SZREG(a1)
-	REG_L t4, 8*SZREG(a1)
-	REG_L t5, 9*SZREG(a1)
-	REG_S a4,       0(t6)
-	REG_S a5,   SZREG(t6)
-	REG_S a6, 2*SZREG(t6)
-	REG_S a7, 3*SZREG(t6)
-	REG_S t0, 4*SZREG(t6)
-	REG_S t1, 5*SZREG(t6)
-	REG_S t2, 6*SZREG(t6)
-	REG_S t3, 7*SZREG(t6)
-	REG_S t4, 8*SZREG(t6)
-	REG_S t5, 9*SZREG(t6)
-	REG_L a4, 10*SZREG(a1)
-	REG_L a5, 11*SZREG(a1)
-	REG_L a6, 12*SZREG(a1)
-	REG_L a7, 13*SZREG(a1)
-	REG_L t0, 14*SZREG(a1)
-	REG_L t1, 15*SZREG(a1)
-	addi a1, a1, 16*SZREG
-	REG_S a4, 10*SZREG(t6)
-	REG_S a5, 11*SZREG(t6)
-	REG_S a6, 12*SZREG(t6)
-	REG_S a7, 13*SZREG(t6)
-	REG_S t0, 14*SZREG(t6)
-	REG_S t1, 15*SZREG(t6)
-	addi t6, t6, 16*SZREG
-	bltu a1, a3, 3b
-	andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, a1, a2
-
-	/* Use word-oriented copy if co-aligned to word boundary */
-	or a5, a1, t6
-	or a5, a5, a3
-	andi a5, a5, 3
-	bnez a5, 5f
-7:
-	lw a4, 0(a1)
-	addi a1, a1, 4
-	sw a4, 0(t6)
-	addi t6, t6, 4
-	bltu a1, a3, 7b
-
-	ret
-
-5:
-	lb a4, 0(a1)
-	addi a1, a1, 1
-	sb a4, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 5b
-6:
-	ret
-END(__memcpy)
-SYM_FUNC_ALIAS(__pi_memcpy, __memcpy)
-SYM_FUNC_ALIAS(__pi___memcpy, __memcpy)
--- /dev/null
+++ b/arch/riscv/lib/memcpy_aligned.S
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/* void *__memcpy_aligned(void *, const void *, size_t) */
+ENTRY(__memcpy_aligned)
+	move t6, a0  /* t6 walks the destination; a0 stays intact as the return value */
+	/* NOTE(review): no alignment fix-up here; callers presumably pass SZREG-aligned pointers - confirm */
+2:
+	andi a4, a2, ~((16*SZREG)-1)  /* a4 = byte count rounded down to a multiple of 16*SZREG */
+	beqz a4, 4f
+	add a3, a1, a4  /* a3 = source address at which the unrolled loop stops */
+3:
+	REG_L a4,       0(a1)
+	REG_L a5,   SZREG(a1)
+	REG_L a6, 2*SZREG(a1)
+	REG_L a7, 3*SZREG(a1)
+	REG_L t0, 4*SZREG(a1)
+	REG_L t1, 5*SZREG(a1)
+	REG_L t2, 6*SZREG(a1)
+	REG_L t3, 7*SZREG(a1)
+	REG_L t4, 8*SZREG(a1)
+	REG_L t5, 9*SZREG(a1)
+	REG_S a4,       0(t6)
+	REG_S a5,   SZREG(t6)
+	REG_S a6, 2*SZREG(t6)
+	REG_S a7, 3*SZREG(t6)
+	REG_S t0, 4*SZREG(t6)
+	REG_S t1, 5*SZREG(t6)
+	REG_S t2, 6*SZREG(t6)
+	REG_S t3, 7*SZREG(t6)
+	REG_S t4, 8*SZREG(t6)
+	REG_S t5, 9*SZREG(t6)
+	REG_L a4, 10*SZREG(a1)
+	REG_L a5, 11*SZREG(a1)
+	REG_L a6, 12*SZREG(a1)
+	REG_L a7, 13*SZREG(a1)
+	REG_L t0, 14*SZREG(a1)
+	REG_L t1, 15*SZREG(a1)
+	addi a1, a1, 16*SZREG
+	REG_S a4, 10*SZREG(t6)
+	REG_S a5, 11*SZREG(t6)
+	REG_S a6, 12*SZREG(t6)
+	REG_S a7, 13*SZREG(t6)
+	REG_S t0, 14*SZREG(t6)
+	REG_S t1, 15*SZREG(t6)
+	addi t6, t6, 16*SZREG
+	bltu a1, a3, 3b
+	andi a2, a2, (16*SZREG)-1  /* Update count */
+
+4:
+	/* Copy the a2 bytes that remain, if any */
+	beqz a2, 6f
+	add a3, a1, a2
+
+	/* Use 4-byte copies only if source, destination and end address are all 4-byte aligned */
+	or a5, a1, t6
+	or a5, a5, a3
+	andi a5, a5, 3
+	bnez a5, 5f
+7:
+	lw a4, 0(a1)
+	addi a1, a1, 4
+	sw a4, 0(t6)
+	addi t6, t6, 4
+	bltu a1, a3, 7b
+
+	ret
+
+5:
+	lb a4, 0(a1)
+	addi a1, a1, 1
+	sb a4, 0(t6)
+	addi t6, t6, 1
+	bltu a1, a3, 5b
+6:
+	ret
+END(__memcpy_aligned)
+SYM_FUNC_ALIAS(__pi_memcpy, __memcpy_aligned)  /* NOTE(review): __pi_ callers also get no alignment fix-up - confirm */
+SYM_FUNC_ALIAS(__pi___memcpy, __memcpy_aligned)
--- /dev/null
+++ b/arch/riscv/lib/string.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copy memory to memory until the specified number of bytes
+ * has been copied.  Overlap is NOT handled correctly.
+ * Copyright (C) 1991-2020 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Torbjorn Granlund (tege@sics.se).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the GNU C Library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ *
+ */
+
+#define __NO_FORTIFY
+#include <linux/types.h>
+#include <linux/module.h>
+
+#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))	/* w0's upper bits joined with w1's lower bits */
+#define OP_T_THRES      16	/* minimum byte count for the word-copy path in __memcpy() */
+#define op_t    unsigned long	/* unit of the word-wise copies */
+#define OPSIZ   (sizeof(op_t))
+#define OPSIZ_MASK   (sizeof(op_t) - 1)	/* low address bits that are zero when op_t-aligned */
+#define FAST_COPY_THRES  (128)	/* minimum byte count for the __memcpy_aligned() path */
+#define byte    unsigned char
+
+static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)
+{
+	op_t a0, a1;
+	/* Copies len op_t words (not bytes) from srcp to dstp; both are accessed through op_t pointers. */
+	switch (len % 8) {
+	case 2:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 6 * OPSIZ;
+		dstp -= 7 * OPSIZ;
+		len += 6;
+		goto do1;
+	case 3:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 5 * OPSIZ;
+		dstp -= 6 * OPSIZ;
+		len += 5;
+		goto do2;
+	case 4:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 4 * OPSIZ;
+		dstp -= 5 * OPSIZ;
+		len += 4;
+		goto do3;
+	case 5:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 3 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		len += 3;
+		goto do4;
+	case 6:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 2 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do5;
+	case 7:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 1 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do6;
+	/* Cases 0 and 1 also handle len == 0 and a single word when OP_T_THRES <= 3 * OPSIZ. */
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 0 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		goto do7;
+	case 1:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= -1 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do8;                 /* No-op.  */
+	}
+	/* Unrolled by 8: each step loads the next word and stores the word loaded one step earlier. */
+	do {
+do8:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = a1;
+do7:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = a0;
+do6:
+		a0 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = a1;
+do5:
+		a1 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = a0;
+do4:
+		a0 = ((op_t *) srcp)[4];
+		((op_t *) dstp)[4] = a1;
+do3:
+		a1 = ((op_t *) srcp)[5];
+		((op_t *) dstp)[5] = a0;
+do2:
+		a0 = ((op_t *) srcp)[6];
+		((op_t *) dstp)[6] = a1;
+do1:
+		a1 = ((op_t *) srcp)[7];
+		((op_t *) dstp)[7] = a0;
+
+		srcp += 8 * OPSIZ;
+		dstp += 8 * OPSIZ;
+		len -= 8;
+	} while (len != 0);
+
+	/* This is the right position for do0.  Please don't move
+	 * it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = a1;
+}
+
+static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)
+{
+	op_t a0, a1, a2, a3;
+	int sh_1, sh_2;
+	/* Copies len op_t words to dstp from a srcp expected to be misaligned, using aligned loads only. */
+	/* Calculate how to shift a word read at the memory operation
+	 * aligned srcp to make it aligned for copy.
+	 */
+
+	sh_1 = 8 * (srcp % OPSIZ);
+	sh_2 = 8 * OPSIZ - sh_1;
+	/* NOTE(review): sh_1 == 0 would make sh_2 the full word width; WORD_COPY_FWD only calls this for unaligned srcp. */
+	/* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+	 * it points in the middle of.
+	 */
+	srcp &= -OPSIZ;
+
+	switch (len % 4) {
+	case 2:
+		a1 = ((op_t *) srcp)[0];
+		a2 = ((op_t *) srcp)[1];
+		srcp -= 1 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do1;
+	case 3:
+		a0 = ((op_t *) srcp)[0];
+		a1 = ((op_t *) srcp)[1];
+		srcp -= 0 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do2;
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;
+		a3 = ((op_t *) srcp)[0];
+		a0 = ((op_t *) srcp)[1];
+		srcp -= -1 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		len += 0;
+		goto do3;
+	case 1:
+		a2 = ((op_t *) srcp)[0];
+		a3 = ((op_t *) srcp)[1];
+		srcp -= -2 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do4;                 /* No-op.  */
+	}
+	/* Each step loads one aligned source word and stores a word merged from the two loaded before it. */
+	do {
+do4:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
+do3:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);
+do2:
+		a2 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);
+do1:
+		a3 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);
+
+		srcp += 4 * OPSIZ;
+		dstp += 4 * OPSIZ;
+		len -= 4;
+	} while (len != 0);
+
+	/* This is the right position for do0.  Please don't move
+	 * it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
+}
+
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)		\
+do {	/* Copies nbytes bytes one at a time; dst_bp and src_bp are advanced past them. */	\
+	size_t __nbytes = (nbytes);	/* nbytes is evaluated exactly once */	\
+	while (__nbytes > 0) {						\
+		byte __x = ((byte *) src_bp)[0];		\
+		src_bp += 1;				\
+		__nbytes -= 1;				\
+		((byte *) dst_bp)[0] = __x;		\
+		dst_bp += 1;				\
+	}						\
+} while (0)
+
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)			\
+do {	/* Copies the whole op_t words contained in nbytes and advances both addresses past them. */	\
+	if (src_bp % OPSIZ == 0)						\
+		_wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	else									\
+		_wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	src_bp += (nbytes) & -OPSIZ;						\
+	dst_bp += (nbytes) & -OPSIZ;						\
+	(nbytes_left) = (nbytes) % OPSIZ;	/* assigned last, so nbytes_left may be the same lvalue as nbytes */	\
+} while (0)
+
+extern void *__memcpy_aligned(void *dest, const void *src, size_t len);
+void *__memcpy(void *dest, const void *src, size_t len)
+{
+	unsigned long dstp = (long) dest;
+	unsigned long srcp = (long) src;
+	/* Copies len bytes from src to dest front to back and returns dest; overlap is not handled. */
+	/* If there are not too few bytes to copy, use word copy.  */
+	if (len >= OP_T_THRES) {
+		if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&
+			((srcp & OPSIZ_MASK) == 0)) {
+			__memcpy_aligned(dest, src, len);	/* long copy, both pointers op_t-aligned: assembly path */
+			return dest;
+		}
+		/* Copy just a few bytes to make DSTP aligned.  */
+		len -= (-dstp) % OPSIZ;	/* take the count before BYTE_COPY_FWD advances dstp */
+		BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
+
+		/* Copy from SRCP to DSTP taking advantage of the known alignment of
+		 * DSTP.  Number of bytes remaining is put in the third argument,
+		 * i.e. in LEN.  This number may vary from machine to machine.
+		 */
+		WORD_COPY_FWD(dstp, srcp, len, len);
+	/* Fall out and copy the tail.  */
+	}
+
+	/* There are just a few bytes to copy.  Use byte memory operations.  */
+	BYTE_COPY_FWD(dstp, srcp, len);
+
+	return dest;
+}
+
+void *memcpy(void *dest, const void *src, size_t len) __weak __alias(__memcpy);
starfive: add patches for 6.6 Add updated patches for 6.6. DMA/cache-handling patches have been reworked / backported from upstream. Signed-off-by: Zoltan HERPAI <wigyori@uid0.hu> 2024-09-01 14:06:29 +00:00			`From dedbbfd891e4d0cdb825454eddfbf8386d0025b3 Mon Sep 17 00:00:00 2001`
			`From: Mason Huo <mason.huo@starfivetech.com>`
			`Date: Tue, 20 Jun 2023 13:37:52 +0800`
			`Subject: [PATCH 092/116] riscv: Optimize memcpy with aligned version`

			`Optimizing the 128 byte align case, this will improve the`
			`performance of large block memcpy.`

			`Here we combine the memcpy of glibc and kernel.`

			`Signed-off-by: Mason Huo <mason.huo@starfivetech.com>`
			`Signed-off-by: Hal Feng <hal.feng@starfivetech.com>`
			`---`
			`arch/riscv/lib/Makefile \| 3 +-`
			`arch/riscv/lib/{memcpy.S => memcpy_aligned.S} \| 36 +--`
			`arch/riscv/lib/string.c \| 266 ++++++++++++++++++`
			`3 files changed, 273 insertions(+), 32 deletions(-)`
			`rename arch/riscv/lib/{memcpy.S => memcpy_aligned.S} (67%)`
			`create mode 100644 arch/riscv/lib/string.c`

			`--- a/arch/riscv/lib/Makefile`
			`+++ b/arch/riscv/lib/Makefile`
			`@@ -1,6 +1,5 @@`
			`# SPDX-License-Identifier: GPL-2.0-only`
			`lib-y += delay.o`
			`-lib-y += memcpy.o`
			`lib-y += memset.o`
			`lib-y += memmove.o`
			`lib-y += strcmp.o`
			`@@ -9,5 +8,7 @@ lib-y += strncmp.o`
			`lib-$(CONFIG_MMU) += uaccess.o`
			`lib-$(CONFIG_64BIT) += tishift.o`
			`lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o`
			`+lib-y += string.o`
			`+lib-y += memcpy_aligned.o`

			`obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o`
			`--- a/arch/riscv/lib/memcpy.S`
			`+++ /dev/null`
			`@@ -1,110 +0,0 @@`
			`-/* SPDX-License-Identifier: GPL-2.0-only */`
			`-/*`
			`- * Copyright (C) 2013 Regents of the University of California`
			`- */`
			`-`
			`-#include <linux/linkage.h>`
			`-#include <asm/asm.h>`
			`-`
			`-/* void *memcpy(void *, const void *, size_t) */`
			`-ENTRY(__memcpy)`
			`-WEAK(memcpy)`
			`- move t6, a0 /* Preserve return value */`
			`-`
			`- /* Defer to byte-oriented copy for small sizes */`
			`- sltiu a3, a2, 128`
			`- bnez a3, 4f`
			`- /* Use word-oriented copy only if low-order bits match */`
			`- andi a3, t6, SZREG-1`
			`- andi a4, a1, SZREG-1`
			`- bne a3, a4, 4f`
			`-`
			`- beqz a3, 2f /* Skip if already aligned */`
			`- /*`
			`- * Round to nearest double word-aligned address`
			`- * greater than or equal to start address`
			`- */`
			`- andi a3, a1, ~(SZREG-1)`
			`- addi a3, a3, SZREG`
			`- /* Handle initial misalignment */`
			`- sub a4, a3, a1`
			`-1:`
			`- lb a5, 0(a1)`
			`- addi a1, a1, 1`
			`- sb a5, 0(t6)`
			`- addi t6, t6, 1`
			`- bltu a1, a3, 1b`
			`- sub a2, a2, a4 /* Update count */`
			`-`
			`-2:`
			`- andi a4, a2, ~((16*SZREG)-1)`
			`- beqz a4, 4f`
			`- add a3, a1, a4`
			`-3:`
			`- REG_L a4, 0(a1)`
			`- REG_L a5, SZREG(a1)`
			`- REG_L a6, 2*SZREG(a1)`
			`- REG_L a7, 3*SZREG(a1)`
			`- REG_L t0, 4*SZREG(a1)`
			`- REG_L t1, 5*SZREG(a1)`
			`- REG_L t2, 6*SZREG(a1)`
			`- REG_L t3, 7*SZREG(a1)`
			`- REG_L t4, 8*SZREG(a1)`
			`- REG_L t5, 9*SZREG(a1)`
			`- REG_S a4, 0(t6)`
			`- REG_S a5, SZREG(t6)`
			`- REG_S a6, 2*SZREG(t6)`
			`- REG_S a7, 3*SZREG(t6)`
			`- REG_S t0, 4*SZREG(t6)`
			`- REG_S t1, 5*SZREG(t6)`
			`- REG_S t2, 6*SZREG(t6)`
			`- REG_S t3, 7*SZREG(t6)`
			`- REG_S t4, 8*SZREG(t6)`
			`- REG_S t5, 9*SZREG(t6)`
			`- REG_L a4, 10*SZREG(a1)`
			`- REG_L a5, 11*SZREG(a1)`
			`- REG_L a6, 12*SZREG(a1)`
			`- REG_L a7, 13*SZREG(a1)`
			`- REG_L t0, 14*SZREG(a1)`
			`- REG_L t1, 15*SZREG(a1)`
			`- addi a1, a1, 16*SZREG`
			`- REG_S a4, 10*SZREG(t6)`
			`- REG_S a5, 11*SZREG(t6)`
			`- REG_S a6, 12*SZREG(t6)`
			`- REG_S a7, 13*SZREG(t6)`
			`- REG_S t0, 14*SZREG(t6)`
			`- REG_S t1, 15*SZREG(t6)`
			`- addi t6, t6, 16*SZREG`
			`- bltu a1, a3, 3b`
			`- andi a2, a2, (16*SZREG)-1 /* Update count */`
			`-`
			`-4:`
			`- /* Handle trailing misalignment */`
			`- beqz a2, 6f`
			`- add a3, a1, a2`
			`-`
			`- /* Use word-oriented copy if co-aligned to word boundary */`
			`- or a5, a1, t6`
			`- or a5, a5, a3`
			`- andi a5, a5, 3`
			`- bnez a5, 5f`
			`-7:`
			`- lw a4, 0(a1)`
			`- addi a1, a1, 4`
			`- sw a4, 0(t6)`
			`- addi t6, t6, 4`
			`- bltu a1, a3, 7b`
			`-`
			`- ret`
			`-`
			`-5:`
			`- lb a4, 0(a1)`
			`- addi a1, a1, 1`
			`- sb a4, 0(t6)`
			`- addi t6, t6, 1`
			`- bltu a1, a3, 5b`
			`-6:`
			`- ret`
			`-END(__memcpy)`
			`-SYM_FUNC_ALIAS(__pi_memcpy, __memcpy)`
			`-SYM_FUNC_ALIAS(__pi___memcpy, __memcpy)`
			`--- /dev/null`
			`+++ b/arch/riscv/lib/memcpy_aligned.S`
			`@@ -0,0 +1,84 @@`
			`+/* SPDX-License-Identifier: GPL-2.0-only */`
			`+/*`
			`+ * Copyright (C) 2013 Regents of the University of California`
			`+ */`
			`+`
			`+#include <linux/linkage.h>`
			`+#include <asm/asm.h>`
			`+`
			`+/* void *__memcpy_aligned(void *, const void *, size_t) */`
			`+ENTRY(__memcpy_aligned)`
			`+ move t6, a0 /* Preserve return value */`
			`+`
			`+2:`
			`+ andi a4, a2, ~((16*SZREG)-1)`
			`+ beqz a4, 4f`
			`+ add a3, a1, a4`
			`+3:`
			`+ REG_L a4, 0(a1)`
			`+ REG_L a5, SZREG(a1)`
			`+ REG_L a6, 2*SZREG(a1)`
			`+ REG_L a7, 3*SZREG(a1)`
			`+ REG_L t0, 4*SZREG(a1)`
			`+ REG_L t1, 5*SZREG(a1)`
			`+ REG_L t2, 6*SZREG(a1)`
			`+ REG_L t3, 7*SZREG(a1)`
			`+ REG_L t4, 8*SZREG(a1)`
			`+ REG_L t5, 9*SZREG(a1)`
			`+ REG_S a4, 0(t6)`
			`+ REG_S a5, SZREG(t6)`
			`+ REG_S a6, 2*SZREG(t6)`
			`+ REG_S a7, 3*SZREG(t6)`
			`+ REG_S t0, 4*SZREG(t6)`
			`+ REG_S t1, 5*SZREG(t6)`
			`+ REG_S t2, 6*SZREG(t6)`
			`+ REG_S t3, 7*SZREG(t6)`
			`+ REG_S t4, 8*SZREG(t6)`
			`+ REG_S t5, 9*SZREG(t6)`
			`+ REG_L a4, 10*SZREG(a1)`
			`+ REG_L a5, 11*SZREG(a1)`
			`+ REG_L a6, 12*SZREG(a1)`
			`+ REG_L a7, 13*SZREG(a1)`
			`+ REG_L t0, 14*SZREG(a1)`
			`+ REG_L t1, 15*SZREG(a1)`
			`+ addi a1, a1, 16*SZREG`
			`+ REG_S a4, 10*SZREG(t6)`
			`+ REG_S a5, 11*SZREG(t6)`
			`+ REG_S a6, 12*SZREG(t6)`
			`+ REG_S a7, 13*SZREG(t6)`
			`+ REG_S t0, 14*SZREG(t6)`
			`+ REG_S t1, 15*SZREG(t6)`
			`+ addi t6, t6, 16*SZREG`
			`+ bltu a1, a3, 3b`
			`+ andi a2, a2, (16*SZREG)-1 /* Update count */`
			`+`
			`+4:`
			`+ /* Handle trailing misalignment */`
			`+ beqz a2, 6f`
			`+ add a3, a1, a2`
			`+`
			`+ /* Use word-oriented copy if co-aligned to word boundary */`
			`+ or a5, a1, t6`
			`+ or a5, a5, a3`
			`+ andi a5, a5, 3`
			`+ bnez a5, 5f`
			`+7:`
			`+ lw a4, 0(a1)`
			`+ addi a1, a1, 4`
			`+ sw a4, 0(t6)`
			`+ addi t6, t6, 4`
			`+ bltu a1, a3, 7b`
			`+`
			`+ ret`
			`+`
			`+5:`
			`+ lb a4, 0(a1)`
			`+ addi a1, a1, 1`
			`+ sb a4, 0(t6)`
			`+ addi t6, t6, 1`
			`+ bltu a1, a3, 5b`
			`+6:`
			`+ ret`
			`+END(__memcpy_aligned)`
			`+SYM_FUNC_ALIAS(__pi_memcpy, __memcpy_aligned)`
			`+SYM_FUNC_ALIAS(__pi___memcpy, __memcpy_aligned)`
			`--- /dev/null`
			`+++ b/arch/riscv/lib/string.c`
			`@@ -0,0 +1,266 @@`
			`+// SPDX-License-Identifier: GPL-2.0-only`
			`+/*`
			`+ * Copy memory to memory until the specified number of bytes`
			`+ * has been copied. Overlap is NOT handled correctly.`
			`+ * Copyright (C) 1991-2020 Free Software Foundation, Inc.`
			`+ * This file is part of the GNU C Library.`
			`+ * Contributed by Torbjorn Granlund (tege@sics.se).`
			`+ *`
			`+ * The GNU C Library is free software; you can redistribute it and/or`
			`+ * modify it under the terms of the GNU Lesser General Public`
			`+ * License as published by the Free Software Foundation; either`
			`+ * version 2.1 of the License, or (at your option) any later version.`
			`+ *`
			`+ * The GNU C Library is distributed in the hope that it will be useful,`
			`+ * but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`+ * Lesser General Public License for more details.`
			`+ *`
			`+ * You should have received a copy of the GNU Lesser General Public`
			`+ * License along with the GNU C Library; if not, see`
			`+ * <https://www.gnu.org/licenses/>.`
			`+ *`
			`+ */`
			`+`
			`+#define __NO_FORTIFY`
			`+#include <linux/types.h>`
			`+#include <linux/module.h>`
			`+`
			`+#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) \| ((w1) << (sh_2)))`
			`+#define OP_T_THRES 16`
			`+#define op_t unsigned long`
			`+#define OPSIZ (sizeof(op_t))`
			`+#define OPSIZ_MASK (sizeof(op_t) - 1)`
			`+#define FAST_COPY_THRES (128)`
			`+#define byte unsigned char`
			`+`
			`+static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)`
			`+{`
			`+ op_t a0, a1;`
			`+`
			`+ switch (len % 8) {`
			`+ case 2:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ srcp -= 6 * OPSIZ;`
			`+ dstp -= 7 * OPSIZ;`
			`+ len += 6;`
			`+ goto do1;`
			`+ case 3:`
			`+ a1 = ((op_t *) srcp)[0];`
			`+ srcp -= 5 * OPSIZ;`
			`+ dstp -= 6 * OPSIZ;`
			`+ len += 5;`
			`+ goto do2;`
			`+ case 4:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ srcp -= 4 * OPSIZ;`
			`+ dstp -= 5 * OPSIZ;`
			`+ len += 4;`
			`+ goto do3;`
			`+ case 5:`
			`+ a1 = ((op_t *) srcp)[0];`
			`+ srcp -= 3 * OPSIZ;`
			`+ dstp -= 4 * OPSIZ;`
			`+ len += 3;`
			`+ goto do4;`
			`+ case 6:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ srcp -= 2 * OPSIZ;`
			`+ dstp -= 3 * OPSIZ;`
			`+ len += 2;`
			`+ goto do5;`
			`+ case 7:`
			`+ a1 = ((op_t *) srcp)[0];`
			`+ srcp -= 1 * OPSIZ;`
			`+ dstp -= 2 * OPSIZ;`
			`+ len += 1;`
			`+ goto do6;`
			`+`
			`+ case 0:`
			`+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)`
			`+ return;`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ srcp -= 0 * OPSIZ;`
			`+ dstp -= 1 * OPSIZ;`
			`+ goto do7;`
			`+ case 1:`
			`+ a1 = ((op_t *) srcp)[0];`
			`+ srcp -= -1 * OPSIZ;`
			`+ dstp -= 0 * OPSIZ;`
			`+ len -= 1;`
			`+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)`
			`+ goto do0;`
			`+ goto do8; /* No-op. */`
			`+ }`
			`+`
			`+ do {`
			`+do8:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ ((op_t *) dstp)[0] = a1;`
			`+do7:`
			`+ a1 = ((op_t *) srcp)[1];`
			`+ ((op_t *) dstp)[1] = a0;`
			`+do6:`
			`+ a0 = ((op_t *) srcp)[2];`
			`+ ((op_t *) dstp)[2] = a1;`
			`+do5:`
			`+ a1 = ((op_t *) srcp)[3];`
			`+ ((op_t *) dstp)[3] = a0;`
			`+do4:`
			`+ a0 = ((op_t *) srcp)[4];`
			`+ ((op_t *) dstp)[4] = a1;`
			`+do3:`
			`+ a1 = ((op_t *) srcp)[5];`
			`+ ((op_t *) dstp)[5] = a0;`
			`+do2:`
			`+ a0 = ((op_t *) srcp)[6];`
			`+ ((op_t *) dstp)[6] = a1;`
			`+do1:`
			`+ a1 = ((op_t *) srcp)[7];`
			`+ ((op_t *) dstp)[7] = a0;`
			`+`
			`+ srcp += 8 * OPSIZ;`
			`+ dstp += 8 * OPSIZ;`
			`+ len -= 8;`
			`+ } while (len != 0);`
			`+`
			`+ /* This is the right position for do0. Please don't move`
			`+ * it into the loop.`
			`+ */`
			`+do0:`
			`+ ((op_t *) dstp)[0] = a1;`
			`+}`
			`+`
			`+static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)`
			`+{`
			`+ op_t a0, a1, a2, a3;`
			`+ int sh_1, sh_2;`
			`+`
			`+ /* Calculate how to shift a word read at the memory operation`
			`+ * aligned srcp to make it aligned for copy.`
			`+ */`
			`+`
			`+ sh_1 = 8 * (srcp % OPSIZ);`
			`+ sh_2 = 8 * OPSIZ - sh_1;`
			`+`
			``+ /* Make SRCP aligned by rounding it down to the beginning of the `op_t'``
			`+ * it points in the middle of.`
			`+ */`
			`+ srcp &= -OPSIZ;`
			`+`
			`+ switch (len % 4) {`
			`+ case 2:`
			`+ a1 = ((op_t *) srcp)[0];`
			`+ a2 = ((op_t *) srcp)[1];`
			`+ srcp -= 1 * OPSIZ;`
			`+ dstp -= 3 * OPSIZ;`
			`+ len += 2;`
			`+ goto do1;`
			`+ case 3:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ a1 = ((op_t *) srcp)[1];`
			`+ srcp -= 0 * OPSIZ;`
			`+ dstp -= 2 * OPSIZ;`
			`+ len += 1;`
			`+ goto do2;`
			`+ case 0:`
			`+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)`
			`+ return;`
			`+ a3 = ((op_t *) srcp)[0];`
			`+ a0 = ((op_t *) srcp)[1];`
			`+ srcp -= -1 * OPSIZ;`
			`+ dstp -= 1 * OPSIZ;`
			`+ len += 0;`
			`+ goto do3;`
			`+ case 1:`
			`+ a2 = ((op_t *) srcp)[0];`
			`+ a3 = ((op_t *) srcp)[1];`
			`+ srcp -= -2 * OPSIZ;`
			`+ dstp -= 0 * OPSIZ;`
			`+ len -= 1;`
			`+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)`
			`+ goto do0;`
			`+ goto do4; /* No-op. */`
			`+ }`
			`+`
			`+ do {`
			`+do4:`
			`+ a0 = ((op_t *) srcp)[0];`
			`+ ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);`
			`+do3:`
			`+ a1 = ((op_t *) srcp)[1];`
			`+ ((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);`
			`+do2:`
			`+ a2 = ((op_t *) srcp)[2];`
			`+ ((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);`
			`+do1:`
			`+ a3 = ((op_t *) srcp)[3];`
			`+ ((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);`
			`+`
			`+ srcp += 4 * OPSIZ;`
			`+ dstp += 4 * OPSIZ;`
			`+ len -= 4;`
			`+ } while (len != 0);`
			`+`
			`+ /* This is the right position for do0. Please don't move`
			`+ * it into the loop.`
			`+ */`
			`+do0:`
			`+ ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);`
			`+}`
			`+`
			`+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \`
			`+do { \`
			`+ size_t __nbytes = (nbytes); \`
			`+ while (__nbytes > 0) { \`
			`+ byte __x = ((byte *) src_bp)[0]; \`
			`+ src_bp += 1; \`
			`+ __nbytes -= 1; \`
			`+ ((byte *) dst_bp)[0] = __x; \`
			`+ dst_bp += 1; \`
			`+ } \`
			`+} while (0)`
			`+`
			`+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \`
			`+do { \`
			`+ if (src_bp % OPSIZ == 0) \`
			`+ _wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \`
			`+ else \`
			`+ _wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \`
			`+ src_bp += (nbytes) & -OPSIZ; \`
			`+ dst_bp += (nbytes) & -OPSIZ; \`
			`+ (nbytes_left) = (nbytes) % OPSIZ; \`
			`+} while (0)`
			`+`
			`+extern void *__memcpy_aligned(void *dest, const void *src, size_t len);`
			`+void *__memcpy(void *dest, const void *src, size_t len)`
			`+{`
			`+ unsigned long dstp = (long) dest;`
			`+ unsigned long srcp = (long) src;`
			`+`
			`+ /* If there not too few bytes to copy, use word copy. */`
			`+ if (len >= OP_T_THRES) {`
			`+ if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&`
			`+ ((srcp & OPSIZ_MASK) == 0)) {`
			`+ __memcpy_aligned(dest, src, len);`
			`+ return dest;`
			`+ }`
			`+ /* Copy just a few bytes to make DSTP aligned. */`
			`+ len -= (-dstp) % OPSIZ;`
			`+ BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);`
			`+`
			`+ /* Copy from SRCP to DSTP taking advantage of the known alignment of`
			`+ * DSTP. Number of bytes remaining is put in the third argument,`
			`+ * i.e. in LEN. This number may vary from machine to machine.`
			`+ */`
			`+ WORD_COPY_FWD(dstp, srcp, len, len);`
			`+ /* Fall out and copy the tail. */`
			`+ }`
			`+`
			`+ /* There are just a few bytes to copy. Use byte memory operations. */`
			`+ BYTE_COPY_FWD(dstp, srcp, len);`
			`+`
			`+ return dest;`
			`+}`
			`+`
			`+void *memcpy(void *dest, const void *src, size_t len) __weak __alias(__memcpy);`