From dedbbfd891e4d0cdb825454eddfbf8386d0025b3 Mon Sep 17 00:00:00 2001 From: Mason Huo Date: Tue, 20 Jun 2023 13:37:52 +0800 Subject: [PATCH 092/116] riscv: Optimize memcpy with aligned version Optimizing the 128 byte align case, this will improve the performance of large block memcpy. Here we combine the memcpy of glibc and kernel. Signed-off-by: Mason Huo Signed-off-by: Hal Feng --- arch/riscv/lib/Makefile | 3 +- arch/riscv/lib/{memcpy.S => memcpy_aligned.S} | 36 +-- arch/riscv/lib/string.c | 266 ++++++++++++++++++ 3 files changed, 273 insertions(+), 32 deletions(-) rename arch/riscv/lib/{memcpy.S => memcpy_aligned.S} (67%) create mode 100644 arch/riscv/lib/string.c --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only lib-y += delay.o -lib-y += memcpy.o lib-y += memset.o lib-y += memmove.o lib-y += strcmp.o @@ -9,5 +8,7 @@ lib-y += strncmp.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o +lib-y += string.o +lib-y += memcpy_aligned.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o --- a/arch/riscv/lib/memcpy.S +++ /dev/null @@ -1,110 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2013 Regents of the University of California - */ - -#include -#include - -/* void *memcpy(void *, const void *, size_t) */ -ENTRY(__memcpy) -WEAK(memcpy) - move t6, a0 /* Preserve return value */ - - /* Defer to byte-oriented copy for small sizes */ - sltiu a3, a2, 128 - bnez a3, 4f - /* Use word-oriented copy only if low-order bits match */ - andi a3, t6, SZREG-1 - andi a4, a1, SZREG-1 - bne a3, a4, 4f - - beqz a3, 2f /* Skip if already aligned */ - /* - * Round to nearest double word-aligned address - * greater than or equal to start address - */ - andi a3, a1, ~(SZREG-1) - addi a3, a3, SZREG - /* Handle initial misalignment */ - sub a4, a3, a1 -1: - lb a5, 0(a1) - addi a1, a1, 1 - sb a5, 0(t6) - addi t6, t6, 1 - bltu a1, a3, 1b - sub a2, a2, a4 /* Update count */ - -2: - andi a4, a2, ~((16*SZREG)-1) - beqz a4, 4f - add a3, a1, a4 -3: - REG_L a4, 0(a1) - REG_L a5, SZREG(a1) - REG_L a6, 2*SZREG(a1) - REG_L a7, 3*SZREG(a1) - REG_L t0, 4*SZREG(a1) - REG_L t1, 5*SZREG(a1) - REG_L t2, 6*SZREG(a1) - REG_L t3, 7*SZREG(a1) - REG_L t4, 8*SZREG(a1) - REG_L t5, 9*SZREG(a1) - REG_S a4, 0(t6) - REG_S a5, SZREG(t6) - REG_S a6, 2*SZREG(t6) - REG_S a7, 3*SZREG(t6) - REG_S t0, 4*SZREG(t6) - REG_S t1, 5*SZREG(t6) - REG_S t2, 6*SZREG(t6) - REG_S t3, 7*SZREG(t6) - REG_S t4, 8*SZREG(t6) - REG_S t5, 9*SZREG(t6) - REG_L a4, 10*SZREG(a1) - REG_L a5, 11*SZREG(a1) - REG_L a6, 12*SZREG(a1) - REG_L a7, 13*SZREG(a1) - REG_L t0, 14*SZREG(a1) - REG_L t1, 15*SZREG(a1) - addi a1, a1, 16*SZREG - REG_S a4, 10*SZREG(t6) - REG_S a5, 11*SZREG(t6) - REG_S a6, 12*SZREG(t6) - REG_S a7, 13*SZREG(t6) - REG_S t0, 14*SZREG(t6) - REG_S t1, 15*SZREG(t6) - addi t6, t6, 16*SZREG - bltu a1, a3, 3b - andi a2, a2, (16*SZREG)-1 /* Update count */ - -4: - /* Handle trailing misalignment */ - beqz a2, 6f - add a3, a1, a2 - - /* Use word-oriented copy if co-aligned to word boundary */ - or a5, a1, t6 - or a5, a5, a3 - andi a5, a5, 3 - bnez a5, 5f -7: - lw a4, 0(a1) - addi a1, a1, 4 - sw a4, 0(t6) - addi t6, t6, 4 - bltu a1, a3, 7b - - ret - -5: - lb a4, 0(a1) - addi a1, a1, 1 - sb a4, 0(t6) - addi t6, t6, 1 - bltu a1, a3, 5b -6: - ret -END(__memcpy) -SYM_FUNC_ALIAS(__pi_memcpy, __memcpy) -SYM_FUNC_ALIAS(__pi___memcpy, __memcpy) --- /dev/null +++ b/arch/riscv/lib/memcpy_aligned.S @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Regents of the University of California + */ + +#include +#include + +/* void *__memcpy_aligned(void *, const void *, size_t) */ +ENTRY(__memcpy_aligned) + move t6, a0 /* Preserve return value */ + +2: + andi a4, a2, ~((16*SZREG)-1) + beqz a4, 4f + add a3, a1, a4 +3: + REG_L a4, 0(a1) + REG_L a5, SZREG(a1) + REG_L a6, 2*SZREG(a1) + REG_L a7, 3*SZREG(a1) + REG_L t0, 4*SZREG(a1) + REG_L t1, 5*SZREG(a1) + REG_L t2, 6*SZREG(a1) + REG_L t3, 7*SZREG(a1) + REG_L t4, 8*SZREG(a1) + REG_L t5, 9*SZREG(a1) + REG_S a4, 0(t6) + REG_S a5, SZREG(t6) + REG_S a6, 2*SZREG(t6) + REG_S a7, 3*SZREG(t6) + REG_S t0, 4*SZREG(t6) + REG_S t1, 5*SZREG(t6) + REG_S t2, 6*SZREG(t6) + REG_S t3, 7*SZREG(t6) + REG_S t4, 8*SZREG(t6) + REG_S t5, 9*SZREG(t6) + REG_L a4, 10*SZREG(a1) + REG_L a5, 11*SZREG(a1) + REG_L a6, 12*SZREG(a1) + REG_L a7, 13*SZREG(a1) + REG_L t0, 14*SZREG(a1) + REG_L t1, 15*SZREG(a1) + addi a1, a1, 16*SZREG + REG_S a4, 10*SZREG(t6) + REG_S a5, 11*SZREG(t6) + REG_S a6, 12*SZREG(t6) + REG_S a7, 13*SZREG(t6) + REG_S t0, 14*SZREG(t6) + REG_S t1, 15*SZREG(t6) + addi t6, t6, 16*SZREG + bltu a1, a3, 3b + andi a2, a2, (16*SZREG)-1 /* Update count */ + +4: + /* Handle trailing misalignment */ + beqz a2, 6f + add a3, a1, a2 + + /* Use word-oriented copy if co-aligned to word boundary */ + or a5, a1, t6 + or a5, a5, a3 + andi a5, a5, 3 + bnez a5, 5f +7: + lw a4, 0(a1) + addi a1, a1, 4 + sw a4, 0(t6) + addi t6, t6, 4 + bltu a1, a3, 7b + + ret + +5: + lb a4, 0(a1) + addi a1, a1, 1 + sb a4, 0(t6) + addi t6, t6, 1 + bltu a1, a3, 5b +6: + ret +END(__memcpy_aligned) +SYM_FUNC_ALIAS(__pi_memcpy, __memcpy_aligned) +SYM_FUNC_ALIAS(__pi___memcpy, __memcpy_aligned) --- /dev/null +++ b/arch/riscv/lib/string.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copy memory to memory until the specified number of bytes + * has been copied. Overlap is NOT handled correctly. + * Copyright (C) 1991-2020 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Torbjorn Granlund (tege@sics.se). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with the GNU C Library; if not, see + * . + * + */ + +#define __NO_FORTIFY +#include +#include + +#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2))) +#define OP_T_THRES 16 +#define op_t unsigned long +#define OPSIZ (sizeof(op_t)) +#define OPSIZ_MASK (sizeof(op_t) - 1) +#define FAST_COPY_THRES (128) +#define byte unsigned char + +static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len) +{ + op_t a0, a1; + + switch (len % 8) { + case 2: + a0 = ((op_t *) srcp)[0]; + srcp -= 6 * OPSIZ; + dstp -= 7 * OPSIZ; + len += 6; + goto do1; + case 3: + a1 = ((op_t *) srcp)[0]; + srcp -= 5 * OPSIZ; + dstp -= 6 * OPSIZ; + len += 5; + goto do2; + case 4: + a0 = ((op_t *) srcp)[0]; + srcp -= 4 * OPSIZ; + dstp -= 5 * OPSIZ; + len += 4; + goto do3; + case 5: + a1 = ((op_t *) srcp)[0]; + srcp -= 3 * OPSIZ; + dstp -= 4 * OPSIZ; + len += 3; + goto do4; + case 6: + a0 = ((op_t *) srcp)[0]; + srcp -= 2 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do5; + case 7: + a1 = ((op_t *) srcp)[0]; + srcp -= 1 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a0 = ((op_t *) srcp)[0]; + srcp -= 0 * OPSIZ; + dstp -= 1 * OPSIZ; + goto do7; + case 1: + a1 = ((op_t *) srcp)[0]; + srcp -= -1 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do { +do8: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; +do7: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = a0; +do6: + a0 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = a1; +do5: + a1 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = a0; +do4: + a0 = ((op_t *) srcp)[4]; + ((op_t *) dstp)[4] = a1; +do3: + a1 = ((op_t *) srcp)[5]; + ((op_t *) dstp)[5] = a0; +do2: + a0 = ((op_t *) srcp)[6]; + ((op_t *) dstp)[6] = a1; +do1: + a1 = ((op_t *) srcp)[7]; + ((op_t *) dstp)[7] = a0; + + srcp += 8 * OPSIZ; + dstp += 8 * OPSIZ; + len -= 8; + } while (len != 0); + + /* This is the right position for do0. Please don't move + * it into the loop. + */ +do0: + ((op_t *) dstp)[0] = a1; +} + +static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len) +{ + op_t a0, a1, a2, a3; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + * aligned srcp to make it aligned for copy. + */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' + * it points in the middle of. + */ + srcp &= -OPSIZ; + + switch (len % 4) { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + srcp -= 1 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + srcp -= 0 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + srcp -= -1 * OPSIZ; + dstp -= 1 * OPSIZ; + len += 0; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp -= -2 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do { +do4: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2); +do3: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2); +do2: + a2 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2); +do1: + a3 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2); + + srcp += 4 * OPSIZ; + dstp += 4 * OPSIZ; + len -= 4; + } while (len != 0); + + /* This is the right position for do0. Please don't move + * it into the loop. + */ +do0: + ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2); +} + +#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \ +do { \ + size_t __nbytes = (nbytes); \ + while (__nbytes > 0) { \ + byte __x = ((byte *) src_bp)[0]; \ + src_bp += 1; \ + __nbytes -= 1; \ + ((byte *) dst_bp)[0] = __x; \ + dst_bp += 1; \ + } \ +} while (0) + +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ +do { \ + if (src_bp % OPSIZ == 0) \ + _wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \ + else \ + _wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \ + src_bp += (nbytes) & -OPSIZ; \ + dst_bp += (nbytes) & -OPSIZ; \ + (nbytes_left) = (nbytes) % OPSIZ; \ +} while (0) + +extern void *__memcpy_aligned(void *dest, const void *src, size_t len); +void *__memcpy(void *dest, const void *src, size_t len) +{ + unsigned long dstp = (long) dest; + unsigned long srcp = (long) src; + + /* If there not too few bytes to copy, use word copy. */ + if (len >= OP_T_THRES) { + if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) && + ((srcp & OPSIZ_MASK) == 0)) { + __memcpy_aligned(dest, src, len); + return dest; + } + /* Copy just a few bytes to make DSTP aligned. */ + len -= (-dstp) % OPSIZ; + BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ); + + /* Copy from SRCP to DSTP taking advantage of the known alignment of + * DSTP. Number of bytes remaining is put in the third argument, + * i.e. in LEN. This number may vary from machine to machine. + */ + WORD_COPY_FWD(dstp, srcp, len, len); + /* Fall out and copy the tail. */ + } + + /* There are just a few bytes to copy. Use byte memory operations. */ + BYTE_COPY_FWD(dstp, srcp, len); + + return dest; +} + +void *memcpy(void *dest, const void *src, size_t len) __weak __alias(__memcpy);