LoongArch: Optimize string functions including memcpy, memmove,
memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen. (cherry picked from commit 4ba365320a633ecd4cb47d8f171aa81fcd1dd6ef)
This commit is contained in:
parent
f147708dc8
commit
4adfab66ba
693
1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
Normal file
693
1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
Normal file
@ -0,0 +1,693 @@
|
|||||||
|
From 939b5ed88b61d03bae6d20bf97ad0f77f9b110bb Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:20:26 +0800
|
||||||
|
Subject: [PATCH 1/6] LoongArch: Optimize string functions memcpy, memmove.
|
||||||
|
|
||||||
|
Change-Id: Ib0e78d062082a657d5bf572403f19bf5bfe0a28d
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/memcpy.S | 259 ++++++++++++++++++++
|
||||||
|
sysdeps/loongarch/lp64/memmove.S | 406 +++++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 665 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/memcpy.S
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/memmove.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..5d850123
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/memcpy.S
|
||||||
|
@@ -0,0 +1,259 @@
|
||||||
|
+/* Optimized memcpy implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+/* Symbol name of the routine below; a MEMCPY_NAME defined earlier overrides the default "memcpy". */
|
||||||
|
+#ifndef MEMCPY_NAME
|
||||||
|
+#define MEMCPY_NAME memcpy
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#define LD_64(reg, n) /* Load the 64 bytes at reg+n .. reg+n+63 into t0..t7 (eight ld.d). */ \
|
||||||
|
+ ld.d t0, reg, n; \
|
||||||
|
+ ld.d t1, reg, n+8; \
|
||||||
|
+ ld.d t2, reg, n+16; \
|
||||||
|
+ ld.d t3, reg, n+24; \
|
||||||
|
+ ld.d t4, reg, n+32; \
|
||||||
|
+ ld.d t5, reg, n+40; \
|
||||||
|
+ ld.d t6, reg, n+48; \
|
||||||
|
+ ld.d t7, reg, n+56;
|
||||||
|
+
|
||||||
|
+#define ST_64(reg, n) /* Store t0..t7 to the 64 bytes at reg+n .. reg+n+63 (eight st.d). */ \
|
||||||
|
+ st.d t0, reg, n; \
|
||||||
|
+ st.d t1, reg, n+8; \
|
||||||
|
+ st.d t2, reg, n+16; \
|
||||||
|
+ st.d t3, reg, n+24; \
|
||||||
|
+ st.d t4, reg, n+32; \
|
||||||
|
+ st.d t5, reg, n+40; \
|
||||||
|
+ st.d t6, reg, n+48; \
|
||||||
|
+ st.d t7, reg, n+56;
|
||||||
|
+
|
||||||
|
+LEAF(MEMCPY_NAME) /* Copy a2 bytes from a1 to a0, dispatching on the length; a0 is never modified, so it still holds dst at every "jr ra". */
|
||||||
|
+//1st arg: dst pointer, in a0 ($r4)
|
||||||
|
+//2nd arg: src pointer, in a1 ($r5)
|
||||||
|
+//3rd arg: size_t len, in a2 ($r6)
|
||||||
|
+//t0~t8 and a3~a7 are used as scratch registers
|
||||||
|
+
|
||||||
|
+ add.d a4, a1, a2 /* a4 = src + len (one past the last source byte) */
|
||||||
|
+ add.d a3, a0, a2 /* a3 = dst + len (one past the last destination byte) */
|
||||||
|
+ li.w a6, 16
|
||||||
|
+ bge a6, a2, less_16bytes /* len <= 16 (signed compare) */
|
||||||
|
+ li.w a6, 128
|
||||||
|
+ blt a6, a2, long_bytes /* len > 128 */
|
||||||
|
+ li.w a6, 64
|
||||||
|
+ blt a6, a2, more_64bytes /* 65 <= len <= 128 */
|
||||||
|
+ li.w a6, 32
|
||||||
|
+ blt a6, a2, more_32bytes /* 33 <= len <= 64 */
|
||||||
|
+
|
||||||
|
+ /* 17...32: copy the first 16 and the last 16 bytes; the two halves may overlap. */
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ ld.d t1, a1, 8
|
||||||
|
+ ld.d t2, a4, -16
|
||||||
|
+ ld.d t3, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a0, 8
|
||||||
|
+ st.d t2, a3, -16
|
||||||
|
+ st.d t3, a3, -8
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+more_64bytes:
|
||||||
|
+ srli.d t8, a0, 3
|
||||||
|
+ slli.d t8, t8, 3 /* t8 = dst rounded down to a multiple of 8 */
|
||||||
|
+ addi.d t8, t8, 0x8 /* t8 = first 8-byte boundary strictly above dst */
|
||||||
|
+ sub.d a7, a0, t8 /* a7 = dst - t8, a value in -8..-1 */
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ sub.d a1, a1, a7 /* advance src by the same distance dst advances to reach t8 */
|
||||||
|
+ st.d t0, a0, 0 /* the first 8 bytes cover everything below t8 */
|
||||||
|
+
|
||||||
|
+ add.d a7, a7, a2 /* a7 = bytes from t8 to the end of dst */
|
||||||
|
+ addi.d a7, a7, -0x20 /* keep 32 bytes back for the tail copy after the loop */
|
||||||
|
+loop_32:
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ ld.d t1, a1, 8
|
||||||
|
+ ld.d t2, a1, 16
|
||||||
|
+ ld.d t3, a1, 24
|
||||||
|
+ st.d t0, t8, 0
|
||||||
|
+ st.d t1, t8, 8
|
||||||
|
+ st.d t2, t8, 16
|
||||||
|
+ st.d t3, t8, 24
|
||||||
|
+
|
||||||
|
+ addi.d t8, t8, 0x20
|
||||||
|
+ addi.d a1, a1, 0x20
|
||||||
|
+ addi.d a7, a7, -0x20
|
||||||
|
+ blt zero, a7, loop_32 /* loop while a7 > 0 */
|
||||||
|
+
|
||||||
|
+ ld.d t4, a4, -32 /* last 32 bytes, addressed from the end pointers; may overlap the loop's output */
|
||||||
|
+ ld.d t5, a4, -24
|
||||||
|
+ ld.d t6, a4, -16
|
||||||
|
+ ld.d t7, a4, -8
|
||||||
|
+ st.d t4, a3, -32
|
||||||
|
+ st.d t5, a3, -24
|
||||||
|
+ st.d t6, a3, -16
|
||||||
|
+ st.d t7, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+more_32bytes:
|
||||||
|
+ /* 33...64: copy the first 32 and the last 32 bytes; the two halves may overlap. */
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ ld.d t1, a1, 8
|
||||||
|
+ ld.d t2, a1, 16
|
||||||
|
+ ld.d t3, a1, 24
|
||||||
|
+ ld.d t4, a4, -32
|
||||||
|
+ ld.d t5, a4, -24
|
||||||
|
+ ld.d t6, a4, -16
|
||||||
|
+ ld.d t7, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a0, 8
|
||||||
|
+ st.d t2, a0, 16
|
||||||
|
+ st.d t3, a0, 24
|
||||||
|
+ st.d t4, a3, -32
|
||||||
|
+ st.d t5, a3, -24
|
||||||
|
+ st.d t6, a3, -16
|
||||||
|
+ st.d t7, a3, -8
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_16bytes:
|
||||||
|
+ srai.d a6, a2, 3 /* a6 = len >> 3 */
|
||||||
|
+ beqz a6, less_8bytes
|
||||||
|
+
|
||||||
|
+ /* 8...16: first 8 and last 8 bytes */
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ ld.d t1, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_8bytes:
|
||||||
|
+ srai.d a6, a2, 2 /* a6 = len >> 2 */
|
||||||
|
+ beqz a6, less_4bytes
|
||||||
|
+
|
||||||
|
+ /* 4...7: first 4 and last 4 bytes */
|
||||||
|
+ ld.w t0, a1, 0
|
||||||
|
+ ld.w t1, a4, -4
|
||||||
|
+ st.w t0, a0, 0
|
||||||
|
+ st.w t1, a3, -4
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_4bytes:
|
||||||
|
+ srai.d a6, a2, 1 /* a6 = len >> 1 */
|
||||||
|
+ beqz a6, less_2bytes
|
||||||
|
+
|
||||||
|
+ /* 2...3: first 2 and last 2 bytes */
|
||||||
|
+ ld.h t0, a1, 0
|
||||||
|
+ ld.h t1, a4, -2
|
||||||
|
+ st.h t0, a0, 0
|
||||||
|
+ st.h t1, a3, -2
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_2bytes:
|
||||||
|
+ beqz a2, less_1bytes /* len == 0: nothing to copy */
|
||||||
|
+
|
||||||
|
+ ld.b t0, a1, 0
|
||||||
|
+ st.b t0, a0, 0
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_1bytes:
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+long_bytes:
|
||||||
|
+ srli.d t8, a0, 3
|
||||||
|
+ slli.d t8, t8, 3 /* t8 = dst rounded down to a multiple of 8 */
|
||||||
|
+ beq a0, t8, start /* dst is already 8-byte aligned */
|
||||||
|
+
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ addi.d t8, t8, 0x8 /* t8 = next 8-byte boundary above dst */
|
||||||
|
+ st.d t0, a0, 0 /* the first 8 bytes cover the unaligned head */
|
||||||
|
+ sub.d a7, a0, t8 /* a7 = dst - t8, a value in -7..-1 */
|
||||||
|
+ sub.d a1, a1, a7 /* advance src by the size of the head */
|
||||||
|
+
|
||||||
|
+start:
|
||||||
|
+ addi.d a5, a3, -0x80 /* a5 = dst end - 128 */
|
||||||
|
+ blt a5, t8, align_end_proc /* fewer than 128 bytes remain */
|
||||||
|
+
|
||||||
|
+loop_128:
|
||||||
|
+ LD_64(a1, 0)
|
||||||
|
+ ST_64(t8, 0)
|
||||||
|
+ LD_64(a1, 64)
|
||||||
|
+ addi.d a1, a1, 0x80
|
||||||
|
+ ST_64(t8, 64)
|
||||||
|
+ addi.d t8, t8, 0x80
|
||||||
|
+ bge a5, t8, loop_128 /* another full 128-byte block still fits */
|
||||||
|
+
|
||||||
|
+align_end_proc:
|
||||||
|
+ sub.d a2, a3, t8 /* a2 = bytes still to copy (less than 128) */
|
||||||
|
+
|
||||||
|
+ pcaddi t1, 34 /* t1 = PC + 34*4; counting the instructions below, that is end_0_8_unalign */
|
||||||
|
+ andi t2, a2, 0x78 /* t2 = remaining bytes rounded down to a multiple of 8 */
|
||||||
|
+ sub.d t1, t1, t2 /* each ld.d/st.d pair is 8 bytes of code and copies 8 bytes of data */
|
||||||
|
+ jirl zero, t1, 0 /* jump into the ladder so that exactly t2/8 pairs execute */
|
||||||
|
+
|
||||||
|
+end_120_128_unalign:
|
||||||
|
+ ld.d t0, a1, 112
|
||||||
|
+ st.d t0, t8, 112
|
||||||
|
+end_112_120_unalign:
|
||||||
|
+ ld.d t0, a1, 104
|
||||||
|
+ st.d t0, t8, 104
|
||||||
|
+end_104_112_unalign:
|
||||||
|
+ ld.d t0, a1, 96
|
||||||
|
+ st.d t0, t8, 96
|
||||||
|
+end_96_104_unalign:
|
||||||
|
+ ld.d t0, a1, 88
|
||||||
|
+ st.d t0, t8, 88
|
||||||
|
+end_88_96_unalign:
|
||||||
|
+ ld.d t0, a1, 80
|
||||||
|
+ st.d t0, t8, 80
|
||||||
|
+end_80_88_unalign:
|
||||||
|
+ ld.d t0, a1, 72
|
||||||
|
+ st.d t0, t8, 72
|
||||||
|
+end_72_80_unalign:
|
||||||
|
+ ld.d t0, a1, 64
|
||||||
|
+ st.d t0, t8, 64
|
||||||
|
+end_64_72_unalign:
|
||||||
|
+ ld.d t0, a1, 56
|
||||||
|
+ st.d t0, t8, 56
|
||||||
|
+end_56_64_unalign:
|
||||||
|
+ ld.d t0, a1, 48
|
||||||
|
+ st.d t0, t8, 48
|
||||||
|
+end_48_56_unalign:
|
||||||
|
+ ld.d t0, a1, 40
|
||||||
|
+ st.d t0, t8, 40
|
||||||
|
+end_40_48_unalign:
|
||||||
|
+ ld.d t0, a1, 32
|
||||||
|
+ st.d t0, t8, 32
|
||||||
|
+end_32_40_unalign:
|
||||||
|
+ ld.d t0, a1, 24
|
||||||
|
+ st.d t0, t8, 24
|
||||||
|
+end_24_32_unalign:
|
||||||
|
+ ld.d t0, a1, 16
|
||||||
|
+ st.d t0, t8, 16
|
||||||
|
+end_16_24_unalign:
|
||||||
|
+ ld.d t0, a1, 8
|
||||||
|
+ st.d t0, t8, 8
|
||||||
|
+end_8_16_unalign:
|
||||||
|
+ ld.d t0, a1, 0
|
||||||
|
+ st.d t0, t8, 0
|
||||||
|
+end_0_8_unalign:
|
||||||
|
+ ld.d t0, a4, -8 /* final 8 bytes, addressed from the end pointers; may overlap bytes already copied */
|
||||||
|
+ st.d t0, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+END(MEMCPY_NAME)
|
||||||
|
+libc_hidden_builtin_def (MEMCPY_NAME)
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..edd9cf3d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/memmove.S
|
||||||
|
@@ -0,0 +1,406 @@
|
||||||
|
+/* Optimized memmove implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+/* Symbol name of the routine below; a MEMMOVE_NAME defined earlier overrides the default "memmove". */
|
||||||
|
+#ifndef MEMMOVE_NAME
|
||||||
|
+#define MEMMOVE_NAME memmove
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#define LD_64(reg, n) /* Load the 64 bytes at reg+n .. reg+n+63 into t0..t7 (eight ld.d). */ \
|
||||||
|
+ ld.d t0, reg, n; \
|
||||||
|
+ ld.d t1, reg, n+8; \
|
||||||
|
+ ld.d t2, reg, n+16; \
|
||||||
|
+ ld.d t3, reg, n+24; \
|
||||||
|
+ ld.d t4, reg, n+32; \
|
||||||
|
+ ld.d t5, reg, n+40; \
|
||||||
|
+ ld.d t6, reg, n+48; \
|
||||||
|
+ ld.d t7, reg, n+56;
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+#define ST_64(reg, n) /* Store t0..t7 to the 64 bytes at reg+n .. reg+n+63 (eight st.d). */ \
|
||||||
|
+ st.d t0, reg, n; \
|
||||||
|
+ st.d t1, reg, n+8; \
|
||||||
|
+ st.d t2, reg, n+16; \
|
||||||
|
+ st.d t3, reg, n+24; \
|
||||||
|
+ st.d t4, reg, n+32; \
|
||||||
|
+ st.d t5, reg, n+40; \
|
||||||
|
+ st.d t6, reg, n+48; \
|
||||||
|
+ st.d t7, reg, n+56;
|
||||||
|
+
|
||||||
|
+/* void *memmove (void *dst, const void *src, size_t n): dst in a0, src in a1, n in a2. */
|
||||||
|
+LEAF(MEMMOVE_NAME) /* Sizes below 64 load everything before storing; larger sizes copy forward or backward depending on overlap. */
|
||||||
|
+ add.d a4, a1, a2 /* a4 = src + n */
|
||||||
|
+ add.d a3, a0, a2 /* a3 = dst + n */
|
||||||
|
+ beq a1, a0, less_1bytes /* dst == src: nothing to do */
|
||||||
|
+ move t8, a0 /* keep the original dst; the large-copy paths return it via v0 */
|
||||||
|
+ srai.d a6, a2, 4 #num/16
|
||||||
|
+ beqz a6, less_16bytes #num<16
|
||||||
|
+ srai.d a6, a2, 6 #num/64
|
||||||
|
+ bnez a6, more_64bytes #num>64 /* the branch is taken for num >= 64 */
|
||||||
|
+ srai.d a6, a2, 5
|
||||||
|
+ beqz a6, less_32bytes #num<32
|
||||||
|
+
|
||||||
|
+ ld.d t0, a1, 0 #32<num<64 /* reached for 32 <= num < 64; every load precedes every store, so overlapping buffers are safe */
|
||||||
|
+ ld.d t1, a1, 8
|
||||||
|
+ ld.d t2, a1, 16
|
||||||
|
+ ld.d t3, a1, 24
|
||||||
|
+ ld.d t4, a4, -32
|
||||||
|
+ ld.d t5, a4, -24
|
||||||
|
+ ld.d t6, a4, -16
|
||||||
|
+ ld.d t7, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a0, 8
|
||||||
|
+ st.d t2, a0, 16
|
||||||
|
+ st.d t3, a0, 24
|
||||||
|
+ st.d t4, a3, -32
|
||||||
|
+ st.d t5, a3, -24
|
||||||
|
+ st.d t6, a3, -16
|
||||||
|
+ st.d t7, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_32bytes:
|
||||||
|
+ ld.d t0, a1, 0 /* 16 <= num < 32: first 16 and last 16 bytes, all loads before any store */
|
||||||
|
+ ld.d t1, a1, 8
|
||||||
|
+ ld.d t2, a4, -16
|
||||||
|
+ ld.d t3, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a0, 8
|
||||||
|
+ st.d t2, a3, -16
|
||||||
|
+ st.d t3, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_16bytes:
|
||||||
|
+ srai.d a6, a2, 3 #num/8
|
||||||
|
+ beqz a6, less_8bytes
|
||||||
|
+
|
||||||
|
+ ld.d t0, a1, 0 /* 8 <= num < 16: first 8 and last 8 bytes */
|
||||||
|
+ ld.d t1, a4, -8
|
||||||
|
+ st.d t0, a0, 0
|
||||||
|
+ st.d t1, a3, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_8bytes:
|
||||||
|
+ srai.d a6, a2, 2
|
||||||
|
+ beqz a6, less_4bytes
|
||||||
|
+
|
||||||
|
+ ld.w t0, a1, 0 /* 4 <= num < 8: first 4 and last 4 bytes */
|
||||||
|
+ ld.w t1, a4, -4
|
||||||
|
+ st.w t0, a0, 0
|
||||||
|
+ st.w t1, a3, -4
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_4bytes:
|
||||||
|
+ srai.d a6, a2, 1
|
||||||
|
+ beqz a6, less_2bytes
|
||||||
|
+
|
||||||
|
+ ld.h t0, a1, 0 /* 2 <= num < 4: first 2 and last 2 bytes */
|
||||||
|
+ ld.h t1, a4, -2
|
||||||
|
+ st.h t0, a0, 0
|
||||||
|
+ st.h t1, a3, -2
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_2bytes:
|
||||||
|
+ beqz a2, less_1bytes /* num == 0: nothing to copy */
|
||||||
|
+
|
||||||
|
+ ld.b t0, a1, 0
|
||||||
|
+ st.b t0, a0, 0
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_1bytes:
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+more_64bytes:
|
||||||
|
+ sub.d a7, a0, a1 /* a7 = dst - src (wraps to a huge unsigned value when dst < src) */
|
||||||
|
+ bltu a7, a2, copy_backward /* unsigned (dst - src) < n: dst starts inside the source range, so copy from the end */
|
||||||
|
+
|
||||||
|
+copy_forward:
|
||||||
|
+ srli.d a0, a0, 3
|
||||||
|
+ slli.d a0, a0, 3 /* a0 = dst rounded down to a multiple of 8 */
|
||||||
|
+ beq a0, t8, all_align /* dst is already 8-byte aligned */
|
||||||
|
+ addi.d a0, a0, 0x8 /* a0 = first 8-byte boundary above dst */
|
||||||
|
+ sub.d a7, t8, a0 /* a7 = -(number of head bytes), a value in -7..-1 */
|
||||||
|
+ sub.d a1, a1, a7 /* advance src past the head */
|
||||||
|
+ add.d a2, a7, a2 /* n -= head bytes */
|
||||||
|
+
|
||||||
|
+start_unalign_proc:
|
||||||
|
+ pcaddi t1, 18 /* t1 = PC + 18*4; counting the instructions below, that is start_over */
|
||||||
|
+ slli.d a6, a7, 3 /* 8 bytes of code (one ld.b/st.b pair) per head byte */
|
||||||
|
+ add.d t1, t1, a6 /* a7 is negative, so this steps back one pair per head byte */
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+start_7_unalign:
|
||||||
|
+ ld.b t0, a1, -7
|
||||||
|
+ st.b t0, a0, -7
|
||||||
|
+start_6_unalign:
|
||||||
|
+ ld.b t0, a1, -6
|
||||||
|
+ st.b t0, a0, -6
|
||||||
|
+start_5_unalign:
|
||||||
|
+ ld.b t0, a1, -5
|
||||||
|
+ st.b t0, a0, -5
|
||||||
|
+start_4_unalign:
|
||||||
|
+ ld.b t0, a1, -4
|
||||||
|
+ st.b t0, a0, -4
|
||||||
|
+start_3_unalign:
|
||||||
|
+ ld.b t0, a1, -3
|
||||||
|
+ st.b t0, a0, -3
|
||||||
|
+start_2_unalign:
|
||||||
|
+ ld.b t0, a1, -2
|
||||||
|
+ st.b t0, a0, -2
|
||||||
|
+start_1_unalign:
|
||||||
|
+ ld.b t0, a1, -1
|
||||||
|
+ st.b t0, a0, -1
|
||||||
|
+start_over:
|
||||||
|
+
|
||||||
|
+ addi.d a2, a2, -0x80 /* bias the count by -128 so its sign tells whether a full block remains */
|
||||||
|
+ blt a2, zero, end_unalign_proc
|
||||||
|
+
|
||||||
|
+loop_less:
|
||||||
|
+ LD_64(a1, 0)
|
||||||
|
+ ST_64(a0, 0)
|
||||||
|
+ LD_64(a1, 64)
|
||||||
|
+ ST_64(a0, 64)
|
||||||
|
+
|
||||||
|
+ addi.d a0, a0, 0x80
|
||||||
|
+ addi.d a1, a1, 0x80
|
||||||
|
+ addi.d a2, a2, -0x80
|
||||||
|
+ bge a2, zero, loop_less
|
||||||
|
+
|
||||||
|
+end_unalign_proc:
|
||||||
|
+ addi.d a2, a2, 0x80 /* undo the bias: a2 = bytes still to copy (less than 128) */
|
||||||
|
+
|
||||||
|
+ pcaddi t1, 36 /* t1 = PC + 36*4; counting the instructions below, that is end_0_8_unalign */
|
||||||
|
+ andi t2, a2, 0x78 /* t2 = remaining bytes rounded down to a multiple of 8 */
|
||||||
|
+ add.d a1, a1, t2 /* point both cursors past the 8-byte chunks; the ladder uses negative offsets */
|
||||||
|
+ add.d a0, a0, t2
|
||||||
|
+ sub.d t1, t1, t2 /* one ld.d/st.d pair (8 bytes of code) per 8 bytes of data */
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+end_120_128_unalign:
|
||||||
|
+ ld.d t0, a1, -120
|
||||||
|
+ st.d t0, a0, -120
|
||||||
|
+end_112_120_unalign:
|
||||||
|
+ ld.d t0, a1, -112
|
||||||
|
+ st.d t0, a0, -112
|
||||||
|
+end_104_112_unalign:
|
||||||
|
+ ld.d t0, a1, -104
|
||||||
|
+ st.d t0, a0, -104
|
||||||
|
+end_96_104_unalign:
|
||||||
|
+ ld.d t0, a1, -96
|
||||||
|
+ st.d t0, a0, -96
|
||||||
|
+end_88_96_unalign:
|
||||||
|
+ ld.d t0, a1, -88
|
||||||
|
+ st.d t0, a0, -88
|
||||||
|
+end_80_88_unalign:
|
||||||
|
+ ld.d t0, a1, -80
|
||||||
|
+ st.d t0, a0, -80
|
||||||
|
+end_72_80_unalign:
|
||||||
|
+ ld.d t0, a1, -72
|
||||||
|
+ st.d t0, a0, -72
|
||||||
|
+end_64_72_unalign:
|
||||||
|
+ ld.d t0, a1, -64
|
||||||
|
+ st.d t0, a0, -64
|
||||||
|
+end_56_64_unalign:
|
||||||
|
+ ld.d t0, a1, -56
|
||||||
|
+ st.d t0, a0, -56
|
||||||
|
+end_48_56_unalign:
|
||||||
|
+ ld.d t0, a1, -48
|
||||||
|
+ st.d t0, a0, -48
|
||||||
|
+end_40_48_unalign:
|
||||||
|
+ ld.d t0, a1, -40
|
||||||
|
+ st.d t0, a0, -40
|
||||||
|
+end_32_40_unalign:
|
||||||
|
+ ld.d t0, a1, -32
|
||||||
|
+ st.d t0, a0, -32
|
||||||
|
+end_24_32_unalign:
|
||||||
|
+ ld.d t0, a1, -24
|
||||||
|
+ st.d t0, a0, -24
|
||||||
|
+end_16_24_unalign:
|
||||||
|
+ ld.d t0, a1, -16
|
||||||
|
+ st.d t0, a0, -16
|
||||||
|
+end_8_16_unalign:
|
||||||
|
+ ld.d t0, a1, -8
|
||||||
|
+ st.d t0, a0, -8
|
||||||
|
+end_0_8_unalign:
|
||||||
|
+
|
||||||
|
+ andi a2, a2, 0x7 /* a2 = trailing bytes (0..7) */
|
||||||
|
+ pcaddi t1, 18 /* t1 = PC + 18*4; counting the instructions below, that is the label "end" */
|
||||||
|
+ slli.d a2, a2, 3 /* 8 bytes of code per trailing byte */
|
||||||
|
+ sub.d t1, t1, a2
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+end_7_unalign:
|
||||||
|
+ ld.b t0, a4, -7 /* trailing bytes are addressed from the end pointers a4 (src end) and a3 (dst end) */
|
||||||
|
+ st.b t0, a3, -7
|
||||||
|
+end_6_unalign:
|
||||||
|
+ ld.b t0, a4, -6
|
||||||
|
+ st.b t0, a3, -6
|
||||||
|
+end_5_unalign:
|
||||||
|
+ ld.b t0, a4, -5
|
||||||
|
+ st.b t0, a3, -5
|
||||||
|
+end_4_unalign:
|
||||||
|
+ ld.b t0, a4, -4
|
||||||
|
+ st.b t0, a3, -4
|
||||||
|
+end_3_unalign:
|
||||||
|
+ ld.b t0, a4, -3
|
||||||
|
+ st.b t0, a3, -3
|
||||||
|
+end_2_unalign:
|
||||||
|
+ ld.b t0, a4, -2
|
||||||
|
+ st.b t0, a3, -2
|
||||||
|
+end_1_unalign:
|
||||||
|
+ ld.b t0, a4, -1
|
||||||
|
+ st.b t0, a3, -1
|
||||||
|
+end:
|
||||||
|
+
|
||||||
|
+ move v0, t8 /* return the original dst (a0 was advanced on this path) */
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+all_align:
|
||||||
|
+ addi.d a1, a1, 0x8 /* dst already aligned: copy one doubleword, then join the forward loop */
|
||||||
|
+ addi.d a0, a0, 0x8
|
||||||
|
+ ld.d t0, a1, -8
|
||||||
|
+ st.d t0, a0, -8
|
||||||
|
+ addi.d a2, a2, -8
|
||||||
|
+ b start_over
|
||||||
|
+
|
||||||
|
+all_align_back:
|
||||||
|
+ addi.d a4, a4, -0x8 /* dst end already aligned: copy the last doubleword, then join the backward loop */
|
||||||
|
+ addi.d a3, a3, -0x8
|
||||||
|
+ ld.d t0, a4, 0
|
||||||
|
+ st.d t0, a3, 0
|
||||||
|
+ addi.d a2, a2, -8
|
||||||
|
+ b start_over_back
|
||||||
|
+
|
||||||
|
+copy_backward:
|
||||||
|
+ move a5, a3 /* a5 = dst + n (unaligned end) */
|
||||||
|
+ srli.d a3, a3, 3
|
||||||
|
+ slli.d a3, a3, 3 /* a3 = dst end rounded down to a multiple of 8 */
|
||||||
|
+ beq a3, a5, all_align_back
|
||||||
|
+ sub.d a7, a3, a5 /* a7 = -(number of bytes above the boundary), a value in -7..-1 */
|
||||||
|
+ add.d a4, a4, a7 /* move the src end back by the same amount */
|
||||||
|
+ add.d a2, a7, a2 /* n -= those bytes */
|
||||||
|
+
|
||||||
|
+ pcaddi t1, 18 /* t1 = PC + 18*4; counting the instructions below, that is start_over_back */
|
||||||
|
+ slli.d a6, a7, 3 /* 8 bytes of code (one ld.b/st.b pair) per byte */
|
||||||
|
+ add.d t1, t1, a6
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+ ld.b t0, a4, 6 /* byte ladder, highest address first */
|
||||||
|
+ st.b t0, a3, 6
|
||||||
|
+ ld.b t0, a4, 5
|
||||||
|
+ st.b t0, a3, 5
|
||||||
|
+ ld.b t0, a4, 4
|
||||||
|
+ st.b t0, a3, 4
|
||||||
|
+ ld.b t0, a4, 3
|
||||||
|
+ st.b t0, a3, 3
|
||||||
|
+ ld.b t0, a4, 2
|
||||||
|
+ st.b t0, a3, 2
|
||||||
|
+ ld.b t0, a4, 1
|
||||||
|
+ st.b t0, a3, 1
|
||||||
|
+ ld.b t0, a4, 0
|
||||||
|
+ st.b t0, a3, 0
|
||||||
|
+start_over_back:
|
||||||
|
+
|
||||||
|
+ addi.d a2, a2, -0x80 /* bias the count by -128 so its sign tells whether a full block remains */
|
||||||
|
+ blt a2, zero, end_unalign_proc_back
|
||||||
|
+
|
||||||
|
+loop_less_back:
|
||||||
|
+ LD_64(a4, -64)
|
||||||
|
+ ST_64(a3, -64)
|
||||||
|
+ LD_64(a4, -128)
|
||||||
|
+ ST_64(a3, -128)
|
||||||
|
+
|
||||||
|
+ addi.d a4, a4, -0x80
|
||||||
|
+ addi.d a3, a3, -0x80
|
||||||
|
+ addi.d a2, a2, -0x80
|
||||||
|
+ bge a2, zero, loop_less_back
|
||||||
|
+
|
||||||
|
+end_unalign_proc_back:
|
||||||
|
+ addi.d a2, a2, 0x80 /* undo the bias: a2 = bytes still to copy (less than 128) */
|
||||||
|
+
|
||||||
|
+ pcaddi t1, 36 /* t1 = PC + 36*4; counting the instructions below, that is the "andi a2, a2, 0x7" after the ladder */
|
||||||
|
+ andi t2, a2, 0x78 /* t2 = remaining bytes rounded down to a multiple of 8 */
|
||||||
|
+ sub.d a4, a4, t2 /* point both cursors below the 8-byte chunks; the ladder uses positive offsets */
|
||||||
|
+ sub.d a3, a3, t2
|
||||||
|
+ sub.d t1, t1, t2 /* one ld.d/st.d pair (8 bytes of code) per 8 bytes of data */
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+ ld.d t0, a4, 112
|
||||||
|
+ st.d t0, a3, 112
|
||||||
|
+ ld.d t0, a4, 104
|
||||||
|
+ st.d t0, a3, 104
|
||||||
|
+ ld.d t0, a4, 96
|
||||||
|
+ st.d t0, a3, 96
|
||||||
|
+ ld.d t0, a4, 88
|
||||||
|
+ st.d t0, a3, 88
|
||||||
|
+ ld.d t0, a4, 80
|
||||||
|
+ st.d t0, a3, 80
|
||||||
|
+ ld.d t0, a4, 72
|
||||||
|
+ st.d t0, a3, 72
|
||||||
|
+ ld.d t0, a4, 64
|
||||||
|
+ st.d t0, a3, 64
|
||||||
|
+ ld.d t0, a4, 56
|
||||||
|
+ st.d t0, a3, 56
|
||||||
|
+ ld.d t0, a4, 48
|
||||||
|
+ st.d t0, a3, 48
|
||||||
|
+ ld.d t0, a4, 40
|
||||||
|
+ st.d t0, a3, 40
|
||||||
|
+ ld.d t0, a4, 32
|
||||||
|
+ st.d t0, a3, 32
|
||||||
|
+ ld.d t0, a4, 24
|
||||||
|
+ st.d t0, a3, 24
|
||||||
|
+ ld.d t0, a4, 16
|
||||||
|
+ st.d t0, a3, 16
|
||||||
|
+ ld.d t0, a4, 8
|
||||||
|
+ st.d t0, a3, 8
|
||||||
|
+ ld.d t0, a4, 0
|
||||||
|
+ st.d t0, a3, 0
|
||||||
|
+
|
||||||
|
+ andi a2, a2, 0x7 /* a2 = leading bytes still uncopied (0..7) */
|
||||||
|
+ pcaddi t1, 18 /* t1 = PC + 18*4; counting the instructions below, that is the final "move v0, t8" */
|
||||||
|
+ slli.d a2, a2, 3 /* 8 bytes of code per byte */
|
||||||
|
+ sub.d t1, t1, a2
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+ ld.b t0, a1, 6 /* a1 and a0 are still the original src and dst on this path */
|
||||||
|
+ st.b t0, a0, 6
|
||||||
|
+ ld.b t0, a1, 5
|
||||||
|
+ st.b t0, a0, 5
|
||||||
|
+ ld.b t0, a1, 4
|
||||||
|
+ st.b t0, a0, 4
|
||||||
|
+ ld.b t0, a1, 3
|
||||||
|
+ st.b t0, a0, 3
|
||||||
|
+ ld.b t0, a1, 2
|
||||||
|
+ st.b t0, a0, 2
|
||||||
|
+ ld.b t0, a1, 1
|
||||||
|
+ st.b t0, a0, 1
|
||||||
|
+ ld.b t0, a1, 0
|
||||||
|
+ st.b t0, a0, 0
|
||||||
|
+
|
||||||
|
+ move v0, t8 /* return the original dst */
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+END(MEMMOVE_NAME)
|
||||||
|
+libc_hidden_builtin_def (MEMMOVE_NAME)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
250
2_6-LoongArch-Optimize-string-functions-strchr-strchrnul.patch
Normal file
250
2_6-LoongArch-Optimize-string-functions-strchr-strchrnul.patch
Normal file
@ -0,0 +1,250 @@
|
|||||||
|
From dd99689b821162293506e0344f163b82349a9298 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:22:01 +0800
|
||||||
|
Subject: [PATCH 2/6] LoongArch: Optimize string functions strchr, strchrnul.
|
||||||
|
|
||||||
|
Change-Id: I8b274972642b6a1926d8fc176404bfd83344bc51
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/strchr.S | 107 +++++++++++++++++++++++++++
|
||||||
|
sysdeps/loongarch/lp64/strchrnul.S | 115 +++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 222 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strchr.S
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..3d64c684
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strchr.S
|
||||||
|
@@ -0,0 +1,107 @@
|
||||||
|
+/* Optimized strchr implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+/* char * strchr (const char *s1, int c); */
|
||||||
|
+LEAF(strchr) /* s in a0, c in a1. Scans 8 bytes at a time for c or NUL; returns a pointer to the first c, or 0 when NUL comes first. */
|
||||||
|
+ .align 6
|
||||||
|
+
|
||||||
|
+ li.w t4, 0x7
|
||||||
|
+ lu12i.w a2, 0x01010 /* a2 = 0x01010000 */
|
||||||
|
+ bstrins.d a1, a1, 15, 8 /* copy the low byte of c into bits 15:8 */
|
||||||
|
+ andi t0, a0, 0x7 /* t0 = s & 7, the misalignment in bytes */
|
||||||
|
+
|
||||||
|
+ ori a2, a2, 0x101 /* a2 = 0x01010101 */
|
||||||
|
+ andn t4, a0, t4 /* t4 = s & ~7 */
|
||||||
|
+ slli.w t1, t0, 3 /* t1 = misalignment in bits */
|
||||||
|
+
|
||||||
|
+ ld.d t4, t4, 0 /* aligned doubleword that contains s[0] */
|
||||||
|
+
|
||||||
|
+ nor t8, zero, zero /* t8 = all ones */
|
||||||
|
+ bstrins.d a1, a1, 31, 16 /* copy bits 15:0 into bits 31:16 */
|
||||||
|
+ srl.d t4, t4, t1 /* shift out the bytes that precede s */
|
||||||
|
+
|
||||||
|
+ bstrins.d a1, a1, 63, 32 /* a1 now holds the byte c in all eight byte lanes */
|
||||||
|
+ bstrins.d a2, a2, 63, 32 /* a2 = 0x0101010101010101 */
|
||||||
|
+ srl.d a7, t8, t1 /* a7 = mask of the byte lanes that hold real string bytes */
|
||||||
|
+
|
||||||
|
+ li.w t1, 8
|
||||||
|
+ nor t8, a7, zero /* t8 = mask of the high lanes that were shifted in as zero */
|
||||||
|
+ slli.d a3, a2, 7 /* a3 = 0x8080808080808080 */
|
||||||
|
+ or t5, t8, t4 /* force the shifted-in lanes to 0xff so they are not mistaken for NUL */
|
||||||
|
+ and t3, a7, a1 /* c pattern restricted to the valid lanes */
|
||||||
|
+
|
||||||
|
+ sub.w t1, t1, t0 /* t1 = 8 - misalignment: distance from s to the next aligned doubleword */
|
||||||
|
+ nor a3, a3, zero /* a3 = 0x7f7f7f7f7f7f7f7f */
|
||||||
|
+ xor t2, t5, t3 /* lanes equal to c become zero */
|
||||||
|
+ sub.d a7, t5, a2
|
||||||
|
+ nor a6, t5, a3 /* a6 = ~t5 & 0x80...80 */
|
||||||
|
+
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+
|
||||||
|
+ and a6, a7, a6 /* a6 = (x - 0x01..01) & ~x & 0x80..80: nonzero iff some lane of x is NUL */
|
||||||
|
+ and a5, a5, a4 /* a5: same test on t2, nonzero iff some lane equals c */
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ bnez a7, L(_mc8_a) /* hit in the first (partial) doubleword; a0 is still s */
|
||||||
|
+
|
||||||
|
+ add.d a0, a0, t1 /* advance a0 to the next 8-byte boundary */
|
||||||
|
+L(_aloop):
|
||||||
|
+ ld.d t4, a0, 0
|
||||||
|
+
|
||||||
|
+ xor t2, t4, a1
|
||||||
|
+ sub.d a7, t4, a2
|
||||||
|
+ nor a6, t4, a3
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+ and a6, a7, a6 /* a6: NUL marker for this doubleword */
|
||||||
|
+ and a5, a5, a4 /* a5: c marker for this doubleword */
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ bnez a7, L(_mc8_a)
|
||||||
|
+
|
||||||
|
+ ld.d t4, a0, 8 /* second doubleword of the unrolled iteration */
|
||||||
|
+ addi.d a0, a0, 16
|
||||||
|
+ xor t2, t4, a1
|
||||||
|
+ sub.d a7, t4, a2
|
||||||
|
+ nor a6, t4, a3
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+ and a6, a7, a6
|
||||||
|
+ and a5, a5, a4
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ beqz a7, L(_aloop)
|
||||||
|
+
|
||||||
|
+ addi.d a0, a0, -8 /* the hit was in the second doubleword: point a0 back at it */
|
||||||
|
+L(_mc8_a):
|
||||||
|
+
|
||||||
|
+ ctz.d t0, a5 /* bit position of the lowest c marker (expected to be 64 when a5 is zero) */
|
||||||
|
+ ctz.d t2, a6 /* bit position of the lowest NUL marker */
|
||||||
|
+
|
||||||
|
+ srli.w t0, t0, 3 /* t0 = byte index of the first c */
|
||||||
|
+ srli.w t2, t2, 3 /* t2 = byte index of the first NUL */
|
||||||
|
+ sltu t1, t2, t0 /* t1 = 1 when NUL comes strictly before c, i.e. c was not found */
|
||||||
|
+ add.d v0, a0, t0
|
||||||
|
+ masknez v0, v0, t1 /* v0 = t1 ? 0 : v0 */
|
||||||
|
+ jr ra
|
||||||
|
+END(strchr)
|
||||||
|
+
|
||||||
|
+libc_hidden_builtin_def (strchr)
|
||||||
|
+weak_alias (strchr, index)
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..58b8b372
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strchrnul.S
|
||||||
|
@@ -0,0 +1,115 @@
|
||||||
|
+/* Optimized strchrnul implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+#define MOVZ(rd,rs,rt) /* rd = (rt == 0) ? rs : rd; uses t6 as scratch. */ \
|
||||||
|
+ masknez t6, rs, rt;\
|
||||||
|
+ maskeqz rd, rd, rt;\
|
||||||
|
+ or rd, rd, t6
|
||||||
|
+
|
||||||
|
+/* char *strchrnul(const char *s, int c); */
|
||||||
|
+LEAF(__strchrnul) /* s in a0, c in a1. Scans 8 bytes at a time; returns a pointer to the first c or to the terminating NUL, whichever comes first. */
|
||||||
|
+ .align 6
|
||||||
|
+
|
||||||
|
+ li.w t4, 0x7
|
||||||
|
+ lu12i.w a2, 0x01010 /* a2 = 0x01010000 */
|
||||||
|
+ bstrins.d a1, a1, 15, 8 /* copy the low byte of c into bits 15:8 */
|
||||||
|
+ andi t0, a0, 0x7 /* t0 = s & 7, the misalignment in bytes */
|
||||||
|
+
|
||||||
|
+ ori a2, a2, 0x101 /* a2 = 0x01010101 */
|
||||||
|
+ andn t4, a0, t4 /* t4 = s & ~7 */
|
||||||
|
+ slli.w t1, t0, 3 /* t1 = misalignment in bits */
|
||||||
|
+ ld.d t4, t4, 0 /* aligned doubleword that contains s[0] */
|
||||||
|
+
|
||||||
|
+ nor t8, zero, zero /* t8 = all ones */
|
||||||
|
+ bstrins.d a1, a1, 31, 16 /* copy bits 15:0 into bits 31:16 */
|
||||||
|
+ srl.d t4, t4, t1 /* shift out the bytes that precede s */
|
||||||
|
+
|
||||||
|
+ preld 0, a0, 32 /* prefetch hint for the data at s + 32 */
|
||||||
|
+ bstrins.d a1, a1, 63, 32 /* a1 now holds the byte c in all eight byte lanes */
|
||||||
|
+ bstrins.d a2, a2, 63, 32 /* a2 = 0x0101010101010101 */
|
||||||
|
+ srl.d a7, t8, t1 /* a7 = mask of the byte lanes that hold real string bytes */
|
||||||
|
+
|
||||||
|
+ nor t8, a7, zero /* t8 = mask of the high lanes that were shifted in as zero */
|
||||||
|
+ slli.d a3, a2, 7 /* a3 = 0x8080808080808080 */
|
||||||
|
+ or t5, t8, t4 /* force the shifted-in lanes to 0xff so they are not mistaken for NUL */
|
||||||
|
+ and t3, a7, a1 /* c pattern restricted to the valid lanes */
|
||||||
|
+
|
||||||
|
+ nor a3, a3, zero /* a3 = 0x7f7f7f7f7f7f7f7f */
|
||||||
|
+ xor t2, t5, t3 /* lanes equal to c become zero */
|
||||||
|
+ sub.d a7, t5, a2
|
||||||
|
+ nor a6, t5, a3 /* a6 = ~t5 & 0x80...80 */
|
||||||
|
+
|
||||||
|
+ li.w t1, 8
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+
|
||||||
|
+ and a6, a7, a6 /* a6 = (x - 0x01..01) & ~x & 0x80..80: nonzero iff some lane of x is NUL */
|
||||||
|
+ and a5, a5, a4 /* a5: same test on t2, nonzero iff some lane equals c */
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ bnez a7, L(_mc8_a) /* hit in the first (partial) doubleword; a0 is still s */
|
||||||
|
+
|
||||||
|
+ sub.w t1, t1, t0 /* t1 = 8 - misalignment */
|
||||||
|
+ add.d a0, a0, t1 /* advance a0 to the next 8-byte boundary */
|
||||||
|
+L(_aloop):
|
||||||
|
+ ld.d t4, a0, 0
|
||||||
|
+
|
||||||
|
+ xor t2, t4, a1
|
||||||
|
+ sub.d a7, t4, a2
|
||||||
|
+ nor a6, t4, a3
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+ and a6, a7, a6 /* a6: NUL marker for this doubleword */
|
||||||
|
+ and a5, a5, a4 /* a5: c marker for this doubleword */
|
||||||
|
+
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ bnez a7, L(_mc8_a)
|
||||||
|
+
|
||||||
|
+ ld.d t4, a0, 8 /* second doubleword of the unrolled iteration */
|
||||||
|
+ addi.d a0, a0, 16
|
||||||
|
+
|
||||||
|
+ xor t2, t4, a1
|
||||||
|
+ sub.d a7, t4, a2
|
||||||
|
+ nor a6, t4, a3
|
||||||
|
+ sub.d a5, t2, a2
|
||||||
|
+
|
||||||
|
+ nor a4, t2, a3
|
||||||
|
+ and a6, a7, a6
|
||||||
|
+ and a5, a5, a4
|
||||||
|
+
|
||||||
|
+ or a7, a6, a5
|
||||||
|
+ beqz a7, L(_aloop)
|
||||||
|
+
|
||||||
|
+ addi.d a0, a0, -8 /* the hit was in the second doubleword: point a0 back at it */
|
||||||
|
+L(_mc8_a):
|
||||||
|
+ ctz.d t0, a5 /* bit position of the lowest c marker (expected to be 64 when a5 is zero) */
|
||||||
|
+ ctz.d t2, a6 /* bit position of the lowest NUL marker */
|
||||||
|
+
|
||||||
|
+ srli.w t0, t0, 3 /* t0 = byte index of the first c */
|
||||||
|
+ srli.w t2, t2, 3 /* t2 = byte index of the first NUL */
|
||||||
|
+ slt t1, t0, t2 /* t1 = 1 when c comes before NUL */
|
||||||
|
+
|
||||||
|
+ MOVZ(t0,t2,t1) /* t0 = t1 ? t0 : t2, i.e. the smaller of the two indices */
|
||||||
|
+
|
||||||
|
+ add.d v0, a0, t0
|
||||||
|
+ jr ra
|
||||||
|
+END(__strchrnul)
|
||||||
|
+
|
||||||
|
+weak_alias(__strchrnul, strchrnul)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
190
3_6-LoongArch-Optimize-string-function-memset.patch
Normal file
190
3_6-LoongArch-Optimize-string-function-memset.patch
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
From 603aa93569ec4034aa1d5a310f59504b5d6aad4d Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:23:06 +0800
|
||||||
|
Subject: [PATCH 3/6] LoongArch: Optimize string function memset.
|
||||||
|
|
||||||
|
Change-Id: I04906c31a2eabd380b19bb3a4cab603128526cd1
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/memset.S | 170 ++++++++++++++++++++++++++++++++
|
||||||
|
1 file changed, 170 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/memset.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..261504b1
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/memset.S
|
||||||
|
@@ -0,0 +1,170 @@
|
||||||
|
+/* Optimized memset implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+#define ST_128(n) \
|
||||||
|
+ st.d a1, a0, n; \
|
||||||
|
+ st.d a1, a0, n+8 ; \
|
||||||
|
+ st.d a1, a0, n+16 ; \
|
||||||
|
+ st.d a1, a0, n+24 ; \
|
||||||
|
+ st.d a1, a0, n+32 ; \
|
||||||
|
+ st.d a1, a0, n+40 ; \
|
||||||
|
+ st.d a1, a0, n+48 ; \
|
||||||
|
+ st.d a1, a0, n+56 ; \
|
||||||
|
+ st.d a1, a0, n+64 ; \
|
||||||
|
+ st.d a1, a0, n+72 ; \
|
||||||
|
+ st.d a1, a0, n+80 ; \
|
||||||
|
+ st.d a1, a0, n+88 ; \
|
||||||
|
+ st.d a1, a0, n+96 ; \
|
||||||
|
+ st.d a1, a0, n+104; \
|
||||||
|
+ st.d a1, a0, n+112; \
|
||||||
|
+ st.d a1, a0, n+120; \
|
||||||
|
+
|
||||||
|
+/* void *memset(void *s, int c, size_t n); */
|
||||||
|
+LEAF(memset)
|
||||||
|
+ .align 6
|
||||||
|
+
|
||||||
|
+ bstrins.d a1, a1, 15, 8
|
||||||
|
+ add.d t7, a0, a2
|
||||||
|
+ bstrins.d a1, a1, 31, 16
|
||||||
|
+ move t0, a0
|
||||||
|
+ bstrins.d a1, a1, 63, 32
|
||||||
|
+ srai.d t8, a2, 4 #num/16
|
||||||
|
+ beqz t8, less_16bytes #num<16
|
||||||
|
+ srai.d t8, a2, 6 #num/64
|
||||||
|
+ bnez t8, more_64bytes #num>64
|
||||||
|
+ srai.d t8, a2, 5 #num/32
|
||||||
|
+ beqz t8, less_32bytes #num<32
|
||||||
|
+ st.d a1, a0, 0 #32<num<64
|
||||||
|
+ st.d a1, a0, 8
|
||||||
|
+ st.d a1, a0, 16
|
||||||
|
+ st.d a1, a0, 24
|
||||||
|
+ st.d a1, t7, -32
|
||||||
|
+ st.d a1, t7, -24
|
||||||
|
+ st.d a1, t7, -16
|
||||||
|
+ st.d a1, t7, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_32bytes:
|
||||||
|
+ st.d a1, a0, 0
|
||||||
|
+ st.d a1, a0, 8
|
||||||
|
+ st.d a1, t7, -16
|
||||||
|
+ st.d a1, t7, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_16bytes:
|
||||||
|
+ srai.d t8, a2, 3 #num/8
|
||||||
|
+ beqz t8, less_8bytes
|
||||||
|
+ st.d a1, a0, 0
|
||||||
|
+ st.d a1, t7, -8
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_8bytes:
|
||||||
|
+ srai.d t8, a2, 2
|
||||||
|
+ beqz t8, less_4bytes
|
||||||
|
+ st.w a1, a0, 0
|
||||||
|
+ st.w a1, t7, -4
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_4bytes:
|
||||||
|
+ srai.d t8, a2, 1
|
||||||
|
+ beqz t8, less_2bytes
|
||||||
|
+ st.h a1, a0, 0
|
||||||
|
+ st.h a1, t7, -2
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_2bytes:
|
||||||
|
+ beqz a2, less_1bytes
|
||||||
|
+ st.b a1, a0, 0
|
||||||
|
+
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+less_1bytes:
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+more_64bytes:
|
||||||
|
+ srli.d a0, a0, 3
|
||||||
|
+ slli.d a0, a0, 3
|
||||||
|
+ addi.d a0, a0, 0x8
|
||||||
|
+ st.d a1, t0, 0
|
||||||
|
+ sub.d t2, t0, a0
|
||||||
|
+ add.d a2, t2, a2
|
||||||
|
+
|
||||||
|
+ addi.d a2, a2, -0x80
|
||||||
|
+ blt a2, zero, end_unalign_proc
|
||||||
|
+
|
||||||
|
+loop_less:
|
||||||
|
+ ST_128(0)
|
||||||
|
+ addi.d a0, a0, 0x80
|
||||||
|
+ addi.d a2, a2, -0x80
|
||||||
|
+ bge a2, zero, loop_less
|
||||||
|
+
|
||||||
|
+end_unalign_proc:
|
||||||
|
+ addi.d a2, a2, 0x80
|
||||||
|
+
|
||||||
|
+ pcaddi t1, 20
|
||||||
|
+ andi t5, a2, 0x78
|
||||||
|
+ srli.d t5, t5, 1
|
||||||
|
+ sub.d t1, t1, t5
|
||||||
|
+ jirl zero, t1, 0
|
||||||
|
+
|
||||||
|
+end_120_128_unalign:
|
||||||
|
+ st.d a1, a0, 112
|
||||||
|
+end_112_120_unalign:
|
||||||
|
+ st.d a1, a0, 104
|
||||||
|
+end_104_112_unalign:
|
||||||
|
+ st.d a1, a0, 96
|
||||||
|
+end_96_104_unalign:
|
||||||
|
+ st.d a1, a0, 88
|
||||||
|
+end_88_96_unalign:
|
||||||
|
+ st.d a1, a0, 80
|
||||||
|
+end_80_88_unalign:
|
||||||
|
+ st.d a1, a0, 72
|
||||||
|
+end_72_80_unalign:
|
||||||
|
+ st.d a1, a0, 64
|
||||||
|
+end_64_72_unalign:
|
||||||
|
+ st.d a1, a0, 56
|
||||||
|
+end_56_64_unalign:
|
||||||
|
+ st.d a1, a0, 48
|
||||||
|
+end_48_56_unalign:
|
||||||
|
+ st.d a1, a0, 40
|
||||||
|
+end_40_48_unalign:
|
||||||
|
+ st.d a1, a0, 32
|
||||||
|
+end_32_40_unalign:
|
||||||
|
+ st.d a1, a0, 24
|
||||||
|
+end_24_32_unalign:
|
||||||
|
+ st.d a1, a0, 16
|
||||||
|
+end_16_24_unalign:
|
||||||
|
+ st.d a1, a0, 8
|
||||||
|
+end_8_16_unalign:
|
||||||
|
+ st.d a1, a0, 0
|
||||||
|
+end_0_8_unalign:
|
||||||
|
+ st.d a1, t7, -8
|
||||||
|
+
|
||||||
|
+ move v0, t0
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+END(memset)
|
||||||
|
+
|
||||||
|
+libc_hidden_builtin_def (memset)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
414
4_6-LoongArch-Optimize-string-functions-strcmp-strncmp.patch
Normal file
414
4_6-LoongArch-Optimize-string-functions-strcmp-strncmp.patch
Normal file
@ -0,0 +1,414 @@
|
|||||||
|
From 3f3b70e39a529369e4b2936f35034215a45436a3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:23:50 +0800
|
||||||
|
Subject: [PATCH 4/6] LoongArch: Optimize string functions strcmp, strncmp.
|
||||||
|
|
||||||
|
Change-Id: I436138a312e8ebb668223cafef84fd74dcde72fd
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/strcmp.S | 161 ++++++++++++++++++++++
|
||||||
|
sysdeps/loongarch/lp64/strncmp.S | 225 +++++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 386 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strcmp.S
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strncmp.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..0f7a6d55
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strcmp.S
|
||||||
|
@@ -0,0 +1,161 @@
|
||||||
|
+/* Optimized strcmp implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sys/asm.h>
|
||||||
|
+
|
||||||
|
+/* Parameters and Results */
|
||||||
|
+#define src1 a0
|
||||||
|
+#define src2 a1
|
||||||
|
+#define result v0
|
||||||
|
+
|
||||||
|
+/* Internal variable */
|
||||||
|
+#define src1_off a2
|
||||||
|
+#define src2_off a3
|
||||||
|
+#define data1 t0
|
||||||
|
+#define data2 t1
|
||||||
|
+#define has_nul t2
|
||||||
|
+#define diff t3
|
||||||
|
+#define syndrome t4
|
||||||
|
+#define zeroones t5
|
||||||
|
+#define sevenf t6
|
||||||
|
+#define pos t7
|
||||||
|
+#define exchange t8
|
||||||
|
+#define tmp1 a4
|
||||||
|
+#define tmp2 a5
|
||||||
|
+#define tmp3 a6
|
||||||
|
+#define tmp4 a7
|
||||||
|
+
|
||||||
|
+/* rd <- if rc then ra else rb
|
||||||
|
+ tmp3 will be destroyed */
|
||||||
|
+#define CONDITIONSEL(rd, rc, ra, rb)\
|
||||||
|
+ masknez tmp3, rb, rc;\
|
||||||
|
+ maskeqz rd, ra, rc;\
|
||||||
|
+ or rd, rd, tmp3
|
||||||
|
+
|
||||||
|
+LEAF(strcmp)
|
||||||
|
+ .align 4
|
||||||
|
+
|
||||||
|
+ xor tmp1, src1, src2
|
||||||
|
+ lu12i.w zeroones, 0x01010
|
||||||
|
+ lu12i.w sevenf, 0x7f7f7
|
||||||
|
+ andi src1_off, src1, 0x7
|
||||||
|
+ ori zeroones, zeroones, 0x101
|
||||||
|
+ ori sevenf, sevenf, 0xf7f
|
||||||
|
+ andi tmp1, tmp1, 0x7
|
||||||
|
+ bstrins.d zeroones, zeroones, 63, 32
|
||||||
|
+ bstrins.d sevenf, sevenf, 63, 32
|
||||||
|
+ bnez tmp1, strcmp_misaligned8
|
||||||
|
+ bnez src1_off, strcmp_mutual_align
|
||||||
|
+strcmp_loop_aligned:
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+strcmp_start_realigned:
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ xor diff, data1, data2
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ or syndrome, diff, has_nul
|
||||||
|
+ beqz syndrome, strcmp_loop_aligned
|
||||||
|
+
|
||||||
|
+strcmp_end:
|
||||||
|
+ ctz.d pos, syndrome
|
||||||
|
+ bstrins.d pos, zero, 2, 0
|
||||||
|
+ srl.d data1, data1, pos
|
||||||
|
+ srl.d data2, data2, pos
|
||||||
|
+ andi data1, data1, 0xff
|
||||||
|
+ andi data2, data2, 0xff
|
||||||
|
+ sub.d result, data1, data2
|
||||||
|
+ jr ra
|
||||||
|
+strcmp_mutual_align:
|
||||||
|
+ bstrins.d src1, zero, 2, 0
|
||||||
|
+ bstrins.d src2, zero, 2, 0
|
||||||
|
+ slli.d tmp1, src1_off, 0x3
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ sub.d tmp1, zero, tmp1
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+ nor tmp2, zero, zero
|
||||||
|
+ srl.d tmp2, tmp2, tmp1
|
||||||
|
+ or data1, data1, tmp2
|
||||||
|
+ or data2, data2, tmp2
|
||||||
|
+ b strcmp_start_realigned
|
||||||
|
+
|
||||||
|
+strcmp_misaligned8:
|
||||||
|
+ /* check
|
||||||
|
+ if ((src1_off != 0) && ((src2_off == 0) || (src1_off < src2_off)))
|
||||||
|
+ then exchange(src1,src2). */
|
||||||
|
+ andi src2_off, src2, 0x7
|
||||||
|
+ slt tmp2, src1_off, src2_off
|
||||||
|
+ CONDITIONSEL(tmp2, src2_off, tmp2, tmp1)
|
||||||
|
+ maskeqz exchange, tmp2, src1_off
|
||||||
|
+ xor tmp3, src1, src2
|
||||||
|
+ maskeqz tmp3, tmp3, exchange
|
||||||
|
+ xor src1, src1, tmp3
|
||||||
|
+ xor src2, src2, tmp3
|
||||||
|
+
|
||||||
|
+ andi src1_off, src1, 0x7
|
||||||
|
+ beqz src1_off, strcmp_loop_misaligned
|
||||||
|
+strcmp_do_misaligned:
|
||||||
|
+ ld.bu data1, src1, 0
|
||||||
|
+ ld.bu data2, src2, 0
|
||||||
|
+ xor tmp3, data1, data2
|
||||||
|
+ addi.d src1, src1, 1
|
||||||
|
+ masknez tmp3, data1, tmp3
|
||||||
|
+ addi.d src2, src2, 1
|
||||||
|
+ beqz tmp3, strcmp_done
|
||||||
|
+ andi src1_off, src1, 0x7
|
||||||
|
+ bnez src1_off, strcmp_do_misaligned
|
||||||
|
+
|
||||||
|
+strcmp_loop_misaligned:
|
||||||
|
+ andi tmp1, src2, 0xff8
|
||||||
|
+ xori tmp1, tmp1, 0xff8
|
||||||
|
+ beqz tmp1, strcmp_do_misaligned
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ xor diff, data1, data2
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ or syndrome, diff, has_nul
|
||||||
|
+ beqz syndrome, strcmp_loop_misaligned
|
||||||
|
+strcmp_misalign_end:
|
||||||
|
+ ctz.d pos, syndrome
|
||||||
|
+ bstrins.d pos, zero, 2, 0
|
||||||
|
+ srl.d data1, data1, pos
|
||||||
|
+ srl.d data2, data2, pos
|
||||||
|
+ andi data1, data1, 0xff
|
||||||
|
+ andi data2, data2, 0xff
|
||||||
|
+ sub.d tmp1, data1, data2
|
||||||
|
+ sub.d tmp2, data2, data1
|
||||||
|
+ CONDITIONSEL(result, exchange, tmp2, tmp1)
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+strcmp_done:
|
||||||
|
+ sub.d tmp1, data1, data2
|
||||||
|
+ sub.d tmp2, data2, data1
|
||||||
|
+ CONDITIONSEL(result, exchange, tmp2, tmp1)
|
||||||
|
+ jr ra
|
||||||
|
+END(strcmp)
|
||||||
|
+
|
||||||
|
+libc_hidden_builtin_def (strcmp)
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..979ea40a
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strncmp.S
|
||||||
|
@@ -0,0 +1,225 @@
|
||||||
|
+/* Optimized strncmp implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sys/asm.h>
|
||||||
|
+
|
||||||
|
+/* Parameters and Results */
|
||||||
|
+#define src1 a0
|
||||||
|
+#define src2 a1
|
||||||
|
+#define limit a2
|
||||||
|
+#define result v0
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+/* Internal variable */
|
||||||
|
+#define data1 t0
|
||||||
|
+#define data2 t1
|
||||||
|
+#define has_nul t2
|
||||||
|
+#define diff t3
|
||||||
|
+#define syndrome t4
|
||||||
|
+#define zeroones t5
|
||||||
|
+#define sevenf t6
|
||||||
|
+#define pos t7
|
||||||
|
+#define exchange t8
|
||||||
|
+#define tmp1 a5
|
||||||
|
+#define tmp2 a6
|
||||||
|
+#define tmp3 a7
|
||||||
|
+#define src1_off a3
|
||||||
|
+#define limit_wd a4
|
||||||
|
+
|
||||||
|
+LEAF(strncmp)
|
||||||
|
+ .align 4
|
||||||
|
+ beqz limit, strncmp_ret0
|
||||||
|
+
|
||||||
|
+ xor tmp1, src1, src2
|
||||||
|
+ lu12i.w zeroones, 0x01010
|
||||||
|
+ lu12i.w sevenf, 0x7f7f7
|
||||||
|
+ andi src1_off, src1, 0x7
|
||||||
|
+ ori zeroones, zeroones, 0x101
|
||||||
|
+ andi tmp1, tmp1, 0x7
|
||||||
|
+ ori sevenf, sevenf, 0xf7f
|
||||||
|
+ bstrins.d zeroones, zeroones, 63, 32
|
||||||
|
+ bstrins.d sevenf, sevenf, 63, 32
|
||||||
|
+ bnez tmp1, strncmp_misaligned8
|
||||||
|
+ bnez src1_off, strncmp_mutual_align
|
||||||
|
+ addi.d limit_wd, limit, -1
|
||||||
|
+ srli.d limit_wd, limit_wd, 3
|
||||||
|
+
|
||||||
|
+strncmp_loop_aligned:
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+strncmp_start_realigned:
|
||||||
|
+ addi.d limit_wd, limit_wd, -1
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ xor diff, data1, data2
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ srli.d tmp1, limit_wd, 63
|
||||||
|
+ or syndrome, diff, has_nul
|
||||||
|
+ or tmp2, syndrome, tmp1
|
||||||
|
+ beqz tmp2, strncmp_loop_aligned
|
||||||
|
+
|
||||||
|
+ /* if not reach limit */
|
||||||
|
+ bge limit_wd, zero, strncmp_not_limit
|
||||||
|
+ /* if reach limit */
|
||||||
|
+ andi limit, limit, 0x7
|
||||||
|
+ li.w tmp1, 0x8
|
||||||
|
+ sub.d limit, tmp1, limit
|
||||||
|
+ slli.d limit, limit, 0x3
|
||||||
|
+ li.d tmp1, -1
|
||||||
|
+ srl.d tmp1, tmp1, limit
|
||||||
|
+ and data1, data1, tmp1
|
||||||
|
+ and data2, data2, tmp1
|
||||||
|
+ orn syndrome, syndrome, tmp1
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+strncmp_not_limit:
|
||||||
|
+ ctz.d pos, syndrome
|
||||||
|
+ bstrins.d pos, zero, 2, 0
|
||||||
|
+ srl.d data1, data1, pos
|
||||||
|
+ srl.d data2, data2, pos
|
||||||
|
+ andi data1, data1, 0xff
|
||||||
|
+ andi data2, data2, 0xff
|
||||||
|
+ sub.d result, data1, data2
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+strncmp_mutual_align:
|
||||||
|
+ bstrins.d src1, zero, 2, 0
|
||||||
|
+ bstrins.d src2, zero, 2, 0
|
||||||
|
+ slli.d tmp1, src1_off, 0x3
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+
|
||||||
|
+ addi.d limit_wd, limit, -1
|
||||||
|
+ andi tmp3, limit_wd, 0x7
|
||||||
|
+ srli.d limit_wd, limit_wd, 3
|
||||||
|
+ add.d limit, limit, src1_off
|
||||||
|
+ add.d tmp3, tmp3, src1_off
|
||||||
|
+ srli.d tmp3, tmp3, 3
|
||||||
|
+ add.d limit_wd, limit_wd, tmp3
|
||||||
|
+
|
||||||
|
+ sub.d tmp1, zero, tmp1
|
||||||
|
+ nor tmp2, zero, zero
|
||||||
|
+ srl.d tmp2, tmp2, tmp1
|
||||||
|
+ or data1, data1, tmp2
|
||||||
|
+ or data2, data2, tmp2
|
||||||
|
+ b strncmp_start_realigned
|
||||||
|
+
|
||||||
|
+strncmp_misaligned8:
|
||||||
|
+
|
||||||
|
+ li.w tmp1, 0x10
|
||||||
|
+ bge limit, tmp1, strncmp_try_words
|
||||||
|
+strncmp_byte_loop:
|
||||||
|
+ ld.bu data1, src1, 0
|
||||||
|
+ ld.bu data2, src2, 0
|
||||||
|
+ addi.d limit, limit, -1
|
||||||
|
+ xor tmp1, data1, data2
|
||||||
|
+ masknez tmp1, data1, tmp1
|
||||||
|
+ maskeqz tmp1, limit, tmp1
|
||||||
|
+ beqz tmp1, strncmp_done
|
||||||
|
+
|
||||||
|
+ ld.bu data1, src1, 1
|
||||||
|
+ ld.bu data2, src2, 1
|
||||||
|
+ addi.d src1, src1, 2
|
||||||
|
+ addi.d src2, src2, 2
|
||||||
|
+ addi.d limit, limit, -1
|
||||||
|
+ xor tmp1, data1, data2
|
||||||
|
+ masknez tmp1, data1, tmp1
|
||||||
|
+ maskeqz tmp1, limit, tmp1
|
||||||
|
+ bnez tmp1, strncmp_byte_loop
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+strncmp_done:
|
||||||
|
+ sub.d result, data1, data2
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+strncmp_try_words:
|
||||||
|
+ srli.d limit_wd, limit, 3
|
||||||
|
+ beqz src1_off, strncmp_do_misaligned
|
||||||
|
+
|
||||||
|
+ sub.d src1_off, zero, src1_off
|
||||||
|
+ andi src1_off, src1_off, 0x7
|
||||||
|
+ sub.d limit, limit, src1_off
|
||||||
|
+ srli.d limit_wd, limit, 0x3
|
||||||
|
+
|
||||||
|
+strncmp_page_end_loop:
|
||||||
|
+ ld.bu data1, src1, 0
|
||||||
|
+ ld.bu data2, src2, 0
|
||||||
|
+ addi.d src1, src1, 1
|
||||||
|
+ addi.d src2, src2, 1
|
||||||
|
+ xor tmp1, data1, data2
|
||||||
|
+ masknez tmp1, data1, tmp1
|
||||||
|
+ beqz tmp1, strncmp_done
|
||||||
|
+ andi tmp1, src1, 0x7
|
||||||
|
+ bnez tmp1, strncmp_page_end_loop
|
||||||
|
+strncmp_do_misaligned:
|
||||||
|
+ li.w src1_off, 0x8
|
||||||
|
+ addi.d limit_wd, limit_wd, -1
|
||||||
|
+ blt limit_wd, zero, strncmp_done_loop
|
||||||
|
+
|
||||||
|
+strncmp_loop_misaligned:
|
||||||
|
+ andi tmp2, src2, 0xff8
|
||||||
|
+ xori tmp2, tmp2, 0xff8
|
||||||
|
+ beqz tmp2, strncmp_page_end_loop
|
||||||
|
+
|
||||||
|
+ ld.d data1, src1, 0
|
||||||
|
+ ld.d data2, src2, 0
|
||||||
|
+ addi.d src1, src1, 8
|
||||||
|
+ addi.d src2, src2, 8
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ xor diff, data1, data2
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ or syndrome, diff, has_nul
|
||||||
|
+ bnez syndrome, strncmp_not_limit
|
||||||
|
+ addi.d limit_wd, limit_wd, -1
|
||||||
|
+ bge limit_wd, zero, strncmp_loop_misaligned
|
||||||
|
+
|
||||||
|
+strncmp_done_loop:
|
||||||
|
+ andi limit, limit, 0x7
|
||||||
|
+ beqz limit, strncmp_not_limit
|
||||||
|
+ /* Read the last double word
|
||||||
|
+ check if the final part is about to exceed the page */
|
||||||
|
+ andi tmp1, src2, 0x7
|
||||||
|
+ andi tmp2, src2, 0xff8
|
||||||
|
+ add.d tmp1, tmp1, limit
|
||||||
|
+ xori tmp2, tmp2, 0xff8
|
||||||
|
+ andi tmp1, tmp1, 0x8
|
||||||
|
+ masknez tmp1, tmp1, tmp2
|
||||||
|
+ bnez tmp1, strncmp_byte_loop
|
||||||
|
+ addi.d src1, src1, -8
|
||||||
|
+ addi.d src2, src2, -8
|
||||||
|
+ ldx.d data1, src1, limit
|
||||||
|
+ ldx.d data2, src2, limit
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ xor diff, data1, data2
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ or syndrome, diff, has_nul
|
||||||
|
+ bnez syndrome, strncmp_not_limit
|
||||||
|
+
|
||||||
|
+strncmp_ret0:
|
||||||
|
+ move result, zero
|
||||||
|
+ jr ra
|
||||||
|
+END(strncmp)
|
||||||
|
+libc_hidden_builtin_def (strncmp)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
195
5_6-LoongArch-Optimize-string-function-strcpy.patch
Normal file
195
5_6-LoongArch-Optimize-string-function-strcpy.patch
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:25:18 +0800
|
||||||
|
Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy.
|
||||||
|
|
||||||
|
Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++
|
||||||
|
1 file changed, 175 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strcpy.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..03d9d361
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strcpy.S
|
||||||
|
@@ -0,0 +1,175 @@
|
||||||
|
+/* Optimized strcpy implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sys/asm.h>
|
||||||
|
+
|
||||||
|
+/* Parameters and Results */
|
||||||
|
+#define dest a0
|
||||||
|
+#define src a1
|
||||||
|
+#define result v0
|
||||||
|
+
|
||||||
|
+/* Internal variable */
|
||||||
|
+#define data t0
|
||||||
|
+#define data1 t1
|
||||||
|
+#define has_nul t2
|
||||||
|
+#define diff t3
|
||||||
|
+#define syndrome t4
|
||||||
|
+#define zeroones t5
|
||||||
|
+#define sevenf t6
|
||||||
|
+#define pos t7
|
||||||
|
+#define dest_backup t8
|
||||||
|
+#define tmp1 a4
|
||||||
|
+#define tmp2 a5
|
||||||
|
+#define tmp3 a6
|
||||||
|
+#define dest_off a2
|
||||||
|
+#define src_off a3
|
||||||
|
+#define tmp4 a7
|
||||||
|
+
|
||||||
|
+/* rd <- if rc then ra else rb
|
||||||
|
+ tmp3 will be destroyed. */
|
||||||
|
+#define CONDITIONSEL(rd, rc, ra, rb)\
|
||||||
|
+ masknez tmp3, rb, rc;\
|
||||||
|
+ maskeqz rd, ra, rc;\
|
||||||
|
+ or rd, rd, tmp3
|
||||||
|
+
|
||||||
|
+/* char *strcpy (char *dest, const char *src); */
|
||||||
|
+LEAF(strcpy)
|
||||||
|
+ .align 4
|
||||||
|
+
|
||||||
|
+ move dest_backup, dest
|
||||||
|
+ lu12i.w zeroones, 0x01010
|
||||||
|
+ lu12i.w sevenf, 0x7f7f7
|
||||||
|
+ ori zeroones, zeroones, 0x101
|
||||||
|
+ ori sevenf, sevenf, 0xf7f
|
||||||
|
+ bstrins.d zeroones, zeroones, 63, 32
|
||||||
|
+ bstrins.d sevenf, sevenf, 63, 32
|
||||||
|
+ andi src_off, src, 0x7
|
||||||
|
+ beqz src_off, strcpy_loop_aligned_1
|
||||||
|
+ b strcpy_mutual_align
|
||||||
|
+strcpy_loop_aligned:
|
||||||
|
+ st.d data, dest, 0
|
||||||
|
+ addi.d dest, dest, 8
|
||||||
|
+strcpy_loop_aligned_1:
|
||||||
|
+ ld.d data, src, 0
|
||||||
|
+ addi.d src, src, 8
|
||||||
|
+strcpy_start_realigned:
|
||||||
|
+ sub.d tmp1, data, zeroones
|
||||||
|
+ or tmp2, data, sevenf
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ beqz has_nul, strcpy_loop_aligned
|
||||||
|
+
|
||||||
|
+strcpy_end:
|
||||||
|
+
|
||||||
|
+ /* 8 4 2 1 */
|
||||||
|
+ ctz.d pos, has_nul
|
||||||
|
+ srli.d pos, pos, 3
|
||||||
|
+ addi.d pos, pos, 1
|
||||||
|
+ /* Do 8/4/2/1 strcpy based on pos value.
|
||||||
|
+ pos value is the number of bytes to be copied
|
||||||
|
+ the bytes include the final \0 so the max length is 8 and the min length is 1. */
|
||||||
|
+strcpy_end_8:
|
||||||
|
+ andi tmp1, pos, 0x8
|
||||||
|
+ beqz tmp1, strcpy_end_4
|
||||||
|
+ st.d data, dest, 0
|
||||||
|
+ move dest, dest_backup
|
||||||
|
+ jr ra
|
||||||
|
+strcpy_end_4:
|
||||||
|
+ andi tmp1, pos, 0x4
|
||||||
|
+ beqz tmp1, strcpy_end_2
|
||||||
|
+ st.w data, dest, 0
|
||||||
|
+ srli.d data, data, 32
|
||||||
|
+ addi.d dest, dest, 4
|
||||||
|
+strcpy_end_2:
|
||||||
|
+ andi tmp1, pos, 0x2
|
||||||
|
+ beqz tmp1, strcpy_end_1
|
||||||
|
+ st.h data, dest, 0
|
||||||
|
+ srli.d data, data, 16
|
||||||
|
+ addi.d dest, dest, 2
|
||||||
|
+strcpy_end_1:
|
||||||
|
+ andi tmp1, pos, 0x1
|
||||||
|
+ beqz tmp1, strcpy_end_ret
|
||||||
|
+ st.b data, dest, 0
|
||||||
|
+strcpy_end_ret:
|
||||||
|
+ move result, dest_backup
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+strcpy_mutual_align:
|
||||||
|
+ /* Check if around src page bound.
|
||||||
|
+ if not go to page cross ok.
|
||||||
|
+ if it is, do further check.
|
||||||
|
+ use tmp2 to accelerate. */
|
||||||
|
+
|
||||||
|
+ li.w tmp2, 0xff8
|
||||||
|
+ andi tmp1, src, 0xff8
|
||||||
|
+ beq tmp1, tmp2, strcpy_page_cross
|
||||||
|
+
|
||||||
|
+strcpy_page_cross_ok:
|
||||||
|
+ /* Load a misaligned double word and check if has \0
|
||||||
|
+ If no, do a misaligned double word paste.
|
||||||
|
+ If yes, calculate the number of available bytes,
|
||||||
|
+ then jump to 4/2/1 end. */
|
||||||
|
+ ld.d data, src, 0
|
||||||
|
+ sub.d tmp1, data, zeroones
|
||||||
|
+ or tmp2, data, sevenf
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ bnez has_nul, strcpy_end
|
||||||
|
+strcpy_mutual_align_finish:
|
||||||
|
+ /* Before jump back to align loop, make dest/src aligned.
|
||||||
|
+ This will cause a duplicated paste for several bytes between the first double word and the second double word,
|
||||||
|
+ but should not bring a problem. */
|
||||||
|
+ li.w tmp1, 8
|
||||||
|
+ st.d data, dest, 0
|
||||||
|
+ sub.d tmp1, tmp1, src_off
|
||||||
|
+ add.d src, src, tmp1
|
||||||
|
+ add.d dest, dest, tmp1
|
||||||
|
+
|
||||||
|
+ b strcpy_loop_aligned_1
|
||||||
|
+
|
||||||
|
+strcpy_page_cross:
|
||||||
|
+ /*
|
||||||
|
+ ld.d from aligned address(src & ~0x7).
|
||||||
|
+ check if high bytes have \0.
|
||||||
|
+ if not, go back to page cross ok,
|
||||||
|
+ since the string is supposed to cross the page bound in such situation.
|
||||||
|
+ if it is, do a srl for data to make it seem like a direct double word from src,
|
||||||
|
+ then go to 4/2/1 strcpy end.
|
||||||
|
+
|
||||||
|
+ tmp4 is 0xffff...ffff mask
|
||||||
|
+ tmp2 demonstrate the bytes to be masked
|
||||||
|
+ tmp2 = src_off << 3
|
||||||
|
+ data = data >> (src_off * 8) | -1 << (64 - src_off * 8)
|
||||||
|
+ and
|
||||||
|
+ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) */
|
||||||
|
+
|
||||||
|
+ li.w tmp1, 0x7
|
||||||
|
+ andn tmp3, src, tmp1
|
||||||
|
+ ld.d data, tmp3, 0
|
||||||
|
+ li.w tmp4, -1
|
||||||
|
+ slli.d tmp2, src_off, 3
|
||||||
|
+ srl.d tmp4, tmp4, tmp2
|
||||||
|
+ srl.d data, data, tmp2
|
||||||
|
+ nor tmp4, tmp4, zero
|
||||||
|
+ or data, data, tmp4
|
||||||
|
+ sub.d tmp1, data, zeroones
|
||||||
|
+ or tmp2, data, sevenf
|
||||||
|
+ andn has_nul, tmp1, tmp2
|
||||||
|
+ beqz has_nul, strcpy_page_cross_ok
|
||||||
|
+ b strcpy_end
|
||||||
|
+END(strcpy)
|
||||||
|
+libc_hidden_builtin_def (strcpy)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
255
6_6-LoongArch-Optimize-string-functions-strlen-strnlen.patch
Normal file
255
6_6-LoongArch-Optimize-string-functions-strlen-strnlen.patch
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
From 86290dbec63a9688ab0e0085ab8ab686fa256f18 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Xue Liu <liuxue@loongson.cn>
|
||||||
|
Date: Sun, 29 Jan 2023 10:25:47 +0800
|
||||||
|
Subject: [PATCH 6/6] LoongArch: Optimize string functions strlen, strnlen.
|
||||||
|
|
||||||
|
Change-Id: I5df3398f9dbd9ea72c3de14e1e5f7793f6dbd794
|
||||||
|
---
|
||||||
|
sysdeps/loongarch/lp64/strlen.S | 102 +++++++++++++++++++++++++
|
||||||
|
sysdeps/loongarch/lp64/strnlen.S | 125 +++++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 227 insertions(+)
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strlen.S
|
||||||
|
create mode 100644 sysdeps/loongarch/lp64/strnlen.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..3569598c
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strlen.S
|
||||||
|
@@ -0,0 +1,102 @@
|
||||||
|
+/* Optimized strlen implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+#include <sys/asm.h>
|
||||||
|
+
|
||||||
|
+/* size_t strlen (const char *s1); */
|
||||||
|
+LEAF(strlen)
|
||||||
|
+ .align 5
|
||||||
|
+
|
||||||
|
+ nor t4, zero, zero
|
||||||
|
+ lu12i.w a2, 0x01010
|
||||||
|
+ andi t5, a0, 0x7
|
||||||
|
+
|
||||||
|
+ li.w t7, 0x7
|
||||||
|
+ slli.d t6, t5, 0x3
|
||||||
|
+ andn t7, a0, t7
|
||||||
|
+ ld.d a1, t7, 0
|
||||||
|
+ sub.d t7, zero, t6
|
||||||
|
+ sll.d t4, t4, t7
|
||||||
|
+ maskeqz t4, t4, t6
|
||||||
|
+ srl.d a1, a1, t6
|
||||||
|
+ or a1, a1, t4
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+ ori a2, a2, 0x101
|
||||||
|
+ nor t1, a1, zero
|
||||||
|
+ li.w a4, 8
|
||||||
|
+
|
||||||
|
+ bstrins.d a2, a2, 63, 32
|
||||||
|
+ sub.d a5, a4, t5
|
||||||
|
+ move t5, a0
|
||||||
|
+
|
||||||
|
+ sub.d t0, a1, a2
|
||||||
|
+ slli.d t4, a2, 7
|
||||||
|
+ nor a3, zero, t4
|
||||||
|
+ nor t1, a1, a3
|
||||||
|
+
|
||||||
|
+ and t0, t0, t1
|
||||||
|
+ bnez t0, strlen_count1
|
||||||
|
+ add.d a0, a0, a5
|
||||||
|
+strlen_loop:
|
||||||
|
+ ld.d a1, a0, 0
|
||||||
|
+ sub.d t0, a1, a2
|
||||||
|
+ and t1, t0, t4
|
||||||
|
+ bnez t1, strlen_count_pre
|
||||||
|
+ ld.d a1, a0, 8
|
||||||
|
+ sub.d t0, a1, a2
|
||||||
|
+ and t1, t0, t4
|
||||||
|
+ addi.d a0, a0, 16
|
||||||
|
+ beqz t1, strlen_loop
|
||||||
|
+strlen_count:
|
||||||
|
+ addi.d a0, a0, -8
|
||||||
|
+strlen_count_pre:
|
||||||
|
+ nor t1, a1, a3
|
||||||
|
+ and t0, t0, t1
|
||||||
|
+ beqz t0, strlen_noascii_start
|
||||||
|
+strlen_count1:
|
||||||
|
+ ctz.d t1, t0
|
||||||
|
+ sub.d v0, a0, t5
|
||||||
|
+ srli.w t1, t1, 3
|
||||||
|
+ add.d v0, v0, t1
|
||||||
|
+ jr ra
|
||||||
|
+strlen_noascii_start:
|
||||||
|
+ addi.d a0, a0, 8
|
||||||
|
+strlen_loop_noascii:
|
||||||
|
+ ld.d a1, a0, 0
|
||||||
|
+ sub.d t0, a1, a2
|
||||||
|
+ nor t1, a1, a3
|
||||||
|
+ and t0, t0, t1
|
||||||
|
+ bnez t0, strlen_count1
|
||||||
|
+ ld.d a1, a0, 8
|
||||||
|
+ sub.d t0, a1, a2
|
||||||
|
+ nor t1, a1, a3
|
||||||
|
+ and t0, t0, t1
|
||||||
|
+ addi.d a0, a0, 16
|
||||||
|
+ beqz t0, strlen_loop_noascii
|
||||||
|
+ addi.d a0, a0, -8
|
||||||
|
+ ctz.d t1, t0
|
||||||
|
+ sub.d v0, a0, t5
|
||||||
|
+ srli.w t1, t1, 3
|
||||||
|
+ add.d v0, v0, t1
|
||||||
|
+ jr ra
|
||||||
|
+END(strlen)
|
||||||
|
+
|
||||||
|
+libc_hidden_builtin_def (strlen)
|
||||||
|
+
|
||||||
|
diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..8eaa60e2
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/loongarch/lp64/strnlen.S
|
||||||
|
@@ -0,0 +1,125 @@
|
||||||
|
+/* Optimized strnlen implementation for LoongArch.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sys/asm.h>
|
||||||
|
+
|
||||||
|
+/* rd <- if rc then ra else rb
|
||||||
|
+ a5 will be destroyed. */
|
||||||
|
+#define CONDITIONSEL(rd, ra, rb, rc)\
|
||||||
|
+ masknez a5, rb, rc;\
|
||||||
|
+ maskeqz rd, ra, rc;\
|
||||||
|
+ or rd, rd, a5
|
||||||
|
+
|
||||||
|
+/* Parameters and Results */
|
||||||
|
+#define srcin a0
|
||||||
|
+#define limit a1
|
||||||
|
+#define len v0
|
||||||
|
+
|
||||||
|
+/* Internal variable */
|
||||||
|
+#define data1 t0
|
||||||
|
+#define data2 t1
|
||||||
|
+#define has_nul1 t2
|
||||||
|
+#define has_nul2 t3
|
||||||
|
+#define src t4
|
||||||
|
+#define zeroones t5
|
||||||
|
+#define sevenf t6
|
||||||
|
+#define data2a t7
|
||||||
|
+#define tmp6 t7
|
||||||
|
+#define pos t8
|
||||||
|
+#define tmp1 a2
|
||||||
|
+#define tmp2 a3
|
||||||
|
+#define tmp3 a4
|
||||||
|
+#define tmp4 a5
|
||||||
|
+#define tmp5 a6
|
||||||
|
+#define limit_wd a7
|
||||||
|
+
|
||||||
|
+/* size_t strnlen (const char *s1, size_t maxlen); */
|
||||||
|
+LEAF(__strnlen)
|
||||||
|
+ .align 4
|
||||||
|
+ beqz limit, _hit_limit
|
||||||
|
+ lu12i.w zeroones, 0x01010
|
||||||
|
+ lu12i.w sevenf, 0x7f7f7
|
||||||
|
+ ori zeroones, zeroones, 0x101
|
||||||
|
+ ori sevenf, sevenf, 0xf7f
|
||||||
|
+ bstrins.d zeroones, zeroones, 63, 32
|
||||||
|
+ bstrins.d sevenf, sevenf, 63, 32
|
||||||
|
+ andi tmp1, srcin, 15
|
||||||
|
+ sub.d src, srcin, tmp1
|
||||||
|
+ bnez tmp1, misaligned
|
||||||
|
+ addi.d limit_wd, limit, -1
|
||||||
|
+ srli.d limit_wd, limit_wd, 4
|
||||||
|
+_loop:
|
||||||
|
+ ld.d data1, src, 0
|
||||||
|
+ ld.d data2, src, 8
|
||||||
|
+ addi.d src, src, 16
|
||||||
|
+_realigned:
|
||||||
|
+ sub.d tmp1, data1, zeroones
|
||||||
|
+ or tmp2, data1, sevenf
|
||||||
|
+ sub.d tmp3, data2, zeroones
|
||||||
|
+ or tmp4, data2, sevenf
|
||||||
|
+ andn has_nul1, tmp1, tmp2
|
||||||
|
+ andn has_nul2, tmp3, tmp4
|
||||||
|
+ addi.d limit_wd, limit_wd, -1
|
||||||
|
+ srli.d tmp1, limit_wd, 63
|
||||||
|
+ or tmp2, has_nul1, has_nul2
|
||||||
|
+ or tmp3, tmp1, tmp2
|
||||||
|
+ beqz tmp3, _loop
|
||||||
|
+ beqz tmp2, _hit_limit
|
||||||
|
+ sub.d len, src, srcin
|
||||||
|
+ beqz has_nul1, _nul_in_data2
|
||||||
|
+ move has_nul2, has_nul1
|
||||||
|
+ addi.d len, len, -8
|
||||||
|
+_nul_in_data2:
|
||||||
|
+ ctz.d pos, has_nul2
|
||||||
|
+ srli.d pos, pos, 3
|
||||||
|
+ addi.d len, len, -8
|
||||||
|
+ add.d len, len, pos
|
||||||
|
+ sltu tmp1, len, limit
|
||||||
|
+ CONDITIONSEL(len, len, limit, tmp1)
|
||||||
|
+ jr ra
|
||||||
|
+
|
||||||
|
+misaligned:
|
||||||
|
+ addi.d limit_wd, limit, -1
|
||||||
|
+ sub.d tmp4, zero, tmp1
|
||||||
|
+ andi tmp3, limit_wd, 15
|
||||||
|
+ srli.d limit_wd, limit_wd, 4
|
||||||
|
+ li.d tmp5, -1
|
||||||
|
+ ld.d data1, src, 0
|
||||||
|
+ ld.d data2, src, 8
|
||||||
|
+ addi.d src, src, 16
|
||||||
|
+ slli.d tmp4, tmp4, 3
|
||||||
|
+ add.d tmp3, tmp3, tmp1
|
||||||
|
+ srl.d tmp2, tmp5, tmp4
|
||||||
|
+ srli.d tmp3, tmp3, 4
|
||||||
|
+ add.d limit_wd, limit_wd, tmp3
|
||||||
|
+ or data1, data1, tmp2
|
||||||
|
+ or data2a, data2, tmp2
|
||||||
|
+ li.w tmp3, 9
|
||||||
|
+ sltu tmp1, tmp1, tmp3
|
||||||
|
+ CONDITIONSEL(data1, data1, tmp5, tmp1)
|
||||||
|
+ CONDITIONSEL(data2, data2, data2a, tmp1)
|
||||||
|
+ b _realigned
|
||||||
|
+
|
||||||
|
+_hit_limit:
|
||||||
|
+ move len, limit
|
||||||
|
+ jr ra
|
||||||
|
+END(__strnlen)
|
||||||
|
+
|
||||||
|
+weak_alias (__strnlen, strnlen)
|
||||||
|
+libc_hidden_def (strnlen)
|
||||||
|
+libc_hidden_def (__strnlen)
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
12
glibc.spec
12
glibc.spec
@@ -66,7 +66,7 @@
|
|||||||
##############################################################################
|
##############################################################################
|
||||||
Name: glibc
|
Name: glibc
|
||||||
Version: 2.34
|
Version: 2.34
|
||||||
Release: 105
|
Release: 106
|
||||||
Summary: The GNU libc libraries
|
Summary: The GNU libc libraries
|
||||||
License: %{all_license}
|
License: %{all_license}
|
||||||
URL: http://www.gnu.org/software/glibc/
|
URL: http://www.gnu.org/software/glibc/
|
||||||
@@ -272,6 +272,12 @@ Patch9021: x86-use-total-l3cache-for-non_temporal_threshold.patch
|
|||||||
Patch9022: login-Add-back-libutil-as-an-empty-library.patch
|
Patch9022: login-Add-back-libutil-as-an-empty-library.patch
|
||||||
Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch
|
Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch
|
||||||
Patch9024: LoongArch-Port.patch
|
Patch9024: LoongArch-Port.patch
|
||||||
|
Patch9025: 1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
|
||||||
|
Patch9026: 2_6-LoongArch-Optimize-string-functions-strchr-strchrnul.patch
|
||||||
|
Patch9027: 3_6-LoongArch-Optimize-string-function-memset.patch
|
||||||
|
Patch9028: 4_6-LoongArch-Optimize-string-functions-strcmp-strncmp.patch
|
||||||
|
Patch9029: 5_6-LoongArch-Optimize-string-function-strcpy.patch
|
||||||
|
Patch9030: 6_6-LoongArch-Optimize-string-functions-strlen-strnlen.patch
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
|
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
|
||||||
@@ -1438,6 +1444,10 @@ fi
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Sun Jan 29 2023 Xue Liu <liuxue@loongson.cn> - 2.34-106
|
||||||
|
- LoongArch: Optimize some string functions including memcpy, memmove,
|
||||||
|
memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.
|
||||||
|
|
||||||
* Wed Dec 21 2022 wanghongliang <wanghongliang@loongson.cn> - 2.34-105
|
* Wed Dec 21 2022 wanghongliang <wanghongliang@loongson.cn> - 2.34-105
|
||||||
- LoongArch Port
|
- LoongArch Port
|
||||||
- Add login-Add-back-libutil-as-an-empty-library.patch from upstream
|
- Add login-Add-back-libutil-as-an-empty-library.patch from upstream
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user