glibc/3_6-LoongArch-Optimize-string-function-memset.patch
Xue Liu 4adfab66ba LoongArch: Optimize string functions including memcpy, memmove,
memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.

(cherry picked from commit 4ba365320a633ecd4cb47d8f171aa81fcd1dd6ef)
2023-01-29 14:28:25 +08:00

191 lines
4.0 KiB
Diff

From 603aa93569ec4034aa1d5a310f59504b5d6aad4d Mon Sep 17 00:00:00 2001
From: Xue Liu <liuxue@loongson.cn>
Date: Sun, 29 Jan 2023 10:23:06 +0800
Subject: [PATCH 3/6] LoongArch: Optimize string function memset.
Change-Id: I04906c31a2eabd380b19bb3a4cab603128526cd1
---
sysdeps/loongarch/lp64/memset.S | 170 ++++++++++++++++++++++++++++++++
1 file changed, 170 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/memset.S
diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
new file mode 100644
index 00000000..261504b1
--- /dev/null
+++ b/sysdeps/loongarch/lp64/memset.S
@@ -0,0 +1,170 @@
+/* Optimized memset implementation for LoongArch.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ST_128(n) /* Emit 16 consecutive st.d of a1 at a0+n .. a0+n+120, i.e. 128 bytes. */ \
+ st.d a1, a0, n; \
+ st.d a1, a0, n+8 ; \
+ st.d a1, a0, n+16 ; \
+ st.d a1, a0, n+24 ; \
+ st.d a1, a0, n+32 ; \
+ st.d a1, a0, n+40 ; \
+ st.d a1, a0, n+48 ; \
+ st.d a1, a0, n+56 ; \
+ st.d a1, a0, n+64 ; \
+ st.d a1, a0, n+72 ; \
+ st.d a1, a0, n+80 ; \
+ st.d a1, a0, n+88 ; \
+ st.d a1, a0, n+96 ; \
+ st.d a1, a0, n+104; \
+ st.d a1, a0, n+112; \
+ st.d a1, a0, n+120; \
+ /* The backslash on the previous line continues the macro onto this line, which ends the definition. */
+/* void *memset(void *s, int c, size_t n);  In: a0 = s, a1 = c, a2 = n.  Sets every byte of [s, s + n) to the low byte of c.  Paths for n < 64 never modify a0; the n >= 64 path advances a0 and copies the saved s back through v0 before returning.  Scratch registers: t0, t1, t2, t5, t7, t8. */
+LEAF(memset)
+ .align 6 # NOTE(review): this follows the entry-point macro above, so any padding presumably lands after the function label - confirm in sysdep.h
+
+ bstrins.d a1, a1, 15, 8 # a1[15:8] = a1[7:0]: fill byte now occupies the low 16 bits
+ add.d t7, a0, a2 # t7 = s + n, one past the last byte to set
+ bstrins.d a1, a1, 31, 16 # a1[31:16] = a1[15:0]
+ move t0, a0 # t0 = original s, kept because a0 is advanced on the n >= 64 path
+ bstrins.d a1, a1, 63, 32 # a1[63:32] = a1[31:0]: fill byte now occupies all 8 bytes
+ srai.d t8, a2, 4 # t8 = n / 16
+ beqz t8, less_16bytes # n < 16
+ srai.d t8, a2, 6 # t8 = n / 64
+ bnez t8, more_64bytes # n >= 64
+ srai.d t8, a2, 5 # t8 = n / 32
+ beqz t8, less_32bytes # 16 <= n < 32
+ st.d a1, a0, 0 # 32 <= n < 64: set the first 32 bytes ...
+ st.d a1, a0, 8
+ st.d a1, a0, 16
+ st.d a1, a0, 24
+ st.d a1, t7, -32 # ... and the last 32 bytes, addressed back from the end (the two runs may overlap)
+ st.d a1, t7, -24
+ st.d a1, t7, -16
+ st.d a1, t7, -8
+
+ jr ra
+
+less_32bytes: # 16 <= n < 32: first 16 bytes plus last 16 bytes (may overlap)
+ st.d a1, a0, 0
+ st.d a1, a0, 8
+ st.d a1, t7, -16
+ st.d a1, t7, -8
+
+ jr ra
+
+less_16bytes: # n < 16
+ srai.d t8, a2, 3 # t8 = n / 8
+ beqz t8, less_8bytes # n < 8
+ st.d a1, a0, 0 # 8 <= n < 16: first 8 bytes
+ st.d a1, t7, -8 # last 8 bytes (may overlap the first store)
+
+ jr ra
+
+less_8bytes: # n < 8
+ srai.d t8, a2, 2 # t8 = n / 4
+ beqz t8, less_4bytes # n < 4
+ st.w a1, a0, 0 # 4 <= n < 8: first 4 bytes
+ st.w a1, t7, -4 # last 4 bytes (may overlap the first store)
+
+ jr ra
+
+less_4bytes: # n < 4
+ srai.d t8, a2, 1 # t8 = n / 2
+ beqz t8, less_2bytes # n < 2
+ st.h a1, a0, 0 # 2 <= n < 4: first 2 bytes
+ st.h a1, t7, -2 # last 2 bytes (may overlap the first store)
+
+ jr ra
+
+less_2bytes: # n is 0 or 1
+ beqz a2, less_1bytes # n == 0: nothing to store
+ st.b a1, a0, 0 # n == 1: the single byte
+
+ jr ra
+
+less_1bytes: # n == 0
+ jr ra
+
+more_64bytes: # n >= 64
+ srli.d a0, a0, 3 # this shift pair clears the low 3 bits of a0 ...
+ slli.d a0, a0, 3 # ... so a0 = s rounded down to a multiple of 8
+ addi.d a0, a0, 0x8 # a0 = first 8-byte boundary strictly above s
+ st.d a1, t0, 0 # 8-byte store at the original s covers every byte below a0
+ sub.d t2, t0, a0 # t2 = s - a0, a value in -8 .. -1
+ add.d a2, t2, a2 # a2 = bytes remaining from a0 up to s + n
+
+ addi.d a2, a2, -0x80 # bias the count by -128 so its sign says whether a full 128-byte chunk remains
+ blt a2, zero, end_unalign_proc # fewer than 128 bytes remain: skip the bulk loop
+
+loop_less: # bulk loop: 128 bytes per iteration through the unrolled store macro
+ ST_128(0)
+ addi.d a0, a0, 0x80 # advance a0 past the chunk just written
+ addi.d a2, a2, -0x80 # one chunk fewer, count still biased by -128
+ bge a2, zero, loop_less # repeat while at least 128 more bytes remain
+
+end_unalign_proc: # tail: fewer than 128 bytes remain from a0
+ addi.d a2, a2, 0x80 # undo the bias: a2 = bytes remaining, 0 .. 127
+
+ pcaddi t1, 20 # t1 = address of this instruction + 20 * 4 = end_0_8_unalign (5 dispatch instructions + 15 ladder stores); the constant must track any change to the code in between
+ andi t5, a2, 0x78 # t5 = 8 * (whole 8-byte words remaining, 0 .. 15)
+ srli.d t5, t5, 1 # t5 = 4 * words = size in bytes of that many st.d instructions
+ sub.d t1, t1, t5 # step back that many stores from end_0_8_unalign
+ jirl zero, t1, 0 # computed jump into the ladder below, no link register written
+
+end_120_128_unalign: # store ladder: exactly one st.d per label, each entry falls through to the next
+ st.d a1, a0, 112
+end_112_120_unalign:
+ st.d a1, a0, 104
+end_104_112_unalign:
+ st.d a1, a0, 96
+end_96_104_unalign:
+ st.d a1, a0, 88
+end_88_96_unalign:
+ st.d a1, a0, 80
+end_80_88_unalign:
+ st.d a1, a0, 72
+end_72_80_unalign:
+ st.d a1, a0, 64
+end_64_72_unalign:
+ st.d a1, a0, 56
+end_56_64_unalign:
+ st.d a1, a0, 48
+end_48_56_unalign:
+ st.d a1, a0, 40
+end_40_48_unalign:
+ st.d a1, a0, 32
+end_32_40_unalign:
+ st.d a1, a0, 24
+end_24_32_unalign:
+ st.d a1, a0, 16
+end_16_24_unalign:
+ st.d a1, a0, 8
+end_8_16_unalign:
+ st.d a1, a0, 0
+end_0_8_unalign: # target of the computed jump when no whole word remains
+ st.d a1, t7, -8 # last 8 bytes ending at s + n pick up the 0 .. 7 byte remainder (may overlap earlier stores)
+
+ move v0, t0 # hand back the original s, since a0 was advanced above (v0 is presumably the return-value register name - confirm in the register definitions)
+ jr ra
+
+END(memset)
+
+libc_hidden_builtin_def (memset)
--
2.33.0