glibc/5_6-LoongArch-Optimize-string-function-strcpy.patch
Xue Liu 4adfab66ba LoongArch: Optimize string functions including memcpy, memmove,
memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.

(cherry picked from commit 4ba365320a633ecd4cb47d8f171aa81fcd1dd6ef)
2023-01-29 14:28:25 +08:00

196 lines
5.4 KiB
Diff

From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001
From: Xue Liu <liuxue@loongson.cn>
Date: Sun, 29 Jan 2023 10:25:18 +0800
Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy.
Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be
---
sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++
1 file changed, 175 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/strcpy.S
diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
new file mode 100644
index 00000000..03d9d361
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcpy.S
@@ -0,0 +1,175 @@
+/* Optimized strcpy implementation for LoongArch.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sys/asm.h>
+
+/* Parameters and Results */
+#define dest a0 /* destination pointer; advanced as bytes are stored */
+#define src a1 /* source pointer; advanced as double words are loaded */
+#define result v0 /* return-value register; receives dest_backup before jr ra */
+
+/* Internal variables */
+#define data t0 /* double word most recently loaded from the source string */
+#define data1 t1 /* not referenced by the code in this file */
+#define has_nul t2 /* nonzero when data contains a zero byte */
+#define diff t3 /* not referenced by the code in this file */
+#define syndrome t4 /* not referenced by the code in this file */
+#define zeroones t5 /* constant 0x0101010101010101 */
+#define sevenf t6 /* constant 0x7f7f7f7f7f7f7f7f */
+#define pos t7 /* number of tail bytes to store, including the \0 (1..8) */
+#define dest_backup t8 /* copy of dest taken on entry; used as the return value */
+#define tmp1 a4 /* scratch */
+#define tmp2 a5 /* scratch */
+#define tmp3 a6 /* scratch; also clobbered by CONDITIONSEL */
+#define dest_off a2 /* not referenced by the code in this file */
+#define src_off a3 /* src & 0x7, computed once on entry */
+#define tmp4 a7 /* scratch; byte mask in strcpy_page_cross */
+
+/* CONDITIONSEL: rd = (rc != 0) ? ra : rb.  tmp3 is overwritten first, so it
+ must not be passed as rd, ra or rc.  Not used by the strcpy code below. */
+#define CONDITIONSEL(rd, rc, ra, rb)\
+ masknez tmp3, rb, rc;\
+ maskeqz rd, ra, rc;\
+ or rd, rd, tmp3
+
+/* char *strcpy (char *dest, const char *src); returns the original dest. */
+LEAF(strcpy)
+ .align 4
+
+ move dest_backup, dest /* dest is advanced while copying; keep the original */
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ ori zeroones, zeroones, 0x101
+ ori sevenf, sevenf, 0xf7f
+ bstrins.d zeroones, zeroones, 63, 32 /* zeroones = 0x0101010101010101 */
+ bstrins.d sevenf, sevenf, 63, 32 /* sevenf = 0x7f7f7f7f7f7f7f7f */
+ andi src_off, src, 0x7 /* src_off = src modulo 8 */
+ beqz src_off, strcpy_loop_aligned_1
+ b strcpy_mutual_align
+strcpy_loop_aligned:
+ st.d data, dest, 0 /* chunk had no zero byte: store all 8 bytes */
+ addi.d dest, dest, 8
+strcpy_loop_aligned_1:
+ ld.d data, src, 0 /* src is 8-byte aligned on every path to here */
+ addi.d src, src, 8
+strcpy_start_realigned:
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2 /* (data - 0x01..01) & ~(data | 0x7f..7f): nonzero iff data has a zero byte */
+ beqz has_nul, strcpy_loop_aligned
+
+strcpy_end:
+
+ /* pos = index of the first \0 byte in data, plus one (1..8). */
+ ctz.d pos, has_nul
+ srli.d pos, pos, 3
+ addi.d pos, pos, 1
+ /* Store the final pos bytes of the string, using the bits of pos to pick
+ an 8-, 4-, 2- and/or 1-byte store.  pos counts the terminating \0, so
+ it is at least 1 and at most 8. */
+strcpy_end_8:
+ andi tmp1, pos, 0x8
+ beqz tmp1, strcpy_end_4
+ st.d data, dest, 0
+ move result, dest_backup
+ jr ra
+strcpy_end_4:
+ andi tmp1, pos, 0x4
+ beqz tmp1, strcpy_end_2
+ st.w data, dest, 0
+ srli.d data, data, 32 /* drop the 4 bytes just stored */
+ addi.d dest, dest, 4
+strcpy_end_2:
+ andi tmp1, pos, 0x2
+ beqz tmp1, strcpy_end_1
+ st.h data, dest, 0
+ srli.d data, data, 16 /* drop the 2 bytes just stored */
+ addi.d dest, dest, 2
+strcpy_end_1:
+ andi tmp1, pos, 0x1
+ beqz tmp1, strcpy_end_ret
+ st.b data, dest, 0
+strcpy_end_ret:
+ move result, dest_backup
+ jr ra
+
+
+strcpy_mutual_align:
+ /* src is not 8-byte aligned.  If bits 3..11 of src are all ones, an
+ 8-byte load from src would run past a 4 KiB boundary, so take the
+ careful path at strcpy_page_cross; otherwise the unaligned load
+ below stays inside the current 4 KiB block. */
+
+ li.w tmp2, 0xff8
+ andi tmp1, src, 0xff8
+ beq tmp1, tmp2, strcpy_page_cross
+
+strcpy_page_cross_ok:
+ /* Load an unaligned double word from src and test it for \0.
+ If there is none, store it unaligned and continue with the aligned
+ loop.  If there is one, jump to strcpy_end, which stores only the
+ bytes up to and including the \0. */
+ ld.d data, src, 0
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2
+ bnez has_nul, strcpy_end
+strcpy_mutual_align_finish:
+ /* Advance src and dest by 8 - src_off so that src becomes 8-byte aligned.
+ The next double word then overlaps the one stored here by src_off bytes;
+ those bytes are simply written twice with the same values. */
+ li.w tmp1, 8
+ st.d data, dest, 0
+ sub.d tmp1, tmp1, src_off
+ add.d src, src, tmp1
+ add.d dest, dest, tmp1
+
+ b strcpy_loop_aligned_1
+
+strcpy_page_cross:
+ /*
+ Load the aligned double word at (src & ~0x7), which cannot cross the
+ 4 KiB boundary, and look for \0 in the bytes at or above src.
+ If there is none, go back to strcpy_page_cross_ok: the string
+ continues past the boundary, so the unaligned load there is safe.
+ If there is one, shift data right so it looks like a double word
+ loaded directly from src, then go to strcpy_end.
+
+ tmp4 starts as the all-ones mask 0xffff...ffff.
+ tmp2 = src_off << 3 is the number of bits to shift out.
+ The vacated high bytes are filled with 0xff so they never look like \0:
+ data = (data >> (src_off * 8)) | (-1 << (64 - src_off * 8))
+ where
+ -1 << (64 - src_off * 8) is computed as ~(-1 >> (src_off * 8)). */
+
+ li.w tmp1, 0x7
+ andn tmp3, src, tmp1
+ ld.d data, tmp3, 0
+ li.w tmp4, -1
+ slli.d tmp2, src_off, 3
+ srl.d tmp4, tmp4, tmp2
+ srl.d data, data, tmp2
+ nor tmp4, tmp4, zero
+ or data, data, tmp4
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2
+ beqz has_nul, strcpy_page_cross_ok
+ b strcpy_end
+END(strcpy)
+libc_hidden_builtin_def (strcpy)
--
2.33.0