memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen. (cherry picked from commit 4ba365320a633ecd4cb47d8f171aa81fcd1dd6ef)
415 lines
11 KiB
Diff
415 lines
11 KiB
Diff
From 3f3b70e39a529369e4b2936f35034215a45436a3 Mon Sep 17 00:00:00 2001
|
|
From: Xue Liu <liuxue@loongson.cn>
|
|
Date: Sun, 29 Jan 2023 10:23:50 +0800
|
|
Subject: [PATCH 4/6] LoongArch: Optimize string functions strcmp, strncmp.
|
|
|
|
Change-Id: I436138a312e8ebb668223cafef84fd74dcde72fd
|
|
---
|
|
sysdeps/loongarch/lp64/strcmp.S | 161 ++++++++++++++++++++++
|
|
sysdeps/loongarch/lp64/strncmp.S | 225 +++++++++++++++++++++++++++++++
|
|
2 files changed, 386 insertions(+)
|
|
create mode 100644 sysdeps/loongarch/lp64/strcmp.S
|
|
create mode 100644 sysdeps/loongarch/lp64/strncmp.S
|
|
|
|
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
|
|
new file mode 100644
|
|
index 00000000..0f7a6d55
|
|
--- /dev/null
|
|
+++ b/sysdeps/loongarch/lp64/strcmp.S
|
|
@@ -0,0 +1,161 @@
|
|
+/* Optimized strcmp implementation for LoongArch.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include <sys/asm.h>
|
|
+
|
|
+/* Parameters and results: register aliases for the two string pointers and the return value. */
|
|
+#define src1 a0 /* pointer into the first string; advanced as bytes are consumed */
|
|
+#define src2 a1 /* pointer into the second string; advanced as bytes are consumed */
|
|
+#define result v0 /* register that receives the comparison result */
|
|
+
|
|
+/* Internal variables: register aliases used as working storage by the code below. */
|
|
+#define src1_off a2 /* src1 & 7 */
|
|
+#define src2_off a3 /* src2 & 7 */
|
|
+#define data1 t0 /* byte or doubleword loaded from src1 */
|
|
+#define data2 t1 /* byte or doubleword loaded from src2 */
|
|
+#define has_nul t2 /* nonzero when data1 contains a zero byte */
|
|
+#define diff t3 /* data1 ^ data2 */
|
|
+#define syndrome t4 /* diff | has_nul */
|
|
+#define zeroones t5 /* constant 0x0101010101010101 */
|
|
+#define sevenf t6 /* constant 0x7f7f7f7f7f7f7f7f */
|
|
+#define pos t7 /* bit offset of the byte selected from syndrome */
|
|
+#define exchange t8 /* nonzero when src1 and src2 have been swapped */
|
|
+#define tmp1 a4 /* scratch */
|
|
+#define tmp2 a5 /* scratch */
|
|
+#define tmp3 a6 /* scratch; also clobbered by CONDITIONSEL */
|
|
+#define tmp4 a7 /* not referenced by the strcmp code below */
|
|
+
|
|
+/* CONDITIONSEL: rd <- (rc != 0) ? ra : rb, built from masknez + maskeqz + or.
|
|
+ tmp3 is overwritten as scratch before rc and ra are read, so neither may be tmp3. */
|
|
+#define CONDITIONSEL(rd, rc, ra, rb)\
|
|
+ masknez tmp3, rb, rc;\
|
|
+ maskeqz rd, ra, rc;\
|
|
+ or rd, rd, tmp3
|
|
+
|
|
+LEAF(strcmp) /* result = (unsigned byte of src1) - (unsigned byte of src2) at the first position where the bytes differ or the src1 byte is NUL */
|
|
+ .align 4
|
|
+
|
|
+ xor tmp1, src1, src2 /* low 3 bits are nonzero iff the two pointers have different offsets within 8 bytes */
|
|
+ lu12i.w zeroones, 0x01010
|
|
+ lu12i.w sevenf, 0x7f7f7
|
|
+ andi src1_off, src1, 0x7 /* src1_off = src1 % 8 */
|
|
+ ori zeroones, zeroones, 0x101 /* low word is now 0x01010101 */
|
|
+ ori sevenf, sevenf, 0xf7f /* low word is now 0x7f7f7f7f */
|
|
+ andi tmp1, tmp1, 0x7
|
|
+ bstrins.d zeroones, zeroones, 63, 32 /* copy the low word into bits 63..32 */
|
|
+ bstrins.d sevenf, sevenf, 63, 32 /* copy the low word into bits 63..32 */
|
|
+ bnez tmp1, strcmp_misaligned8 /* offsets differ: take the misaligned path */
|
|
+ bnez src1_off, strcmp_mutual_align /* same offset, but not 8-byte aligned */
|
|
+strcmp_loop_aligned: /* both pointers are 8-byte aligned here: compare one doubleword per iteration */
|
|
+ ld.d data1, src1, 0
|
|
+ addi.d src1, src1, 8
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src2, src2, 8
|
|
+strcmp_start_realigned:
|
|
+ sub.d tmp1, data1, zeroones
|
|
+ or tmp2, data1, sevenf
|
|
+ xor diff, data1, data2
|
|
+ andn has_nul, tmp1, tmp2 /* (data1 - 0x01..01) & ~(data1 | 0x7f..7f): nonzero iff data1 has a zero byte */
|
|
+ or syndrome, diff, has_nul
|
|
+ beqz syndrome, strcmp_loop_aligned /* no difference and no NUL: next doubleword */
|
|
+
|
|
+strcmp_end:
|
|
+ ctz.d pos, syndrome /* bit index of the lowest set bit of syndrome */
|
|
+ bstrins.d pos, zero, 2, 0 /* clear the low 3 bits: round down to a byte boundary */
|
|
+ srl.d data1, data1, pos
|
|
+ srl.d data2, data2, pos
|
|
+ andi data1, data1, 0xff /* isolate the selected byte, zero-extended */
|
|
+ andi data2, data2, 0xff
|
|
+ sub.d result, data1, data2
|
|
+ jr ra
|
|
+strcmp_mutual_align: /* same nonzero offset in both pointers: start from the enclosing aligned doublewords */
|
|
+ bstrins.d src1, zero, 2, 0 /* round src1 down to 8-byte alignment */
|
|
+ bstrins.d src2, zero, 2, 0 /* round src2 down to 8-byte alignment */
|
|
+ slli.d tmp1, src1_off, 0x3 /* tmp1 = 8 * src1_off (offset in bits) */
|
|
+ ld.d data1, src1, 0
|
|
+ sub.d tmp1, zero, tmp1 /* negate; srl.d below uses only the low 6 bits, i.e. 64 - 8 * src1_off */
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src1, src1, 8
|
|
+ addi.d src2, src2, 8
|
|
+ nor tmp2, zero, zero /* tmp2 = all ones */
|
|
+ srl.d tmp2, tmp2, tmp1 /* tmp2 = ones in the low 8 * src1_off bits, i.e. the bytes before the original pointers */
|
|
+ or data1, data1, tmp2 /* force those bytes to 0xff in both words so they compare equal and non-NUL */
|
|
+ or data2, data2, tmp2
|
|
+ b strcmp_start_realigned
|
|
+
|
|
+strcmp_misaligned8:
|
|
+ /* Decide whether to swap the two pointers:
|
|
+ if ((src1_off != 0) && ((src2_off == 0) || (src1_off < src2_off)))
|
|
+ then exchange (src1, src2); the exchange register is nonzero when swapped. */
|
|
+ andi src2_off, src2, 0x7
|
|
+ slt tmp2, src1_off, src2_off
|
|
+ CONDITIONSEL(tmp2, src2_off, tmp2, tmp1) /* tmp2 = src2_off ? (src1_off < src2_off) : tmp1; tmp1 is nonzero on this path */
|
|
+ maskeqz exchange, tmp2, src1_off /* exchange = src1_off ? tmp2 : 0 */
|
|
+ xor tmp3, src1, src2
|
|
+ maskeqz tmp3, tmp3, exchange /* tmp3 = exchange ? (src1 ^ src2) : 0 */
|
|
+ xor src1, src1, tmp3 /* XOR-swap src1 and src2 when exchange is set; no change otherwise */
|
|
+ xor src2, src2, tmp3
|
|
+
|
|
+ andi src1_off, src1, 0x7 /* recompute for the (possibly swapped) src1 */
|
|
+ beqz src1_off, strcmp_loop_misaligned
|
|
+strcmp_do_misaligned: /* byte-at-a-time compare until src1 is 8-byte aligned */
|
|
+ ld.bu data1, src1, 0
|
|
+ ld.bu data2, src2, 0
|
|
+ xor tmp3, data1, data2
|
|
+ addi.d src1, src1, 1
|
|
+ masknez tmp3, data1, tmp3 /* tmp3 = (data1 == data2) ? data1 : 0 */
|
|
+ addi.d src2, src2, 1
|
|
+ beqz tmp3, strcmp_done /* bytes differ, or both are NUL */
|
|
+ andi src1_off, src1, 0x7
|
|
+ bnez src1_off, strcmp_do_misaligned
|
|
+
|
|
+strcmp_loop_misaligned: /* src1 is 8-byte aligned here; src2 is not */
|
|
+ andi tmp1, src2, 0xff8
|
|
+ xori tmp1, tmp1, 0xff8 /* zero iff bits 3..11 of src2 are all ones */
|
|
+ beqz tmp1, strcmp_do_misaligned /* then compare the next 8 bytes one at a time; presumably so the unaligned ld.d from src2 cannot cross a 4 KiB boundary */
|
|
+ ld.d data1, src1, 0
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src1, src1, 8
|
|
+ addi.d src2, src2, 8
|
|
+
|
|
+ sub.d tmp1, data1, zeroones
|
|
+ or tmp2, data1, sevenf
|
|
+ xor diff, data1, data2
|
|
+ andn has_nul, tmp1, tmp2 /* nonzero iff data1 has a zero byte */
|
|
+ or syndrome, diff, has_nul
|
|
+ beqz syndrome, strcmp_loop_misaligned
|
|
+strcmp_misalign_end:
|
|
+ ctz.d pos, syndrome /* bit index of the lowest set bit of syndrome */
|
|
+ bstrins.d pos, zero, 2, 0 /* round down to a byte boundary */
|
|
+ srl.d data1, data1, pos
|
|
+ srl.d data2, data2, pos
|
|
+ andi data1, data1, 0xff
|
|
+ andi data2, data2, 0xff
|
|
+ sub.d tmp1, data1, data2
|
|
+ sub.d tmp2, data2, data1
|
|
+ CONDITIONSEL(result, exchange, tmp2, tmp1) /* use the negated difference when the pointers were swapped */
|
|
+ jr ra
|
|
+
|
|
+strcmp_done: /* reached from the byte loop: data1 and data2 hold the deciding bytes */
|
|
+ sub.d tmp1, data1, data2
|
|
+ sub.d tmp2, data2, data1
|
|
+ CONDITIONSEL(result, exchange, tmp2, tmp1) /* use the negated difference when the pointers were swapped */
|
|
+ jr ra
|
|
+END(strcmp)
|
|
+
|
|
+libc_hidden_builtin_def (strcmp)
|
|
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
|
|
new file mode 100644
|
|
index 00000000..979ea40a
|
|
--- /dev/null
|
|
+++ b/sysdeps/loongarch/lp64/strncmp.S
|
|
@@ -0,0 +1,225 @@
|
|
+/* Optimized strncmp implementation for LoongArch.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include <sys/asm.h>
|
|
+
|
|
+/* Parameters and results: register aliases for the two string pointers, the length limit and the return value. */
|
|
+#define src1 a0 /* pointer into the first string; advanced as bytes are consumed */
|
|
+#define src2 a1 /* pointer into the second string; advanced as bytes are consumed */
|
|
+#define limit a2 /* remaining byte limit; reused as scratch once the limit is reached */
|
|
+#define result v0 /* register that receives the comparison result */
|
|
+
|
|
+
|
|
+/* Internal variables: register aliases used as working storage by the code below. */
|
|
+#define data1 t0 /* byte or doubleword loaded from src1 */
|
|
+#define data2 t1 /* byte or doubleword loaded from src2 */
|
|
+#define has_nul t2 /* nonzero when data1 contains a zero byte */
|
|
+#define diff t3 /* data1 ^ data2 */
|
|
+#define syndrome t4 /* diff | has_nul */
|
|
+#define zeroones t5 /* constant 0x0101010101010101 */
|
|
+#define sevenf t6 /* constant 0x7f7f7f7f7f7f7f7f */
|
|
+#define pos t7 /* bit offset of the byte selected from syndrome */
|
|
+#define exchange t8 /* not referenced by the strncmp code below */
|
|
+#define tmp1 a5 /* scratch */
|
|
+#define tmp2 a6 /* scratch */
|
|
+#define tmp3 a7 /* scratch */
|
|
+#define src1_off a3 /* src1 & 7 */
|
|
+#define limit_wd a4 /* doubleword countdown derived from limit */
|
|
+
|
|
+LEAF(strncmp) /* like strcmp above the byte level, but stops after limit bytes; result is 0 when no difference or NUL is found within the limit */
|
|
+ .align 4
|
|
+ beqz limit, strncmp_ret0 /* limit == 0: return 0 without reading memory */
|
|
+
|
|
+ xor tmp1, src1, src2 /* low 3 bits are nonzero iff the two pointers have different offsets within 8 bytes */
|
|
+ lu12i.w zeroones, 0x01010
|
|
+ lu12i.w sevenf, 0x7f7f7
|
|
+ andi src1_off, src1, 0x7 /* src1_off = src1 % 8 */
|
|
+ ori zeroones, zeroones, 0x101 /* low word is now 0x01010101 */
|
|
+ andi tmp1, tmp1, 0x7
|
|
+ ori sevenf, sevenf, 0xf7f /* low word is now 0x7f7f7f7f */
|
|
+ bstrins.d zeroones, zeroones, 63, 32 /* copy the low word into bits 63..32 */
|
|
+ bstrins.d sevenf, sevenf, 63, 32 /* copy the low word into bits 63..32 */
|
|
+ bnez tmp1, strncmp_misaligned8 /* offsets differ: take the misaligned path */
|
|
+ bnez src1_off, strncmp_mutual_align /* same offset, but not 8-byte aligned */
|
|
+ addi.d limit_wd, limit, -1
|
|
+ srli.d limit_wd, limit_wd, 3 /* limit_wd = (limit - 1) / 8 */
|
|
+
|
|
+strncmp_loop_aligned: /* both pointers are 8-byte aligned here: compare one doubleword per iteration */
|
|
+ ld.d data1, src1, 0
|
|
+ addi.d src1, src1, 8
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src2, src2, 8
|
|
+strncmp_start_realigned:
|
|
+ addi.d limit_wd, limit_wd, -1 /* goes negative on the doubleword that contains the limit */
|
|
+ sub.d tmp1, data1, zeroones
|
|
+ or tmp2, data1, sevenf
|
|
+ xor diff, data1, data2
|
|
+ andn has_nul, tmp1, tmp2 /* (data1 - 0x01..01) & ~(data1 | 0x7f..7f): nonzero iff data1 has a zero byte */
|
|
+ srli.d tmp1, limit_wd, 63 /* tmp1 = sign bit of limit_wd */
|
|
+ or syndrome, diff, has_nul
|
|
+ or tmp2, syndrome, tmp1
|
|
+ beqz tmp2, strncmp_loop_aligned /* no difference, no NUL, limit not reached: next doubleword */
|
|
+
|
|
+ /* limit not reached: syndrome marks a difference or a NUL */
|
|
+ bge limit_wd, zero, strncmp_not_limit
|
|
+ /* limit reached within this doubleword: ignore the bytes beyond it */
|
|
+ andi limit, limit, 0x7
|
|
+ li.w tmp1, 0x8
|
|
+ sub.d limit, tmp1, limit /* 8 - (limit % 8) */
|
|
+ slli.d limit, limit, 0x3 /* ... converted to a bit count */
|
|
+ li.d tmp1, -1
|
|
+ srl.d tmp1, tmp1, limit /* mask of the bytes inside the limit; a count of 64 acts as 0, giving all ones when limit % 8 == 0 */
|
|
+ and data1, data1, tmp1 /* zero the bytes beyond the limit in both words */
|
|
+ and data2, data2, tmp1
|
|
+ orn syndrome, syndrome, tmp1 /* set every syndrome bit beyond the limit so ctz.d stops there at the latest */
|
|
+
|
|
+
|
|
+strncmp_not_limit:
|
|
+ ctz.d pos, syndrome /* bit index of the lowest set bit of syndrome */
|
|
+ bstrins.d pos, zero, 2, 0 /* clear the low 3 bits: round down to a byte boundary */
|
|
+ srl.d data1, data1, pos
|
|
+ srl.d data2, data2, pos
|
|
+ andi data1, data1, 0xff /* isolate the selected byte, zero-extended */
|
|
+ andi data2, data2, 0xff
|
|
+ sub.d result, data1, data2
|
|
+ jr ra
|
|
+
|
|
+
|
|
+
|
|
+strncmp_mutual_align: /* same nonzero offset in both pointers: start from the enclosing aligned doublewords */
|
|
+ bstrins.d src1, zero, 2, 0 /* round src1 down to 8-byte alignment */
|
|
+ bstrins.d src2, zero, 2, 0 /* round src2 down to 8-byte alignment */
|
|
+ slli.d tmp1, src1_off, 0x3 /* tmp1 = 8 * src1_off (offset in bits) */
|
|
+ ld.d data1, src1, 0
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src2, src2, 8
|
|
+ addi.d src1, src1, 8
|
|
+
|
|
+ addi.d limit_wd, limit, -1
|
|
+ andi tmp3, limit_wd, 0x7 /* tmp3 = (limit - 1) % 8 */
|
|
+ srli.d limit_wd, limit_wd, 3 /* limit_wd = (limit - 1) / 8 */
|
|
+ add.d limit, limit, src1_off /* count the src1_off leading bytes as part of the limit */
|
|
+ add.d tmp3, tmp3, src1_off
|
|
+ srli.d tmp3, tmp3, 3 /* 1 if the leading bytes push the limit into one more doubleword, else 0 */
|
|
+ add.d limit_wd, limit_wd, tmp3
|
|
+
|
|
+ sub.d tmp1, zero, tmp1 /* negate; srl.d below uses only the low 6 bits, i.e. 64 - 8 * src1_off */
|
|
+ nor tmp2, zero, zero /* tmp2 = all ones */
|
|
+ srl.d tmp2, tmp2, tmp1 /* tmp2 = ones in the low 8 * src1_off bits, i.e. the bytes before the original pointers */
|
|
+ or data1, data1, tmp2 /* force those bytes to 0xff in both words so they compare equal and non-NUL */
|
|
+ or data2, data2, tmp2
|
|
+ b strncmp_start_realigned
|
|
+
|
|
+strncmp_misaligned8:
|
|
+
|
|
+ li.w tmp1, 0x10
|
|
+ bge limit, tmp1, strncmp_try_words /* NOTE(review): bge is a signed compare, so a limit with bit 63 set stays in the byte loop - confirm this is intended */
|
|
+strncmp_byte_loop: /* two bytes per iteration, each checked against the limit */
|
|
+ ld.bu data1, src1, 0
|
|
+ ld.bu data2, src2, 0
|
|
+ addi.d limit, limit, -1
|
|
+ xor tmp1, data1, data2
|
|
+ masknez tmp1, data1, tmp1 /* tmp1 = (data1 == data2) ? data1 : 0 */
|
|
+ maskeqz tmp1, limit, tmp1 /* tmp1 = (bytes equal and non-NUL) ? remaining limit : 0 */
|
|
+ beqz tmp1, strncmp_done /* difference, NUL, or limit exhausted */
|
|
+
|
|
+ ld.bu data1, src1, 1
|
|
+ ld.bu data2, src2, 1
|
|
+ addi.d src1, src1, 2
|
|
+ addi.d src2, src2, 2
|
|
+ addi.d limit, limit, -1
|
|
+ xor tmp1, data1, data2
|
|
+ masknez tmp1, data1, tmp1 /* tmp1 = (data1 == data2) ? data1 : 0 */
|
|
+ maskeqz tmp1, limit, tmp1 /* tmp1 = (bytes equal and non-NUL) ? remaining limit : 0 */
|
|
+ bnez tmp1, strncmp_byte_loop
|
|
+
|
|
+
|
|
+strncmp_done: /* data1 and data2 hold the last pair of bytes compared */
|
|
+ sub.d result, data1, data2
|
|
+ jr ra
|
|
+
|
|
+strncmp_try_words: /* limit >= 16 and the pointers have different offsets */
|
|
+ srli.d limit_wd, limit, 3 /* limit_wd = limit / 8 */
|
|
+ beqz src1_off, strncmp_do_misaligned /* src1 already 8-byte aligned */
|
|
+
|
|
+ sub.d src1_off, zero, src1_off
|
|
+ andi src1_off, src1_off, 0x7 /* number of bytes needed to bring src1 to 8-byte alignment */
|
|
+ sub.d limit, limit, src1_off /* account for those bytes up front */
|
|
+ srli.d limit_wd, limit, 0x3
|
|
+
|
|
+strncmp_page_end_loop: /* byte-at-a-time compare until src1 is 8-byte aligned; limit is not tested inside this loop */
|
|
+ ld.bu data1, src1, 0
|
|
+ ld.bu data2, src2, 0
|
|
+ addi.d src1, src1, 1
|
|
+ addi.d src2, src2, 1
|
|
+ xor tmp1, data1, data2
|
|
+ masknez tmp1, data1, tmp1 /* tmp1 = (data1 == data2) ? data1 : 0 */
|
|
+ beqz tmp1, strncmp_done /* bytes differ, or both are NUL */
|
|
+ andi tmp1, src1, 0x7
|
|
+ bnez tmp1, strncmp_page_end_loop
|
|
+strncmp_do_misaligned:
|
|
+ li.w src1_off, 0x8 /* NOTE(review): src1_off does not appear to be read again after this point - confirm */
|
|
+ addi.d limit_wd, limit_wd, -1
|
|
+ blt limit_wd, zero, strncmp_done_loop /* no full doubleword left within the limit */
|
|
+
|
|
+strncmp_loop_misaligned: /* src1 is 8-byte aligned here; src2 is not */
|
|
+ andi tmp2, src2, 0xff8
|
|
+ xori tmp2, tmp2, 0xff8 /* zero iff bits 3..11 of src2 are all ones */
|
|
+ beqz tmp2, strncmp_page_end_loop /* then compare the next 8 bytes one at a time instead of using ld.d */
|
|
+
|
|
+ ld.d data1, src1, 0
|
|
+ ld.d data2, src2, 0
|
|
+ addi.d src1, src1, 8
|
|
+ addi.d src2, src2, 8
|
|
+ sub.d tmp1, data1, zeroones
|
|
+ or tmp2, data1, sevenf
|
|
+ xor diff, data1, data2
|
|
+ andn has_nul, tmp1, tmp2 /* nonzero iff data1 has a zero byte */
|
|
+ or syndrome, diff, has_nul
|
|
+ bnez syndrome, strncmp_not_limit /* difference or NUL inside a full doubleword */
|
|
+ addi.d limit_wd, limit_wd, -1
|
|
+ bge limit_wd, zero, strncmp_loop_misaligned
|
|
+
|
|
+strncmp_done_loop: /* all full doublewords within the limit have been compared */
|
|
+ andi limit, limit, 0x7 /* bytes left over after the full doublewords */
|
|
+ beqz limit, strncmp_not_limit /* none left: data1 equals data2 here, so the extracted bytes give 0 */
|
|
+ /* Read the last doubleword (the 8 bytes ending at the limit),
|
|
+ unless src2 is in the last 8 bytes of a 4 KiB page and the read would run past it. */
|
|
+ andi tmp1, src2, 0x7
|
|
+ andi tmp2, src2, 0xff8
|
|
+ add.d tmp1, tmp1, limit
|
|
+ xori tmp2, tmp2, 0xff8 /* zero iff bits 3..11 of src2 are all ones */
|
|
+ andi tmp1, tmp1, 0x8 /* nonzero iff (src2 % 8) + limit >= 8 */
|
|
+ masknez tmp1, tmp1, tmp2 /* keep that flag only when tmp2 is zero */
|
|
+ bnez tmp1, strncmp_byte_loop /* finish the remaining bytes one at a time */
|
|
+ addi.d src1, src1, -8
|
|
+ addi.d src2, src2, -8
|
|
+ ldx.d data1, src1, limit /* loads the 8 bytes ending exactly at the limit; the earlier ones were already compared */
|
|
+ ldx.d data2, src2, limit
|
|
+ sub.d tmp1, data1, zeroones
|
|
+ or tmp2, data1, sevenf
|
|
+ xor diff, data1, data2
|
|
+ andn has_nul, tmp1, tmp2 /* nonzero iff data1 has a zero byte */
|
|
+ or syndrome, diff, has_nul
|
|
+ bnez syndrome, strncmp_not_limit
|
|
+
|
|
+strncmp_ret0: /* nothing differed within the limit */
|
|
+ move result, zero
|
|
+ jr ra
|
|
+END(strncmp)
|
|
+libc_hidden_builtin_def (strncmp)
|
|
--
|
|
2.33.0
|
|
|