I9KBT5:8256488: Use ldpq/stpq instead of ld4/st4 for small copies in StubGenerator::copy_memory

Author: 18855466553
Date: 2024-04-28 14:47:21 +08:00
parent b4ad046d51
commit 36a8eebca3
2 changed files with 66 additions and 0 deletions


@@ -0,0 +1,60 @@
Subject: 8256488: Use ldpq/stpq instead of ld4/st4 for small copies in StubGenerator::copy_memory
---
.../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 30 ++++++++++++++++---
1 file changed, 26 insertions(+), 4 deletions(-)
diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
index f61028d5007..cf66df296e4 100644
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
@@ -1149,10 +1149,10 @@ class StubGenerator: public StubCodeGenerator {
Register count, Register tmp, int step) {
copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
bool is_backwards = step < 0;
- int granularity = uabs(step);
+ unsigned granularity = uabs(step);
const Register t0 = r3, t1 = r4;
- // <= 96 bytes do inline. Direction doesn't matter because we always
+ // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
// load all the data before writing anything
Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
@@ -1207,9 +1207,31 @@ class StubGenerator: public StubCodeGenerator {
// (96 bytes if SIMD because we do 32 bytes per instruction)
__ bind(copy80);
if (UseSIMDForMemoryOps) {
- __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
+ __ ldpq(v0, v1, Address(s, 0));
+ __ ldpq(v2, v3, Address(s, 32));
+ // Unaligned pointers can be an issue for copying.
+ // The issue has more chances to happen when granularity of data is
+ // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
+ // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
+ // The most performance drop has been seen for the range 65-80 bytes.
+ // For such cases using the pair of ldp/stp instead of the third pair of
+ // ldpq/stpq fixes the performance issue.
+ if (granularity < sizeof (jint)) {
+ Label copy96;
+ __ cmp(count, u1(80/granularity));
+ __ br(Assembler::HI, copy96);
+ __ ldp(t0, t1, Address(send, -16));
+
+ __ stpq(v0, v1, Address(d, 0));
+ __ stpq(v2, v3, Address(d, 32));
+ __ stp(t0, t1, Address(dend, -16));
+ __ b(finish);
+
+ __ bind(copy96);
+ }
__ ldpq(v4, v5, Address(send, -32));
- __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
+ __ stpq(v0, v1, Address(d, 0));
+ __ stpq(v2, v3, Address(d, 32));
__ stpq(v4, v5, Address(dend, -32));
} else {
__ ldp(t0, t1, Address(s, 0));
--
2.19.1
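Background for the patch above: ld4/st4 load and store four 16-byte vector registers with element de-interleaving and re-interleaving, which a straight copy does not need, while ldpq/stpq simply move a pair of 16-byte registers. What follows is a minimal C++ sketch, not the HotSpot code: the stub works on element counts and registers, and the function name and byte-count interface here are invented for illustration. It mirrors the copy80 branch for a 65..96 byte conjoint copy: load everything before storing anything (so source and destination may overlap), then cover 65..80 bytes with 64 bytes plus a 16-byte end-anchored tail (the ldp/stp pair), and 81..96 bytes with a 32-byte tail (the third ldpq/stpq pair).

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical helper (name invented) mirroring the copy80 branch for a
// 65..96 byte conjoint copy. All loads precede all stores, so s and d
// may overlap in either direction, as the stub's comment requires.
static void copy80_sketch(uint8_t* d, const uint8_t* s, size_t nbytes) {
    uint8_t head[64], tail[32];
    std::memcpy(head, s, 64);                   // ldpq v0,v1 + ldpq v2,v3
    if (nbytes <= 80) {
        // The granularity < sizeof(jint) fast path: a 16-byte ldp/stp
        // tail instead of a third 32-byte ldpq/stpq pair. (In the stub
        // this branch exists only for byte/char arrays; jint/jlong
        // pointers are at least 4/8-byte aligned, so those keep the
        // three-pair path for the whole range.)
        std::memcpy(tail, s + nbytes - 16, 16); // ldp t0,t1
        std::memcpy(d, head, 64);               // stpq v0,v1 + stpq v2,v3
        std::memcpy(d + nbytes - 16, tail, 16); // stp t0,t1
    } else {                                    // 81..96 bytes
        std::memcpy(tail, s + nbytes - 32, 32); // ldpq v4,v5
        std::memcpy(d, head, 64);               // stpq v0,v1 + stpq v2,v3
        std::memcpy(d + nbytes - 32, tail, 32); // stpq v4,v5
    }
}

For example, a 72-byte copy (a 36-element char segment, granularity 2) takes the 64 + 16 path; the two stores overlap on bytes 56..63, which is harmless because all data was loaded up front.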


@@ -1318,6 +1318,8 @@ Patch426: fix-GCC-12-build-jdk8-fastdebug-error.patch
Patch427: 8223485-C2-PhaseIdealLoop-create_new_if_for_predicat.patch
Patch428: 8223486-split-if-update_uses-accesses-stale-idom-dat.patch
#412
Patch429: 8256488-Use-ldpq-stpq-instead-of-ld4-st4-for-small-c.patch
#############################################
#
# Upstreamable patches
@@ -1959,6 +1961,7 @@ pushd %{top_level_dir_name}
%patch426 -p1
%patch427 -p1
%patch428 -p1
%patch429 -p1
%endif
%ifarch loongarch64
@@ -2617,6 +2620,9 @@ cjc.mainProgram(arg)
%endif
%changelog
* Sun Apr 28 2024 Autistic_boyya <wangzhongyi7@huawei.com> -1:1.8.0.412-b08.1
- add 8256488-Use-ldpq-stpq-instead-of-ld4-st4-for-small-c.patch
* Thu Apr 18 2024 Autistic_boyya <wangzhongyi7@huawei.com> -1:1.8.0.412-b08.0
- del 8322725-tz-Update-Timezone-Data-to-2023d.patch
- del 8325150-tz-Update-Timezone-Data-to-2024a.patch