466 lines
17 KiB
Diff
466 lines
17 KiB
Diff
From 81a80dbe9f47f728bc593d05cd5708a653a23f1c Mon Sep 17 00:00:00 2001
|
|
From: xiongzhou4 <xiongzhou4@huawei.com>
|
|
Date: Mon, 11 Sep 2023 11:33:41 +0800
|
|
Subject: [PATCH] [AArch64] Add AArch64 support for hugify.
|
|
|
|
---
|
|
bolt/CMakeLists.txt | 4 +-
|
|
bolt/runtime/CMakeLists.txt | 28 ++-
|
|
bolt/runtime/common.h | 224 ++++++++++++++++++
|
|
bolt/runtime/hugify.cpp | 21 +-
|
|
.../AArch64/Inputs/user_func_order.txt | 2 +
|
|
bolt/test/runtime/AArch64/user-func-reorder.c | 44 ++++
|
|
6 files changed, 305 insertions(+), 18 deletions(-)
|
|
create mode 100644 bolt/test/runtime/AArch64/Inputs/user_func_order.txt
|
|
create mode 100644 bolt/test/runtime/AArch64/user-func-reorder.c
|
|
|
|
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
|
|
index a97878cd3..3de930496 100644
|
|
--- a/bolt/CMakeLists.txt
|
|
+++ b/bolt/CMakeLists.txt
|
|
@@ -5,7 +5,7 @@ set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
|
|
set(CMAKE_CXX_STANDARD 14)
|
|
|
|
set(BOLT_ENABLE_RUNTIME OFF)
|
|
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
|
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|aarch64")
|
|
set(BOLT_ENABLE_RUNTIME ON)
|
|
endif()
|
|
|
|
@@ -45,7 +45,7 @@ if (LLVM_INCLUDE_TESTS)
|
|
endif()
|
|
|
|
if (BOLT_ENABLE_RUNTIME)
|
|
- message(STATUS "Building BOLT runtime libraries for X86")
|
|
+ message(STATUS "Building BOLT runtime libraries")
|
|
ExternalProject_Add(bolt_rt
|
|
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
|
|
STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
|
|
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
|
|
index 7c1b79af4..ee6ab7bd4 100644
|
|
--- a/bolt/runtime/CMakeLists.txt
|
|
+++ b/bolt/runtime/CMakeLists.txt
|
|
@@ -10,10 +10,12 @@ check_include_files(elf.h HAVE_ELF_H)
|
|
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in
|
|
${CMAKE_CURRENT_BINARY_DIR}/config.h)
|
|
|
|
-add_library(bolt_rt_instr STATIC
|
|
- instr.cpp
|
|
- ${CMAKE_CURRENT_BINARY_DIR}/config.h
|
|
- )
|
|
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
|
+ add_library(bolt_rt_instr STATIC
|
|
+ instr.cpp
|
|
+ ${CMAKE_CURRENT_BINARY_DIR}/config.h
|
|
+ )
|
|
+endif()
|
|
add_library(bolt_rt_hugify STATIC
|
|
hugify.cpp
|
|
${CMAKE_CURRENT_BINARY_DIR}/config.h
|
|
@@ -23,16 +25,24 @@ set(BOLT_RT_FLAGS
|
|
-ffreestanding
|
|
-fno-exceptions
|
|
-fno-rtti
|
|
- -fno-stack-protector
|
|
- -mno-sse)
|
|
+ -fno-stack-protector)
|
|
+
|
|
+# -mno-sse is only accepted when targeting x86, so add it conditionally.
|
|
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
|
+ list(APPEND BOLT_RT_FLAGS -mno-sse)
|
|
+endif()
|
|
|
|
# Don't let the compiler think it can create calls to standard libs
|
|
-target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE)
|
|
-target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
|
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
|
+ target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE)
|
|
+ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
|
+endif()
|
|
target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS})
|
|
target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
|
|
|
-install(TARGETS bolt_rt_instr DESTINATION lib)
|
|
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
|
+ install(TARGETS bolt_rt_instr DESTINATION lib)
|
|
+endif()
|
|
install(TARGETS bolt_rt_hugify DESTINATION lib)
|
|
|
|
if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*")
|
|
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
|
|
index 008dbb6c3..6869742e7 100644
|
|
--- a/bolt/runtime/common.h
|
|
+++ b/bolt/runtime/common.h
|
|
@@ -39,6 +39,45 @@ typedef int int32_t;
|
|
#endif
|
|
|
|
// Save all registers while keeping 16B stack alignment
|
|
+#if defined (__aarch64__)
|
|
+#define SAVE_ALL \
|
|
+ "stp x0, x1, [sp, #-16]!\n" \
|
|
+ "stp x2, x3, [sp, #-16]!\n" \
|
|
+ "stp x4, x5, [sp, #-16]!\n" \
|
|
+ "stp x6, x7, [sp, #-16]!\n" \
|
|
+ "stp x8, x9, [sp, #-16]!\n" \
|
|
+ "stp x10, x11, [sp, #-16]!\n" \
|
|
+ "stp x12, x13, [sp, #-16]!\n" \
|
|
+ "stp x14, x15, [sp, #-16]!\n" \
|
|
+ "stp x16, x17, [sp, #-16]!\n" \
|
|
+ "stp x18, x19, [sp, #-16]!\n" \
|
|
+ "stp x20, x21, [sp, #-16]!\n" \
|
|
+ "stp x22, x23, [sp, #-16]!\n" \
|
|
+ "stp x24, x25, [sp, #-16]!\n" \
|
|
+ "stp x26, x27, [sp, #-16]!\n" \
|
|
+ "stp x28, x29, [sp, #-16]!\n" \
|
|
+ "stp x30, xzr, [sp, #-16]!\n"
|
|
+
|
|
+// Mirrors SAVE_ALL
|
|
+#define RESTORE_ALL \
|
|
+ "ldp x30, xzr, [sp], #16\n" \
|
|
+ "ldp x28, x29, [sp], #16\n" \
|
|
+ "ldp x26, x27, [sp], #16\n" \
|
|
+ "ldp x24, x25, [sp], #16\n" \
|
|
+ "ldp x22, x23, [sp], #16\n" \
|
|
+ "ldp x20, x21, [sp], #16\n" \
|
|
+ "ldp x18, x19, [sp], #16\n" \
|
|
+ "ldp x16, x17, [sp], #16\n" \
|
|
+ "ldp x14, x15, [sp], #16\n" \
|
|
+ "ldp x12, x13, [sp], #16\n" \
|
|
+ "ldp x10, x11, [sp], #16\n" \
|
|
+ "ldp x8, x9, [sp], #16\n" \
|
|
+ "ldp x6, x7, [sp], #16\n" \
|
|
+ "ldp x4, x5, [sp], #16\n" \
|
|
+ "ldp x2, x3, [sp], #16\n" \
|
|
+ "ldp x0, x1, [sp], #16\n"
|
|
+
|
|
+#else
|
|
#define SAVE_ALL \
|
|
"push %%rax\n" \
|
|
"push %%rbx\n" \
|
|
@@ -75,6 +114,7 @@ typedef int int32_t;
|
|
"pop %%rcx\n" \
|
|
"pop %%rbx\n" \
|
|
"pop %%rax\n"
|
|
+#endif
|
|
|
|
// Functions that are required by freestanding environment. Compiler may
|
|
// generate calls to these implicitly.
|
|
@@ -129,6 +169,189 @@ constexpr uint32_t BufSize = 10240;
|
|
#define _STRINGIFY(x) #x
|
|
#define STRINGIFY(x) _STRINGIFY(x)
|
|
|
|
+#if defined (__aarch64__)
|
|
+// Declare some syscall wrappers we use throughout this code to avoid linking
|
|
+// against system libc.
|
|
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
|
|
+ uint64_t ret;
|
|
+ register uint64_t x0 __asm__("x0") = fd;
|
|
+ register const void *x1 __asm__("x1") = buf;
|
|
+ register uint64_t x2 __asm__("x2") = count;
|
|
+ register uint32_t w8 __asm__("w8") = 63;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
|
|
+ uint64_t ret;
|
|
+ register uint64_t x0 __asm__("x0") = fd;
|
|
+ register const void *x1 __asm__("x1") = buf;
|
|
+ register uint64_t x2 __asm__("x2") = count;
|
|
+ register uint32_t w8 __asm__("w8") = 64;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
|
|
+ uint64_t fd, uint64_t offset) {
|
|
+ void *ret;
|
|
+ register uint64_t x0 __asm__("x0") = addr;
|
|
+ register uint64_t x1 __asm__("x1") = size;
|
|
+ register uint64_t x2 __asm__("x2") = prot;
|
|
+ register uint64_t x3 __asm__("x3") = flags;
|
|
+ register uint64_t x4 __asm__("x4") = fd;
|
|
+ register uint64_t x5 __asm__("x5") = offset;
|
|
+ register uint32_t w8 __asm__("w8") = 222;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+uint64_t __munmap(void *addr, uint64_t size) {
|
|
+ uint64_t ret;
|
|
+ register void *x0 __asm__("x0") = addr;
|
|
+ register uint64_t x1 __asm__("x1") = size;
|
|
+ register uint32_t w8 __asm__("w8") = 215;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+uint64_t __exit(uint64_t code) {
|
|
+ uint64_t ret;
|
|
+ register uint64_t x0 __asm__("x0") = code;
|
|
+ register uint32_t w8 __asm__("w8") = 94;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0)
|
|
+ : "r"(w8)
|
|
+ : "cc", "memory", "x1");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
|
|
+ uint64_t ret;
|
|
+ register int x0 __asm__("x0") = -100;
|
|
+ register const char *x1 __asm__("x1") = pathname;
|
|
+ register uint64_t x2 __asm__("x2") = flags;
|
|
+ register uint64_t x3 __asm__("x3") = mode;
|
|
+ register uint32_t w8 __asm__("w8") = 56;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %0, x0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(x3), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __madvise(void *addr, size_t length, int advice) {
|
|
+ int ret;
|
|
+ register void *x0 __asm__("x0") = addr;
|
|
+ register size_t x1 __asm__("x1") = length;
|
|
+ register int x2 __asm__("x2") = advice;
|
|
+ register uint32_t w8 __asm__("w8") = 233;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %w0, w0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __mprotect(void *addr, size_t len, int prot) {
|
|
+ int ret;
|
|
+ register void *x0 __asm__("x0") = addr;
|
|
+ register size_t x1 __asm__("x1") = len;
|
|
+ register int x2 __asm__("x2") = prot;
|
|
+ register uint32_t w8 __asm__("w8") = 226;
|
|
+ __asm__ __volatile__("svc #0\n"
|
|
+ "mov %w0, w0"
|
|
+ : "=r"(ret), "+r"(x0), "+r"(x1)
|
|
+ : "r"(x2), "r"(w8)
|
|
+ : "cc", "memory");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+// Helper functions for writing strings to the .fdata file. We intentionally
|
|
+// avoid using libc names to make it clear it is our impl.
|
|
+
|
|
+/// Compare two strings, at most Num bytes.
|
|
+int strnCmp(const char *Str1, const char *Str2, size_t Num) {
|
|
+ while (Num && *Str1 && (*Str1 == *Str2)) {
|
|
+ Num--;
|
|
+ Str1++;
|
|
+ Str2++;
|
|
+ }
|
|
+ if (Num == 0)
|
|
+ return 0;
|
|
+ return *(unsigned char *)Str1 - *(unsigned char *)Str2;
|
|
+}
|
|
+
|
|
+uint32_t strLen(const char *Str) {
|
|
+ uint32_t Size = 0;
|
|
+ while (*Str++)
|
|
+ ++Size;
|
|
+ return Size;
|
|
+}
|
|
+
|
|
+/// Write Num in Base (2..16) to OutBuf without a NUL terminator; returns a
|
|
+/// pointer one past the last digit written.
|
|
+char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
|
|
+ const char *Chars = "0123456789abcdef";
|
|
+ char Buf[64]; // Worst case: 64 digits for a 64-bit value in base 2.
|
|
+ char *Ptr = Buf;
|
|
+ while (Num) {
|
|
+ *Ptr++ = *(Chars + (Num % Base));
|
|
+ Num /= Base;
|
|
+ }
|
|
+ if (Ptr == Buf) {
|
|
+ *OutBuf++ = '0';
|
|
+ return OutBuf;
|
|
+ }
|
|
+ while (Ptr != Buf)
|
|
+ *OutBuf++ = *--Ptr;
|
|
+
|
|
+ return OutBuf;
|
|
+}
|
|
+
|
|
+/// Copy Str to OutBuf, returns a pointer to the end of the copied string
|
|
+char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) {
|
|
+ while (*Str) {
|
|
+ *OutBuf++ = *Str++;
|
|
+ if (--Size <= 0)
|
|
+ return OutBuf;
|
|
+ }
|
|
+ return OutBuf;
|
|
+}
|
|
+
|
|
+void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
|
|
+ char Buf[BufSize];
|
|
+ char *Ptr = Buf;
|
|
+ Ptr = strCopy(Ptr, Msg, BufSize - 23);
|
|
+ Ptr = intToStr(Ptr, Num, Base);
|
|
+ Ptr = strCopy(Ptr, "\n");
|
|
+ __write(2, Buf, Ptr - Buf);
|
|
+}
|
|
+
|
|
+void reportError(const char *Msg, uint64_t Size) {
|
|
+ __write(2, Msg, Size);
|
|
+ __exit(1);
|
|
+}
|
|
+#else
|
|
uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
|
|
uint64_t ret;
|
|
#if defined(__APPLE__)
|
|
@@ -550,5 +773,6 @@ public:
|
|
inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
|
|
return (Value + Align - 1) / Align * Align;
|
|
}
|
|
+#endif
|
|
|
|
} // anonymous namespace
|
|
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
|
|
index 69e1a7e06..385e4d147 100644
|
|
--- a/bolt/runtime/hugify.cpp
|
|
+++ b/bolt/runtime/hugify.cpp
|
|
@@ -6,26 +6,25 @@
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
-#if defined (__x86_64__)
|
|
#if !defined(__APPLE__)
|
|
|
|
#include "common.h"
|
|
#include <sys/mman.h>
|
|
|
|
// Enables a very verbose logging to stderr useful when debugging
|
|
-//#define ENABLE_DEBUG
|
|
+// #define ENABLE_DEBUG
|
|
|
|
// Function pointers to init routines in the binary, so we can resume
|
|
// regular execution of the function that we hooked.
|
|
extern void (*__bolt_hugify_init_ptr)();
|
|
|
|
// The __hot_start and __hot_end symbols set by Bolt. We use them to figure
|
|
-// out the rage for marking huge pages.
|
|
+// out the range for marking huge pages.
|
|
extern uint64_t __hot_start;
|
|
extern uint64_t __hot_end;
|
|
|
|
#ifdef MADV_HUGEPAGE
|
|
-/// Check whether the kernel supports THP via corresponding sysfs entry.
|
|
+// Check whether the kernel supports THP via corresponding sysfs entry.
|
|
static bool has_pagecache_thp_support() {
|
|
char buf[256] = {0};
|
|
const char *madviseStr = "always [madvise] never";
|
|
@@ -116,14 +115,22 @@ extern "C" void __bolt_hugify_self_impl() {
|
|
#endif
|
|
}
|
|
|
|
-/// This is hooking ELF's entry, it needs to save all machine state.
|
|
+// This is hooking ELF's entry, it needs to save all machine state.
|
|
extern "C" __attribute((naked)) void __bolt_hugify_self() {
|
|
+#if defined (__x86_64__)
|
|
__asm__ __volatile__(SAVE_ALL
|
|
"call __bolt_hugify_self_impl\n"
|
|
RESTORE_ALL
|
|
"jmp *__bolt_hugify_init_ptr(%%rip)\n"
|
|
:::);
|
|
-}
|
|
-
|
|
+#elif defined (__aarch64__)
|
|
+ __asm__ __volatile__(SAVE_ALL
|
|
+ "bl __bolt_hugify_self_impl\n"
|
|
+ RESTORE_ALL
|
|
+ "ldr x16, =__bolt_hugify_init_ptr\n"
|
|
+ "ldr x16, [x16]\n"
|
|
+ "br x16\n"
|
|
+ :::);
|
|
#endif
|
|
+}
|
|
#endif
|
|
diff --git a/bolt/test/runtime/AArch64/Inputs/user_func_order.txt b/bolt/test/runtime/AArch64/Inputs/user_func_order.txt
|
|
new file mode 100644
|
|
index 000000000..48b76cd35
|
|
--- /dev/null
|
|
+++ b/bolt/test/runtime/AArch64/Inputs/user_func_order.txt
|
|
@@ -0,0 +1,2 @@
|
|
+main
|
|
+fib
|
|
diff --git a/bolt/test/runtime/AArch64/user-func-reorder.c b/bolt/test/runtime/AArch64/user-func-reorder.c
|
|
new file mode 100644
|
|
index 000000000..fcb92bca1
|
|
--- /dev/null
|
|
+++ b/bolt/test/runtime/AArch64/user-func-reorder.c
|
|
@@ -0,0 +1,44 @@
|
|
+/* Checks that BOLT correctly processes a user-provided function list file,
|
|
+ * reorders functions according to this list, updates hot_start and hot_end
|
|
+ * symbols, and inserts a function to perform hot text mapping during program
|
|
+ * startup.
|
|
+ */
|
|
+#include <stdio.h>
|
|
+
|
|
+int foo(int x) {
|
|
+ return x + 1;
|
|
+}
|
|
+
|
|
+int fib(int x) {
|
|
+ if (x < 2)
|
|
+ return x;
|
|
+ return fib(x - 1) + fib(x - 2);
|
|
+}
|
|
+
|
|
+int bar(int x) {
|
|
+ return x - 1;
|
|
+}
|
|
+
|
|
+int main(int argc, char **argv) {
|
|
+ printf("fib(%d) = %d\n", argc, fib(argc));
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+REQUIRES: system-linux,bolt-runtime
|
|
+
|
|
+RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q
|
|
+
|
|
+RUN: llvm-bolt %t.exe --relocs=1 --lite --reorder-functions=user \
|
|
+RUN: --hugify --function-order=%p/Inputs/user_func_order.txt -o %t
|
|
+RUN: llvm-nm --numeric-sort --print-armap %t | \
|
|
+RUN: FileCheck %s -check-prefix=CHECK-NM
|
|
+RUN: %t 1 2 3 | FileCheck %s -check-prefix=CHECK-OUTPUT
|
|
+
|
|
+CHECK-NM: W __hot_start
|
|
+CHECK-NM: T main
|
|
+CHECK-NM-NEXT: T fib
|
|
+CHECK-NM-NEXT: W __hot_end
|
|
+
|
|
+CHECK-OUTPUT: fib(4) = 3
|
|
+*/
|
|
--
|
|
2.33.0
|
|
|