From 81a80dbe9f47f728bc593d05cd5708a653a23f1c Mon Sep 17 00:00:00 2001 From: xiongzhou4 Date: Mon, 11 Sep 2023 11:33:41 +0800 Subject: [PATCH] [AArch64] Add AArch64 support for hugify. --- bolt/CMakeLists.txt | 4 +- bolt/runtime/CMakeLists.txt | 28 ++- bolt/runtime/common.h | 224 ++++++++++++++++++ bolt/runtime/hugify.cpp | 21 +- .../AArch64/Inputs/user_func_order.txt | 2 + bolt/test/runtime/AArch64/user-func-reorder.c | 44 ++++ 6 files changed, 305 insertions(+), 18 deletions(-) create mode 100644 bolt/test/runtime/AArch64/Inputs/user_func_order.txt create mode 100644 bolt/test/runtime/AArch64/user-func-reorder.c diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index a97878cd3..3de930496 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -5,7 +5,7 @@ set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_CXX_STANDARD 14) set(BOLT_ENABLE_RUNTIME OFF) -if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|aarch64") set(BOLT_ENABLE_RUNTIME ON) endif() @@ -45,7 +45,7 @@ if (LLVM_INCLUDE_TESTS) endif() if (BOLT_ENABLE_RUNTIME) - message(STATUS "Building BOLT runtime libraries for X86") + message(STATUS "Building BOLT runtime libraries") ExternalProject_Add(bolt_rt SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime" STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 7c1b79af4..ee6ab7bd4 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -10,10 +10,12 @@ check_include_files(elf.h HAVE_ELF_H) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h) -add_library(bolt_rt_instr STATIC - instr.cpp - ${CMAKE_CURRENT_BINARY_DIR}/config.h - ) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + add_library(bolt_rt_instr STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) +endif() add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h @@ -23,16 +25,24 @@ 
set(BOLT_RT_FLAGS -ffreestanding -fno-exceptions -fno-rtti - -fno-stack-protector - -mno-sse) + -fno-stack-protector) + +# x86 exclusive option +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + list(APPEND BOLT_RT_FLAGS -mno-sse) +endif() # Don't let the compiler think it can create calls to standard libs -target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE) -target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE) + target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +endif() target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION lib) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + install(TARGETS bolt_rt_instr DESTINATION lib) +endif() install(TARGETS bolt_rt_hugify DESTINATION lib) if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h index 008dbb6c3..6869742e7 100644 --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -39,6 +39,45 @@ typedef int int32_t; #endif // Save all registers while keeping 16B stack alignment +#if defined (__aarch64__) +#define SAVE_ALL \ + "stp x0, x1, [sp, #-16]!\n" \ + "stp x2, x3, [sp, #-16]!\n" \ + "stp x4, x5, [sp, #-16]!\n" \ + "stp x6, x7, [sp, #-16]!\n" \ + "stp x8, x9, [sp, #-16]!\n" \ + "stp x10, x11, [sp, #-16]!\n" \ + "stp x12, x13, [sp, #-16]!\n" \ + "stp x14, x15, [sp, #-16]!\n" \ + "stp x16, x17, [sp, #-16]!\n" \ + "stp x18, x19, [sp, #-16]!\n" \ + "stp x20, x21, [sp, #-16]!\n" \ + "stp x22, x23, [sp, #-16]!\n" \ + "stp x24, x25, [sp, #-16]!\n" \ + "stp x26, x27, [sp, #-16]!\n" \ + "stp x28, x29, [sp, #-16]!\n" \ + "stp x30, xzr, [sp, #-16]!\n" + +// Mirrors SAVE_ALL +#define RESTORE_ALL \ + "ldp x30, xzr, [sp], #16\n" \ + "ldp x28, x29, [sp], #16\n" \ + "ldp 
x26, x27, [sp], #16\n" \ + "ldp x24, x25, [sp], #16\n" \ + "ldp x22, x23, [sp], #16\n" \ + "ldp x20, x21, [sp], #16\n" \ + "ldp x18, x19, [sp], #16\n" \ + "ldp x16, x17, [sp], #16\n" \ + "ldp x14, x15, [sp], #16\n" \ + "ldp x12, x13, [sp], #16\n" \ + "ldp x10, x11, [sp], #16\n" \ + "ldp x8, x9, [sp], #16\n" \ + "ldp x6, x7, [sp], #16\n" \ + "ldp x4, x5, [sp], #16\n" \ + "ldp x2, x3, [sp], #16\n" \ + "ldp x0, x1, [sp], #16\n" + +#else #define SAVE_ALL \ "push %%rax\n" \ "push %%rbx\n" \ @@ -75,6 +114,7 @@ typedef int int32_t; "pop %%rcx\n" \ "pop %%rbx\n" \ "pop %%rax\n" +#endif // Functions that are required by freestanding environment. Compiler may // generate calls to these implicitly. @@ -129,6 +169,189 @@ constexpr uint32_t BufSize = 10240; #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) +#if defined (__aarch64__) +// Declare some syscall wrappers we use throughout this code to avoid linking +// against system libc. +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = fd; + register const void *x1 __asm__("x1") = buf; + register uint64_t x2 __asm__("x2") = count; + register uint32_t w8 __asm__("w8") = 63; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = fd; + register const void *x1 __asm__("x1") = buf; + register uint64_t x2 __asm__("x2") = count; + register uint32_t w8 __asm__("w8") = 64; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { + void *ret; + register uint64_t x0 __asm__("x0") = addr; + register uint64_t x1 __asm__("x1") = size; + register uint64_t x2 
__asm__("x2") = prot; + register uint64_t x3 __asm__("x3") = flags; + register uint64_t x4 __asm__("x4") = fd; + register uint64_t x5 __asm__("x5") = offset; + register uint32_t w8 __asm__("w8") = 222; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { + uint64_t ret; + register void *x0 __asm__("x0") = addr; + register uint64_t x1 __asm__("x1") = size; + register uint32_t w8 __asm__("w8") = 215; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __exit(uint64_t code) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = code; + register uint32_t w8 __asm__("w8") = 94; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0) + : "r"(w8) + : "cc", "memory", "x1"); + return ret; +} + +uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { + uint64_t ret; + register int x0 __asm__("x0") = -100; + register const char *x1 __asm__("x1") = pathname; + register uint64_t x2 __asm__("x2") = flags; + register uint64_t x3 __asm__("x3") = mode; + register uint32_t w8 __asm__("w8") = 56; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(w8) + : "cc", "memory"); + return ret; +} + +int __madvise(void *addr, size_t length, int advice) { + int ret; + register void *x0 __asm__("x0") = addr; + register size_t x1 __asm__("x1") = length; + register int x2 __asm__("x2") = advice; + register uint32_t w8 __asm__("w8") = 233; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + register void *x0 __asm__("x0") = addr; + register size_t x1 __asm__("x1") = len; + register int x2 
__asm__("x2") = prot; + register uint32_t w8 __asm__("w8") = 226; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +// Helper functions for writing strings to the .fdata file. We intentionally +// avoid using libc names to make it clear it is our impl. + +/// Compare two strings, at most Num bytes. +int strnCmp(const char *Str1, const char *Str2, size_t Num) { + while (Num && *Str1 && (*Str1 == *Str2)) { + Num--; + Str1++; + Str2++; + } + if (Num == 0) + return 0; + return *(unsigned char *)Str1 - *(unsigned char *)Str2; +} + +uint32_t strLen(const char *Str) { + uint32_t Size = 0; + while (*Str++) + ++Size; + return Size; +} + +/// Write number Num using Base to the buffer in OutBuf, returns a pointer to +/// the end of the string. +char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) { + const char *Chars = "0123456789abcdef"; + char Buf[21]; + char *Ptr = Buf; + while (Num) { + *Ptr++ = *(Chars + (Num % Base)); + Num /= Base; + } + if (Ptr == Buf) { + *OutBuf++ = '0'; + return OutBuf; + } + while (Ptr != Buf) + *OutBuf++ = *--Ptr; + + return OutBuf; +} + +/// Copy Str to OutBuf, returns a pointer to the end of the copied string +char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) { + while (*Str) { + *OutBuf++ = *Str++; + if (--Size <= 0) + return OutBuf; + } + return OutBuf; +} + +void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) { + char Buf[BufSize]; + char *Ptr = Buf; + Ptr = strCopy(Ptr, Msg, BufSize - 23); + Ptr = intToStr(Ptr, Num, Base); + Ptr = strCopy(Ptr, "\n"); + __write(2, Buf, Ptr - Buf); +} + +void reportError(const char *Msg, uint64_t Size) { + __write(2, Msg, Size); + __exit(1); +} +#else uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { uint64_t ret; #if defined(__APPLE__) @@ -550,5 +773,6 @@ public: inline uint64_t alignTo(uint64_t Value, uint64_t Align) { return (Value + Align - 1) / Align * Align; 
} +#endif } // anonymous namespace diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp index 69e1a7e06..385e4d147 100644 --- a/bolt/runtime/hugify.cpp +++ b/bolt/runtime/hugify.cpp @@ -6,26 +6,25 @@ // //===----------------------------------------------------------------------===// -#if defined (__x86_64__) #if !defined(__APPLE__) #include "common.h" #include <sys/mman.h> // Enables a very verbose logging to stderr useful when debugging -//#define ENABLE_DEBUG +// #define ENABLE_DEBUG // Function pointers to init routines in the binary, so we can resume // regular execution of the function that we hooked. extern void (*__bolt_hugify_init_ptr)(); // The __hot_start and __hot_end symbols set by Bolt. We use them to figure -// out the rage for marking huge pages. +// out the range for marking huge pages. extern uint64_t __hot_start; extern uint64_t __hot_end; #ifdef MADV_HUGEPAGE -/// Check whether the kernel supports THP via corresponding sysfs entry. +// Check whether the kernel supports THP via corresponding sysfs entry. static bool has_pagecache_thp_support() { char buf[256] = {0}; const char *madviseStr = "always [madvise] never"; @@ -116,14 +115,22 @@ extern "C" void __bolt_hugify_self_impl() { #endif } -/// This is hooking ELF's entry, it needs to save all machine state. +// This is hooking ELF's entry, it needs to save all machine state. 
extern "C" __attribute((naked)) void __bolt_hugify_self() { +#if defined (__x86_64__) __asm__ __volatile__(SAVE_ALL "call __bolt_hugify_self_impl\n" RESTORE_ALL "jmp *__bolt_hugify_init_ptr(%%rip)\n" :::); -} - +#elif defined (__aarch64__) + __asm__ __volatile__(SAVE_ALL + "bl __bolt_hugify_self_impl\n" + RESTORE_ALL + "ldr x16, =__bolt_hugify_init_ptr\n" + "ldr x16, [x16]\n" + "br x16\n" + :::); #endif +} #endif diff --git a/bolt/test/runtime/AArch64/Inputs/user_func_order.txt b/bolt/test/runtime/AArch64/Inputs/user_func_order.txt new file mode 100644 index 000000000..48b76cd35 --- /dev/null +++ b/bolt/test/runtime/AArch64/Inputs/user_func_order.txt @@ -0,0 +1,2 @@ +main +fib diff --git a/bolt/test/runtime/AArch64/user-func-reorder.c b/bolt/test/runtime/AArch64/user-func-reorder.c new file mode 100644 index 000000000..fcb92bca1 --- /dev/null +++ b/bolt/test/runtime/AArch64/user-func-reorder.c @@ -0,0 +1,44 @@ +/* Checks that BOLT correctly processes a user-provided function list file, + * reorders functions according to this list, updates hot_start and hot_end + * symbols, and inserts a function to perform hot text mapping during program + * startup. + */ +#include <stdio.h> + +int foo(int x) { + return x + 1; +} + +int fib(int x) { + if (x < 2) + return x; + return fib(x - 1) + fib(x - 2); +} + +int bar(int x) { + return x - 1; +} + +int main(int argc, char **argv) { + printf("fib(%d) = %d\n", argc, fib(argc)); + return 0; +} + +/* +REQUIRES: system-linux,bolt-runtime + +RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q + +RUN: llvm-bolt %t.exe --relocs=1 --lite --reorder-functions=user \ +RUN: --hugify --function-order=%p/Inputs/user_func_order.txt -o %t +RUN: llvm-nm --numeric-sort --print-armap %t | \ +RUN: FileCheck %s -check-prefix=CHECK-NM +RUN: %t 1 2 3 | FileCheck %s -check-prefix=CHECK-OUTPUT + +CHECK-NM: W __hot_start +CHECK-NM: T main +CHECK-NM-NEXT: T fib +CHECK-NM-NEXT: W __hot_end + +CHECK-OUTPUT: fib(4) = 3 +*/ -- 2.33.0