!34 rseq c/r support

From: @bb-cat 
Reviewed-by: @hjx_gitff 
Signed-off-by: @snoweay
This commit is contained in:
openeuler-ci-bot 2022-03-14 07:38:35 +00:00 committed by Gitee
commit 5abe2ad6ba
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
16 changed files with 3294 additions and 1 deletions

View File

@ -0,0 +1,74 @@
From ee46b1b5755eacf3be02a67934f0dc690293745b Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 13:28:51 +0800
Subject: [PATCH 02/16] compel: add rseq syscall into compel std plugin syscall
 tables

Add rseq syscall numbers for: arm/aarch64, mips64, ppc64le, s390,
x86_64/x86

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
compel/arch/arm/plugins/std/syscalls/syscall.def | 1 +
compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 1 +
.../compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 +
.../compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 1 +
compel/arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 +
compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 +
6 files changed, 6 insertions(+)
diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
index 1b877d1..bb78cbb 100644
--- a/compel/arch/arm/plugins/std/syscalls/syscall.def
+++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
@@ -119,3 +119,4 @@ clone3 435 435 (struct clone_args *uargs, size_t size)
sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask)
pidfd_open 434 434 (pid_t pid, unsigned int flags)
pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags)
+rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl
index 7a6db19..95dc7d3 100644
--- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl
@@ -115,3 +115,4 @@ __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr
__NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size)
__NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags)
__NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
+__NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
index dd79187..ad0d94f 100644
--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
+++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
+__NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
index 282adaf..916b697 100644
--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
+++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
+__NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
index 3fe3194..90f23d5 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
@@ -103,3 +103,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_f
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
+__NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
index c1d119d..323fab1 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
@@ -114,3 +114,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
+__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
--
2.30.0

View File

@ -0,0 +1,62 @@
From ebd917f395b8bb3c4d6bbe51f9210d1aeca2e1fd Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 13:34:10 +0800
Subject: [PATCH 03/16] kerndat: check for rseq syscall support

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
criu/include/kerndat.h | 1 +
criu/kerndat.c | 18 ++++++++++++++++++
2 files changed, 19 insertions(+)
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
index 80bad7f..44a6976 100644
--- a/criu/include/kerndat.h
+++ b/criu/include/kerndat.h
@@ -74,6 +74,7 @@ struct kerndat_s {
bool has_pidfd_getfd;
bool has_nspid;
bool has_nftables_concat;
+ bool has_rseq;
};
extern struct kerndat_s kdat;
diff --git a/criu/kerndat.c b/criu/kerndat.c
index 0e88ba4..f5a4490 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -816,6 +816,20 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void)
return 0;
}
+static int kerndat_has_rseq(void)
+{
+ if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) {
+ pr_err("rseq should fail\n");
+ return -1;
+ }
+ if (errno == ENOSYS)
+ pr_info("rseq syscall isn't supported\n");
+ else
+ kdat.has_rseq = true;
+
+ return 0;
+}
+
#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat"
#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat"
@@ -1360,6 +1374,10 @@ int kerndat_init(void)
ret = -1;
}
+ if (!ret && kerndat_has_rseq()) {
+ pr_err("kerndat_has_rseq failed when initializing kerndat.\n");
+ ret = -1;
+ }
kerndat_lsm();
kerndat_mmap_min_addr();
kerndat_files_stat();
--
2.30.0

View File

@ -0,0 +1,161 @@
From fe1f84eb98092b1aff60ae2be11e351b165f3f43 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 13:35:53 +0800
Subject: [PATCH 04/16] util: move fork_and_ptrace_attach helper from cr-check

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
criu/cr-check.c | 55 -------------------------------
criu/include/util.h | 1 +
criu/util.c | 57 +++++++++++++++++++++++++++++++++
3 files changed, 58 insertions(+), 55 deletions(-)
diff --git a/criu/cr-check.c b/criu/cr-check.c
index 3575fb3..d41ef8f 100644
--- a/criu/cr-check.c
+++ b/criu/cr-check.c
@@ -537,61 +537,6 @@ static int check_sigqueuinfo(void)
return 0;
}
-static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
-{
- pid_t pid;
- int sk_pair[2], sk;
- char c = 0;
-
- if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
- pr_perror("socketpair");
- return -1;
- }
-
- pid = fork();
- if (pid < 0) {
- pr_perror("fork");
- return -1;
- } else if (pid == 0) {
- sk = sk_pair[1];
- close(sk_pair[0]);
-
- if (child_setup && child_setup() != 0)
- exit(1);
-
- if (write(sk, &c, 1) != 1) {
- pr_perror("write");
- exit(1);
- }
-
- while (1)
- sleep(1000);
- exit(1);
- }
-
- sk = sk_pair[0];
- close(sk_pair[1]);
-
- if (read(sk, &c, 1) != 1) {
- close(sk);
- kill(pid, SIGKILL);
- pr_perror("read");
- return -1;
- }
-
- close(sk);
-
- if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
- pr_perror("Unable to ptrace the child");
- kill(pid, SIGKILL);
- return -1;
- }
-
- waitpid(pid, NULL, 0);
-
- return pid;
-}
-
static int check_ptrace_peeksiginfo(void)
{
struct ptrace_peeksiginfo_args arg;
diff --git a/criu/include/util.h b/criu/include/util.h
index a2dac22..1c0b3c7 100644
--- a/criu/include/util.h
+++ b/criu/include/util.h
@@ -166,6 +166,7 @@ extern int is_anon_link_type(char *link, char *type);
extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags);
extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid);
+extern pid_t fork_and_ptrace_attach(int (*child_setup)(void));
extern int cr_daemon(int nochdir, int noclose, int close_fd);
extern int status_ready(void);
extern int is_root_user(void);
diff --git a/criu/util.c b/criu/util.c
index 06124c2..e682161 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -654,6 +654,63 @@ out:
return ret;
}
+/*
+ * Fork a child that runs @child_setup (if any) and then sleeps forever, and
+ * PTRACE_ATTACH to it. Returns the traced child's pid, or -1 on failure.
+ */
+pid_t fork_and_ptrace_attach(int (*child_setup)(void))
+{
+	pid_t pid;
+	int sk_pair[2];
+	char c = 0;
+
+	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
+		pr_perror("socketpair");
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("fork");
+		close(sk_pair[0]); /* do not leak the pair on failure */
+		close(sk_pair[1]);
+		return -1;
+	} else if (pid == 0) {
+		close(sk_pair[0]);
+		if (child_setup && child_setup() != 0)
+			exit(1);
+
+		/* tell the parent that the setup is done */
+		if (write(sk_pair[1], &c, 1) != 1) {
+			pr_perror("write");
+			exit(1);
+		}
+
+		while (1)
+			sleep(1000);
+	}
+
+	close(sk_pair[1]);
+	if (read(sk_pair[0], &c, 1) != 1) {
+		pr_perror("read"); /* report before cleanup can clobber errno */
+		close(sk_pair[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		return -1;
+	}
+	close(sk_pair[0]);
+
+	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
+		pr_perror("Unable to ptrace the child");
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		return -1;
+	}
+
+	waitpid(pid, NULL, 0);
+	return pid;
+}
+
int status_ready(void)
{
char c = 0;
--
2.30.0

View File

@ -0,0 +1,162 @@
From 3c567693f2e6579109dbabcca0e90c059ce5af25 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:30:18 +0800
Subject: [PATCH 05/16] cr-check: Add ptrace rseq conf dump feature

Add "get_rseq_conf" feature corresponding to the
ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support.

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
compel/include/uapi/ptrace.h | 12 +++++++
criu/cr-check.c | 11 +++++++
criu/include/kerndat.h | 1 +
criu/kerndat.c | 41 ++++++++++++++++++++++++
4 files changed, 65 insertions(+)
diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h
index c5291d2..bfe28c7 100644
--- a/compel/include/uapi/ptrace.h
+++ b/compel/include/uapi/ptrace.h
@@ -65,6 +65,18 @@ typedef struct {
uint64_t flags; /* Output: filter's flags */
} seccomp_metadata_t;
+#ifndef PTRACE_GET_RSEQ_CONFIGURATION
+#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
+
+struct ptrace_rseq_configuration {
+ __u64 rseq_abi_pointer;
+ __u32 rseq_abi_size;
+ __u32 signature;
+ __u32 flags;
+ __u32 pad;
+};
+#endif
+
#ifdef PTRACE_EVENT_STOP
#if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */
#undef PTRACE_EVENT_STOP
diff --git a/criu/cr-check.c b/criu/cr-check.c
index d41ef8f..ba87511 100644
--- a/criu/cr-check.c
+++ b/criu/cr-check.c
@@ -794,6 +794,15 @@ static int check_ptrace_dump_seccomp_filters(void)
return ret;
}
+static int check_ptrace_get_rseq_conf(void)
+{
+ if (!kdat.has_ptrace_get_rseq_conf) {
+ pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. C/R of processes which are using rseq() won't work.\n");
+ return -1;
+ }
+ return 0;
+}
+
static int check_mem_dirty_track(void)
{
if (!kdat.has_dirty_track) {
@@ -1435,6 +1444,7 @@ int cr_check(void)
ret |= check_ns_pid();
ret |= check_apparmor_stacking();
ret |= check_network_lock_nftables();
+ ret |= check_ptrace_get_rseq_conf();
}
/*
@@ -1547,6 +1557,7 @@ static struct feature_list feature_list[] = {
{ "ns_pid", check_ns_pid },
{ "apparmor_stacking", check_apparmor_stacking },
{ "network_lock_nftables", check_network_lock_nftables },
+ { "get_rseq_conf", check_ptrace_get_rseq_conf },
{ NULL, NULL },
};
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
index 44a6976..05abeda 100644
--- a/criu/include/kerndat.h
+++ b/criu/include/kerndat.h
@@ -75,6 +75,7 @@ struct kerndat_s {
bool has_nspid;
bool has_nftables_concat;
bool has_rseq;
+ bool has_ptrace_get_rseq_conf;
};
extern struct kerndat_s kdat;
diff --git a/criu/kerndat.c b/criu/kerndat.c
index f5a4490..4841387 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -4,6 +4,8 @@
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
#include <sys/mman.h>
#include <errno.h>
#include <sys/syscall.h>
@@ -36,6 +38,7 @@
#include "sockets.h"
#include "net.h"
#include "tun.h"
+#include <compel/ptrace.h>
#include <compel/plugins/std/syscall-codes.h>
#include "netfilter.h"
#include "fsnotify.h"
@@ -830,6 +833,40 @@ static int kerndat_has_rseq(void)
return 0;
}
+static int kerndat_has_ptrace_get_rseq_conf(void)
+{
+ pid_t pid;
+ int len;
+ struct ptrace_rseq_configuration rseq;
+
+ pid = fork_and_ptrace_attach(NULL);
+ if (pid < 0)
+ return -1;
+
+ len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq);
+ if (len != sizeof(rseq)) {
+ kdat.has_ptrace_get_rseq_conf = false;
+ pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n");
+ goto out;
+ }
+
+ /*
+ * The kernel currently always reports flags as zero. If that ever changes,
+ * we need to notice it and, possibly, make changes on the CRIU side.
+ */
+ if (rseq.flags != 0) {
+ kdat.has_ptrace_get_rseq_conf = false;
+ pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n");
+ } else {
+ kdat.has_ptrace_get_rseq_conf = true;
+ }
+
+out:
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ return 0;
+}
+
#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat"
#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat"
@@ -1378,6 +1415,10 @@ int kerndat_init(void)
pr_err("kerndat_has_rseq failed when initializing kerndat.\n");
ret = -1;
}
+ if (!ret && kerndat_has_ptrace_get_rseq_conf()) {
+ pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n");
+ ret = -1;
+ }
kerndat_lsm();
kerndat_mmap_min_addr();
kerndat_files_stat();
--
2.30.0

View File

@ -0,0 +1,702 @@
From e444c089ebfb03fb2b6d69a40322d31ab33c0597 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 14:52:35 +0800
Subject: [PATCH 06/16] rseq: initial support

TODO:
1. Properly handle the case when the kernel has rseq() support but has no
   ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support and user processes haven't
   used rseq().
2. Properly handle "transient" states, when CRIU arrives while an rseq
   critical section is being executed. We need a test for this case with
   some "heavy" rseq usage, and we need to properly handle RSEQ_CS_* flags.

Fixes: #1696
Reported-by: Radostin Stoyanov <radostin@redhat.com>
Suggested-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
compel/include/uapi/ptrace.h | 16 +--
criu/cr-dump.c | 99 ++++++++++++++++
criu/cr-restore.c | 17 +++
criu/include/linux/rseq.h | 144 +++++++++++++++++++++++
criu/include/parasite.h | 7 ++
criu/include/restorer.h | 7 ++
criu/kerndat.c | 2 +-
criu/parasite-syscall.c | 11 ++
criu/pie/parasite.c | 99 ++++++++++++++++
criu/pie/restorer.c | 24 ++++
images/Makefile | 1 +
images/core.proto | 2 +
images/rseq.proto | 9 ++
13 files changed, 429 insertions(+), 9 deletions(-)
create mode 100644 criu/include/linux/rseq.h
create mode 100644 images/rseq.proto
diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h
index bfe28c7..d807a92 100644
--- a/compel/include/uapi/ptrace.h
+++ b/compel/include/uapi/ptrace.h
@@ -66,14 +66,14 @@ typedef struct {
} seccomp_metadata_t;
#ifndef PTRACE_GET_RSEQ_CONFIGURATION
-#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
-
-struct ptrace_rseq_configuration {
- __u64 rseq_abi_pointer;
- __u32 rseq_abi_size;
- __u32 signature;
- __u32 flags;
- __u32 pad;
+#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
+
+struct __ptrace_rseq_configuration {
+ uint64_t rseq_abi_pointer;
+ uint32_t rseq_abi_size;
+ uint32_t signature;
+ uint32_t flags;
+ uint32_t pad;
};
#endif
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index f07fe6e..91dd08a 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -45,6 +45,7 @@
#include "proc_parse.h"
#include "parasite.h"
#include "parasite-syscall.h"
+#include <compel/ptrace.h>
#include "files.h"
#include "files-reg.h"
#include "shmem.h"
@@ -200,6 +201,25 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
return 0;
}
+static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq, bool has_tc_rseq_entry)
+{
+ if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf)
+ return 0;
+
+ pr_debug("%d has rseq_inited = %d\n", tid, ti_rseq->rseq_inited);
+
+ /*
+ * The kernel lacks kdat.has_ptrace_get_rseq_conf but the user
+ * process has used rseq(), so we have to fail the dump.
+ */
+ if (ti_rseq->rseq_inited) {
+ pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid);
+ return -1;
+ }
+
+ return 0;
+}
+
struct cr_imgset *glob_imgset;
static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds)
@@ -730,6 +750,17 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
if (!ret)
ret = seccomp_dump_thread(pid, tc);
+ /*
+ * rseq() is dumped in the dump_thread_rseq() function,
+ * *before* the process gets infected (because of a ptrace
+ * request API restriction). At this point, if the kernel lacks
+ * kdat.has_ptrace_get_rseq_conf support, we have to ensure
+ * that the dumped process hasn't initialized rseq(), and
+ * fail the dump if rseq() was used.
+ */
+ if (!ret)
+ ret = check_thread_rseq(pid, &ti->rseq, !!tc->rseq_entry);
+
return ret;
}
@@ -1016,6 +1047,68 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
return 0;
}
+/*
+ * Fetch @tid's rseq registration with PTRACE_GET_RSEQ_CONFIGURATION into a
+ * new RseqEntry stored in *@rseqep. Returns 0 on success, -1 on failure.
+ */
+static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
+{
+	struct __ptrace_rseq_configuration rseq;
+	RseqEntry *rseqe = NULL;
+	int ret;
+
+	/*
+	 * No ptrace support for reading the rseq configuration: nothing is
+	 * dumped here; see check_rseq() and check_thread_rseq().
+	 */
+	if (!kdat.has_ptrace_get_rseq_conf)
+		return 0;
+
+	ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
+	if (ret != sizeof(rseq)) {
+		pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
+		return -1;
+	}
+
+	if (rseq.flags != 0) {
+		pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
+		       rseq.flags);
+		return -1;
+	}
+
+	pr_debug("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature);
+
+	rseqe = xmalloc(sizeof(*rseqe));
+	if (!rseqe)
+		return -1;
+
+	rseq_entry__init(rseqe);
+
+	rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
+	rseqe->rseq_abi_size = rseq.rseq_abi_size;
+	rseqe->signature = rseq.signature;
+
+	*rseqep = rseqe;
+
+	return 0;
+}
+
+static int dump_task_rseq(pid_t pid, struct pstree_item *item)
+{
+ int i;
+
+ /* if rseq() syscall isn't supported then nothing to dump */
+ if (!kdat.has_rseq)
+ return 0;
+
+ for (i = 0; i < item->nr_threads; i++) {
+ if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry))
+ return -1;
+ }
+
+ return 0;
+}
+
static struct proc_pid_stat pps_buf;
static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item)
@@ -1304,6 +1397,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
goto err;
}
+ ret = dump_task_rseq(pid, item);
+ if (ret) {
+ pr_err("Dump %d rseq failed %d\n", pid, ret);
+ goto err;
+ }
+
parasite_ctl = parasite_infect_seized(pid, item, &vmas);
if (!parasite_ctl) {
pr_err("Can't infect (pid: %d) with parasite\n", pid);
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 5b645c1..b2bd044 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2975,6 +2975,19 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
return 0;
}
+static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc)
+{
+ /* compatibility with older CRIU versions */
+ if (!tc->rseq_entry)
+ return 0;
+
+ rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer;
+ rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size;
+ rseq->signature = tc->rseq_entry->signature;
+
+ return 0;
+}
+
static rlim_t decode_rlim(rlim_t ival)
{
return ival == -1 ? RLIM_INFINITY : ival;
@@ -3704,6 +3717,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
core_get_tls(tcore, &thread_args[i].tls);
+ ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core);
+ if (ret)
+ goto err;
+
rst_reloc_creds(&thread_args[i], &creds_pos_next);
thread_args[i].futex_rla = tcore->thread_core->futex_rla;
diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h
new file mode 100644
index 0000000..5c1706a
--- /dev/null
+++ b/criu/include/linux/rseq.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RSEQ_H
+#define _UAPI_LINUX_RSEQ_H
+
+/*
+ * linux/rseq.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+enum rseq_cpu_id_state {
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+enum rseq_cs_flags_bit {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+};
+
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_cs {
+ /* Version of this structure. */
+ __u32 version;
+ /* enum rseq_cs_flags */
+ __u32 flags;
+ __u64 start_ip;
+ /* Offset from start_ip. */
+ __u64 post_commit_offset;
+ __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * struct rseq is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq per thread is allowed.
+ */
+struct rseq {
+ /*
+ * Restartable sequences cpu_id_start field. Updated by the
+ * kernel. Read by user-space with single-copy atomicity
+ * semantics. This field should only be read by the thread which
+ * registered this data structure. Aligned on 32-bit. Always
+ * contains a value in the range of possible CPUs, although the
+ * value may not be the actual current CPU (e.g. if rseq is not
+ * initialized). This CPU number value should always be compared
+ * against the value of the cpu_id field before performing a rseq
+ * commit or returning a value read from a data structure indexed
+ * using the cpu_id_start value.
+ */
+ __u32 cpu_id_start;
+ /*
+ * Restartable sequences cpu_id field. Updated by the kernel.
+ * Read by user-space with single-copy atomicity semantics. This
+ * field should only be read by the thread which registered this
+ * data structure. Aligned on 32-bit. Values
+ * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED
+ * have a special semantic: the former means "rseq uninitialized",
+ * and latter means "rseq initialization failed". This value is
+ * meant to be read within rseq critical sections and compared
+ * with the cpu_id_start value previously read, before performing
+ * the commit instruction, or read and compared with the
+ * cpu_id_start value before returning a value loaded from a data
+ * structure indexed using the cpu_id_start value.
+ */
+ __u32 cpu_id;
+ /*
+ * Restartable sequences rseq_cs field.
+ *
+ * Contains NULL when no critical section is active for the current
+ * thread, or holds a pointer to the currently active struct rseq_cs.
+ *
+ * Updated by user-space, which sets the address of the currently
+ * active rseq_cs at the beginning of assembly instruction sequence
+ * block, and set to NULL by the kernel when it restarts an assembly
+ * instruction sequence block, as well as when the kernel detects that
+ * it is preempting or delivering a signal outside of the range
+ * targeted by the rseq_cs. Also needs to be set to NULL by user-space
+ * before reclaiming memory that contains the targeted struct rseq_cs.
+ *
+ * Read and set by the kernel. Set by user-space with single-copy
+ * atomicity semantics. This field should only be updated by the
+ * thread which registered this data structure. Aligned on 64-bit.
+ */
+ union {
+ __u64 ptr64;
+#ifdef __LP64__
+ __u64 ptr;
+#else
+ struct {
+#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN)
+ __u32 padding; /* Initialized to zero. */
+ __u32 ptr32;
+#else /* LITTLE */
+ __u32 ptr32;
+ __u32 padding; /* Initialized to zero. */
+#endif /* ENDIAN */
+ } ptr;
+#endif
+ } rseq_cs;
+
+ /*
+ * Restartable sequences flags field.
+ *
+ * This field should only be updated by the thread which
+ * registered this data structure. Read by the kernel.
+ * Mainly used for single-stepping through rseq critical sections
+ * with debuggers.
+ *
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+ * Inhibit instruction sequence block restart on preemption
+ * for this thread.
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * Inhibit instruction sequence block restart on signal
+ * delivery for this thread.
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
+ * Inhibit instruction sequence block restart on migration for
+ * this thread.
+ */
+ __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+#endif /* _UAPI_LINUX_RSEQ_H */
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
index 8107aa4..5fde809 100644
--- a/criu/include/parasite.h
+++ b/criu/include/parasite.h
@@ -164,10 +164,17 @@ struct parasite_dump_creds {
unsigned int groups[0];
};
+struct parasite_check_rseq {
+ bool has_rseq;
+ bool has_ptrace_get_rseq_conf; /* no need to check if supported */
+ bool rseq_inited;
+};
+
struct parasite_dump_thread {
unsigned int *tid_addr;
pid_t tid;
tls_t tls;
+ struct parasite_check_rseq rseq;
stack_t sas;
int pdeath_sig;
char comm[TASK_COMM_LEN];
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index c2ef8f0..c29d869 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -45,6 +45,12 @@ struct rst_sched_param {
int prio;
};
+struct rst_rseq_param {
+ u64 rseq_abi_pointer;
+ u32 rseq_abi_size;
+ u32 signature;
+};
+
struct restore_posix_timer {
struct str_posix_timer spt;
struct itimerspec val;
@@ -99,6 +105,7 @@ struct thread_restore_args {
struct task_restore_args *ta;
tls_t tls;
+ struct rst_rseq_param rseq;
siginfo_t *siginfo;
unsigned int siginfo_n;
diff --git a/criu/kerndat.c b/criu/kerndat.c
index 4841387..af7113a 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -837,7 +837,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void)
{
pid_t pid;
int len;
- struct ptrace_rseq_configuration rseq;
+ struct __ptrace_rseq_configuration rseq;
pid = fork_and_ptrace_attach(NULL);
if (pid < 0)
diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
index 7175ade..ee4fa86 100644
--- a/criu/parasite-syscall.c
+++ b/criu/parasite-syscall.c
@@ -132,6 +132,13 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c
return ce->groups ? 0 : -ENOMEM;
}
+static void init_parasite_rseq_arg(struct parasite_check_rseq *rseq)
+{
+ rseq->has_rseq = kdat.has_rseq;
+ rseq->has_ptrace_get_rseq_conf = kdat.has_ptrace_get_rseq_conf;
+ rseq->rseq_inited = false;
+}
+
int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core)
{
ThreadCoreEntry *tc = core->thread_core;
@@ -144,6 +151,8 @@ int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEn
pc = args->creds;
pc->cap_last_cap = kdat.last_cap;
+ init_parasite_rseq_arg(&args->rseq);
+
ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl);
if (ret < 0)
return ret;
@@ -197,6 +206,8 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit
compel_arch_get_tls_thread(tctl, &args->tls);
+ init_parasite_rseq_arg(&args->rseq);
+
ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD);
if (ret) {
pr_err("Can't init thread in parasite %d\n", pid);
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index bc0a33c..e49958b 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -8,6 +8,8 @@
#include <sys/ioctl.h>
#include <sys/uio.h>
+#include "linux/rseq.h"
+
#include "common/config.h"
#include "int.h"
#include "types.h"
@@ -167,6 +169,7 @@ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args)
}
static int dump_creds(struct parasite_dump_creds *args);
+static int check_rseq(struct parasite_check_rseq *rseq);
static int dump_thread_common(struct parasite_dump_thread *ti)
{
@@ -197,6 +200,12 @@ static int dump_thread_common(struct parasite_dump_thread *ti)
goto out;
}
+ ret = check_rseq(&ti->rseq);
+ if (ret) {
+ pr_err("Unable to check if rseq() is initialized: %d\n", ret);
+ goto out;
+ }
+
ret = dump_creds(ti->creds);
out:
return ret;
@@ -313,6 +322,96 @@ grps_err:
return -1;
}
+/*
+ * Detect whether the victim has rseq() registered when ptrace can't tell us.
+ * Sets rseq->rseq_inited from the probe; returns 0 on success (also when no
+ * probe is needed) and -1 on an unexpected mmap or rseq failure.
+ */
+static int check_rseq(struct parasite_check_rseq *rseq)
+{
+	int ret;
+	unsigned long rseq_abi_pointer;
+	unsigned long rseq_abi_size;
+	uint32_t rseq_signature;
+	void *addr;
+
+	/* no need to do hacky check if we can get all info from ptrace() */
+	if (!rseq->has_rseq || rseq->has_ptrace_get_rseq_conf)
+		return 0;
+
+	/*
+	 * There is no kernel interface to query the rseq state here, so
+	 * probe it by trying to register a fresh, correctly sized and
+	 * aligned area. If the task already has current->rseq set, the
+	 * kernel answers:
+	 *   -EINVAL if current->rseq != rseq || rseq_len != sizeof(*rseq),
+	 *   -EPERM  if current->rseq_sig != sig,
+	 *   -EBUSY  if the area, the length and the signature all match.
+	 * If current->rseq == NULL, a misaligned or wrongly sized area
+	 * also yields -EINVAL, which is why the probe arguments must be
+	 * valid: then -EINVAL can only mean "already registered".
+	 *
+	 * Result of the probe:
+	 *   0       - rseq wasn't used; the probe registered it, so
+	 *             unregister it again,
+	 *   -EINVAL - rseq is already initialized. It can't be dumped
+	 *             without has_ptrace_get_rseq_conf, so the dump has
+	 *             to fail later: skipping it would be unsafe,
+	 *   -EPERM, -EBUSY - should not happen with a fresh area.
+	 */
+	/* NOTE(review): confirm sys_mmap() reports failure as MAP_FAILED rather than -errno */
+	addr = (void *)sys_mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		pr_err("mmap() failed for struct rseq ret = %lx\n", (unsigned long)addr);
+		return -1;
+	}
+
+	memset(addr, 0, sizeof(struct rseq));
+
+	/* sys_mmap returns page aligned addresses */
+	rseq_abi_pointer = (unsigned long)addr;
+	rseq_abi_size = (unsigned long)sizeof(struct rseq);
+	/* it's not so important to have unique signature for us,
+	 * because rseq_abi_pointer is guaranteed to be unique
+	 */
+	rseq_signature = 0x12345612;
+
+	pr_info("\ttrying sys_rseq(%lx, %lx, %x, %x)\n", rseq_abi_pointer, rseq_abi_size, 0, rseq_signature);
+	ret = sys_rseq((void *)rseq_abi_pointer, rseq_abi_size, 0, rseq_signature);
+	if (ret) {
+		if (ret == -EINVAL) {
+			pr_info("\trseq is initialized in the victim\n");
+			rseq->rseq_inited = true;
+
+			ret = 0;
+		} else {
+			pr_err("\tunexpected failure of sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer,
+			       rseq_abi_size, 0, rseq_signature, ret);
+
+			ret = -1;
+		}
+	} else {
+		ret = sys_rseq((void *)rseq_abi_pointer, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, rseq_signature);
+		if (ret) {
+			pr_err("\tfailed to unregister sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer,
+			       rseq_abi_size, RSEQ_FLAG_UNREGISTER, rseq_signature, ret);
+
+			ret = -1;
+			goto out;
+		}
+
+		pr_info("\tsys_rseq succeed, let's unregister it back... ok\n");
+		pr_info("\trseq is non-initialized in the victim\n");
+		rseq->rseq_inited = false;
+		ret = 0;
+	}
+
+out:
+	sys_munmap(addr, sizeof(struct rseq));
+	return ret;
+}
+
static int fill_fds_fown(int fd, struct fd_opts *p)
{
int flags, ret;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index fbc89fe..368b5a0 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -459,6 +459,27 @@ static int restore_cpu_affinity(struct task_restore_args *args)
return 0;
}
+static int restore_rseq(struct rst_rseq_param *rseq)
+{
+ int ret;
+
+ if (!rseq->rseq_abi_pointer) {
+ pr_debug("rseq: nothing to restore\n");
+ return 0;
+ }
+
+ pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, rseq->signature);
+
+ ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature);
+ if (ret) {
+ pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer,
+ (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret);
+ return -1;
+ }
+
+ return 0;
+}
+
static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
{
unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
@@ -583,6 +604,9 @@ static int restore_thread_common(struct thread_restore_args *args)
restore_tls(&args->tls);
+ if (restore_rseq(&args->rseq))
+ return -1;
+
return 0;
}
diff --git a/images/Makefile b/images/Makefile
index 2eaeb7c..004e22e 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o
proto-obj-y += bpfmap-file.o
proto-obj-y += bpfmap-data.o
proto-obj-y += apparmor.o
+proto-obj-y += rseq.o
CFLAGS += -iquote $(obj)/
diff --git a/images/core.proto b/images/core.proto
index 39e7f32..b66230e 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -14,6 +14,7 @@ import "timer.proto";
import "creds.proto";
import "sa.proto";
import "siginfo.proto";
+import "rseq.proto";
import "opts.proto";
@@ -106,6 +107,7 @@ message thread_core_entry {
optional string comm = 13;
optional uint64 blk_sigset_extended = 14;
required thread_allowedcpus_entry allowed_cpus = 15;
+ optional rseq_entry rseq_entry = 16;
}
message task_rlimits_entry {
diff --git a/images/rseq.proto b/images/rseq.proto
new file mode 100644
index 0000000..be28004
--- /dev/null
+++ b/images/rseq.proto
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MIT
+
+syntax = "proto2";
+
+message rseq_entry {
+ required uint64 rseq_abi_pointer = 1;
+ required uint32 rseq_abi_size = 2;
+ required uint32 signature = 3;
+}
--
2.30.0

View File

@ -0,0 +1,217 @@
From 5005c08e32dc29dbf0b3a2a582e75d249c190d96 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 14:54:28 +0800
Subject: [PATCH 07/16] zdtm: add simple test for rseq C/R Signed-off-by:
Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
test/zdtm/static/Makefile | 1 +
test/zdtm/static/rseq00.c | 174 +++++++++++++++++++++++
test/zdtm/static/rseq00.desc | 1 +
3 files changed, 176 insertions(+)
create mode 100644 test/zdtm/static/rseq00.c
create mode 100644 test/zdtm/static/rseq00.desc
diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile
index 70123cf..563d947 100644
--- a/test/zdtm/static/Makefile
+++ b/test/zdtm/static/Makefile
@@ -61,6 +61,7 @@ TST_NOFILE := \
pthread02 \
pthread_timers \
pthread_timers_h \
+ rseq00 \
vdso00 \
vdso01 \
vdso02 \
diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c
new file mode 100644
index 0000000..26f41a2
--- /dev/null
+++ b/test/zdtm/static/rseq00.c
@@ -0,0 +1,174 @@
+/*
+ * test for rseq() syscall
+ * See also https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/
+ * https://github.com/torvalds/linux/commit/d7822b1e24f2df5df98c76f0e94a5416349ff759
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <syscall.h>
+
+#include "zdtmtst.h"
+
+#if defined(__x86_64__)
+
+const char *test_doc = "Check that rseq() basic C/R works";
+const char *test_author = "Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>";
+/* parts of code borrowed from https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */
+
+/* some useful definitions from kernel uapi */
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+struct rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ uint64_t rseq_cs;
+ uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
+
+#ifndef __NR_rseq
+#define __NR_rseq 334
+#endif
+/* EOF */
+
+static __thread volatile struct rseq __rseq_abi;
+
+#define RSEQ_SIG 0x53053053
+
+static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig)
+{
+ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
+}
+
+static void register_thread(void)
+{
+ int rc;
+ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
+ if (rc) {
+ fail("Failed to register rseq");
+ exit(1);
+ }
+}
+
+static void unregister_thread(void)
+{
+ int rc;
+ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+ if (rc) {
+ fail("Failed to unregister rseq");
+ exit(1);
+ }
+}
+
+static void check_thread(void)
+{
+ int rc;
+ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
+ if (!(rc && errno == EBUSY)) {
+ fail("Failed to check rseq %d", rc);
+ exit(1);
+ }
+}
+
+#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x))
+
+static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ /* clang-format off */
+ __asm__ __volatile__ goto(
+ ".pushsection __rseq_table, \"aw\"\n\t"
+ ".balign 32\n\t"
+ "cs_obj:\n\t"
+ /* version, flags */
+ ".long 0, 0\n\t"
+ /* start_ip, post_commit_ip, abort_ip */
+ ".quad 1f, (2f-1f), 4f\n\t"
+ ".popsection\n\t"
+ "1:\n\t"
+ "leaq cs_obj(%%rip), %%rax\n\t"
+ "movq %%rax, %[rseq_cs]\n\t"
+ "cmpl %[cpu_id], %[current_cpu_id]\n\t"
+ "jnz 4f\n\t"
+ "addq %[count], %[v]\n\t" /* final store */
+ "2:\n\t"
+ ".pushsection __rseq_failure, \"ax\"\n\t"
+ /* Disassembler-friendly signature: nopl <sig>(%rip). */
+ ".byte 0x0f, 0x1f, 0x05\n\t"
+ ".long 0x53053053\n\t" /* RSEQ_SIG */
+ "4:\n\t"
+ "jmp abort\n\t"
+ ".popsection\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (__rseq_abi.cpu_id),
+ [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "er" (count)
+ : "memory", "cc", "rax"
+ : abort
+ );
+ /* clang-format on */
+
+ return 0;
+abort:
+ return -1;
+}
+
+int main(int argc, char *argv[])
+{
+ int cpu, ret;
+ intptr_t *cpu_data;
+ long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ test_init(argc, argv);
+
+ cpu_data = calloc(nr_cpus, sizeof(*cpu_data));
+ if (!cpu_data) {
+ fail("calloc");
+ exit(EXIT_FAILURE);
+ }
+
+ register_thread();
+
+ test_daemon();
+ test_waitsig();
+
+ check_thread();
+
+ cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start);
+ ret = rseq_addv(&cpu_data[cpu], 2, cpu);
+ if (ret)
+ fail("Failed to increment per-cpu counter");
+ else
+ test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]);
+
+ if (cpu_data[cpu] == 2)
+ pass();
+ else
+ fail();
+
+ return 0;
+}
+
+#else
+
+int main(int argc, char *argv[])
+{
+ test_init(argc, argv);
+ skip("Unsupported arch");
+ return 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/test/zdtm/static/rseq00.desc b/test/zdtm/static/rseq00.desc
new file mode 100644
index 0000000..0324fa3
--- /dev/null
+++ b/test/zdtm/static/rseq00.desc
@@ -0,0 +1 @@
+{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'}
--
2.30.0

View File

@ -0,0 +1,123 @@
From 56fad25776a652e143175a22676a1f909476c880 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 14:57:16 +0800
Subject: [PATCH 08/16] ci: add Fedora Rawhide based test on Cirrus We have
ability to use nested virtualization on Cirrus, and already have "Vagrant
Fedora based test (no VDSO)" test, let's do the same for Fedora Rawhide to
get a fresh kernel.
Suggested-by: Adrian Reber <areber@redhat.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
.cirrus.yml | 21 +++++++++++++++++++++
scripts/ci/Makefile | 7 +++++--
scripts/ci/run-ci-tests.sh | 5 +++++
scripts/ci/vagrant.sh | 21 +++++++++++++++++++++
4 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/.cirrus.yml b/.cirrus.yml
index 671178d..9716e58 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -19,6 +19,27 @@ task:
build_script: |
make -C scripts/ci vagrant-fedora-no-vdso
+task:
+ name: Vagrant Fedora Rawhide based test
+ environment:
+ HOME: "/root"
+ CIRRUS_WORKING_DIR: "/tmp/criu"
+
+ compute_engine_instance:
+ image_project: cirrus-images
+ image: family/docker-kvm
+ platform: linux
+ cpu: 4
+ memory: 16G
+ nested_virtualization: true
+
+ setup_script: |
+ scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker
+ sudo kvm-ok
+ ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto
+ build_script: |
+ make -C scripts/ci vagrant-fedora-rawhide
+
task:
name: CentOS 8 based test
environment:
diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile
index 02b4d87..9c9264d 100644
--- a/scripts/ci/Makefile
+++ b/scripts/ci/Makefile
@@ -41,7 +41,7 @@ export CONTAINER_TERMINAL
ifeq ($(UNAME),x86_64)
# On anything besides x86_64 Travis is running unprivileged LXD
# containers which do not support running docker with '--privileged'.
- CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged -v /lib/modules:/lib/modules --tmpfs /run
+ CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run
else
CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run
endif
@@ -92,7 +92,10 @@ setup-vagrant:
vagrant-fedora-no-vdso: setup-vagrant
./vagrant.sh fedora-no-vdso
-.PHONY: setup-vagrant vagrant-fedora-no-vdso
+vagrant-fedora-rawhide: setup-vagrant
+ ./vagrant.sh fedora-rawhide
+
+.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide
%:
$(MAKE) -C ../build $@$(target-suffix)
diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh
index 7c66e68..95b4ec7 100755
--- a/scripts/ci/run-ci-tests.sh
+++ b/scripts/ci/run-ci-tests.sh
@@ -194,6 +194,11 @@ if [ "${STREAM_TEST}" = "1" ]; then
exit 0
fi
+# print some useful debug info
+cat /proc/self/status
+ls -la /proc/self/ns
+cat /proc/self/cgroup
+
# shellcheck disable=SC2086
./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS
diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh
index 839b100..f961b8d 100755
--- a/scripts/ci/vagrant.sh
+++ b/scripts/ci/vagrant.sh
@@ -58,4 +58,25 @@ fedora-no-vdso() {
ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2'
}
+fedora-rawhide() {
+ #ssh default sudo grubby --update-kernel ALL --args="selinux=0 systemd.unified_cgroup_hierarchy=0"
+ ssh default sudo grubby --update-kernel ALL
+ #
+ # Workaround the problem:
+ # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected
+ # Let's just use runc instead of crun
+ # see also https://github.com/kata-containers/tests/issues/4283
+ #
+ ssh default 'sudo dnf remove -y crun || true'
+ ssh default sudo dnf install -y podman runc
+ vagrant reload
+ #ssh default sudo setenforce 0
+ ssh default cat /proc/cmdline
+ ssh default ls -la /proc/self/ns
+ ssh default sudo cat /proc/self/status
+ ssh default sudo cat /proc/self/cgroup
+ #ssh default sudo capsh --print
+ ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"'
+}
+
$1
--
2.30.0

View File

@ -0,0 +1,244 @@
From 99da2f789ca92aa52eeca07b97aee2cbd3d60fca Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:00:07 +0800
Subject: [PATCH 09/16] include: add thread_pointer.h from Glibc Implementation
was taken from the Glibc.
https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=8dbeb0561eeb876f557ac9eef5721912ec074ea5
https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=cb976fba4c51ede7bf8cee5035888527c308dfbc
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
.../arch/aarch64/include/asm/thread_pointer.h | 27 ++++++++++++++
.../arch/arm/include/asm/thread_pointer.h | 27 ++++++++++++++
.../arch/mips/include/asm/thread_pointer.h | 27 ++++++++++++++
.../arch/ppc64/include/asm/thread_pointer.h | 33 +++++++++++++++++
.../arch/s390/include/asm/thread_pointer.h | 27 ++++++++++++++
.../arch/x86/include/asm/thread_pointer.h | 37 +++++++++++++++++++
6 files changed, 178 insertions(+)
create mode 100644 criu/arch/aarch64/include/asm/thread_pointer.h
create mode 100644 criu/arch/arm/include/asm/thread_pointer.h
create mode 100644 criu/arch/mips/include/asm/thread_pointer.h
create mode 100644 criu/arch/ppc64/include/asm/thread_pointer.h
create mode 100644 criu/arch/s390/include/asm/thread_pointer.h
create mode 100644 criu/arch/x86/include/asm/thread_pointer.h
diff --git a/criu/arch/aarch64/include/asm/thread_pointer.h b/criu/arch/aarch64/include/asm/thread_pointer.h
new file mode 100644
index 0000000..f7e0706
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/thread_pointer.h
@@ -0,0 +1,27 @@
+/* __thread_pointer definition. Generic version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+static inline void *__criu_thread_pointer(void)
+{
+ return __builtin_thread_pointer();
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
diff --git a/criu/arch/arm/include/asm/thread_pointer.h b/criu/arch/arm/include/asm/thread_pointer.h
new file mode 100644
index 0000000..f7e0706
--- /dev/null
+++ b/criu/arch/arm/include/asm/thread_pointer.h
@@ -0,0 +1,27 @@
+/* __thread_pointer definition. Generic version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+static inline void *__criu_thread_pointer(void)
+{
+ return __builtin_thread_pointer();
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
diff --git a/criu/arch/mips/include/asm/thread_pointer.h b/criu/arch/mips/include/asm/thread_pointer.h
new file mode 100644
index 0000000..f7e0706
--- /dev/null
+++ b/criu/arch/mips/include/asm/thread_pointer.h
@@ -0,0 +1,27 @@
+/* __thread_pointer definition. Generic version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+static inline void *__criu_thread_pointer(void)
+{
+ return __builtin_thread_pointer();
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
diff --git a/criu/arch/ppc64/include/asm/thread_pointer.h b/criu/arch/ppc64/include/asm/thread_pointer.h
new file mode 100644
index 0000000..304516f
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/thread_pointer.h
@@ -0,0 +1,33 @@
+/* __thread_pointer definition. powerpc version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+#ifdef __powerpc64__
+register void *__thread_register asm("r13");
+#else
+register void *__thread_register asm("r2");
+#endif
+
+static inline void *__criu_thread_pointer(void)
+{
+ return __thread_register;
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
\ No newline at end of file
diff --git a/criu/arch/s390/include/asm/thread_pointer.h b/criu/arch/s390/include/asm/thread_pointer.h
new file mode 100644
index 0000000..f7e0706
--- /dev/null
+++ b/criu/arch/s390/include/asm/thread_pointer.h
@@ -0,0 +1,27 @@
+/* __thread_pointer definition. Generic version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+static inline void *__criu_thread_pointer(void)
+{
+ return __builtin_thread_pointer();
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
diff --git a/criu/arch/x86/include/asm/thread_pointer.h b/criu/arch/x86/include/asm/thread_pointer.h
new file mode 100644
index 0000000..08603ae
--- /dev/null
+++ b/criu/arch/x86/include/asm/thread_pointer.h
@@ -0,0 +1,37 @@
+/* __thread_pointer definition. x86 version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SYS_THREAD_POINTER_H
+#define _SYS_THREAD_POINTER_H
+
+static inline void *__criu_thread_pointer(void)
+{
+#if __GNUC_PREREQ(11, 1)
+ return __builtin_thread_pointer();
+#else
+ void *__result;
+#ifdef __x86_64__
+ __asm__("mov %%fs:0, %0" : "=r"(__result));
+#else
+ __asm__("mov %%gs:0, %0" : "=r"(__result));
+#endif
+ return __result;
+#endif /* !GCC 11 */
+}
+
+#endif /* _SYS_THREAD_POINTER_H */
\ No newline at end of file
--
2.30.0

View File

@ -0,0 +1,102 @@
From d43ad9913c19afa6d80cb8124015d47361152db8 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:00:43 +0800
Subject: [PATCH 10/16] clone-noasan: unregister rseq at the thread start for
new glibc Fresh glibc does rseq registration by default during
start_thread(). [ see
https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=95e114a0919d844d8fe07839cb6538b7f5ee920e
]
This causes process crashes during the memory restore procedure, because
memory which corresponds to the struct rseq will be overwritten.
See also
("nptl: Add public rseq symbols and <sys/rseq.h>")
https://sourceware.org/git?p=glibc.git;a=commit;h=c901c3e764d7c7079f006b4e21e877d5036eb4f5
("nptl: Add <thread_pointer.h> for defining __thread_pointer")
https://sourceware.org/git?p=glibc.git;a=commit;h=8dbeb0561eeb876f557ac9eef5721912ec074ea5
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
criu/clone-noasan.c | 42 +++++++++++++++++++++++++++++++--
1 file changed, 40 insertions(+), 2 deletions(-)
diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c
index d657ea2..5f8dd1b 100644
--- a/criu/clone-noasan.c
+++ b/criu/clone-noasan.c
@@ -2,6 +2,13 @@
#include <sched.h>
#include <unistd.h>
+#ifdef __has_include
+#if __has_include ("sys/rseq.h")
+#include <sys/rseq.h>
+#include "asm/thread_pointer.h"
+#endif
+#endif
+
#include <compel/plugins/std/syscall-codes.h>
#include "sched.h"
@@ -34,16 +41,45 @@
* ... wait for process to finish ...
* unlock_last_pid
*/
+
+#if defined(RSEQ_SIG)
+static inline void unregister_glibc_rseq(void)
+{
+ /* unregister rseq */
+ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG);
+}
+#else
+static inline void unregister_glibc_rseq(void)
+{
+}
+#endif
+
+struct call_fn_args {
+ int (*fn)(void *);
+ void *arg;
+};
+
+int call_fn(void *arg)
+{
+ struct call_fn_args *cargs = arg;
+ unregister_glibc_rseq();
+ return cargs->fn(cargs->arg);
+}
+
int clone_noasan(int (*fn)(void *), int flags, void *arg)
{
void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16);
+ struct call_fn_args a = {
+ .fn = fn,
+ .arg = arg,
+ };
BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK));
/*
* Reserve some bytes for clone() internal needs
* and use as stack the address above this area.
*/
- return clone(fn, stack_ptr, flags, arg);
+ return clone(call_fn, stack_ptr, flags, (void *)&a);
}
int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid)
@@ -78,7 +114,9 @@ int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_sig
c_args.set_tid = ptr_to_u64(&pid);
c_args.set_tid_size = 1;
pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
- if (pid == 0)
+ if (pid == 0) {
+ unregister_glibc_rseq();
exit(fn(arg));
+ }
return pid;
}
--
2.30.0

View File

@ -0,0 +1,158 @@
From 4f4d5acc34046954aea9e8ea10b5f71ff5f0fbd5 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:01:34 +0800
Subject: [PATCH 11/16] zdtm/static/rseq00: fix rseq test when linking with a
fresh Glibc Fresh Glibc does rseq() register by default. We need to
unregister rseq before registering our own.
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
test/zdtm/static/rseq00.c | 76 ++++++++++++++++++++-------
1 file changed, 58 insertions(+), 18 deletions(-)
diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c
index 26f41a2..87053b8 100644
--- a/test/zdtm/static/rseq00.c
+++ b/test/zdtm/static/rseq00.c
@@ -19,13 +19,48 @@
#include "zdtmtst.h"
-#if defined(__x86_64__)
+#ifdef __has_include
+#if __has_include("sys/rseq.h")
+#include <sys/rseq.h>
+#endif
+#endif
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#if defined(RSEQ_SIG)
+static inline void *__criu_thread_pointer(void)
+{
+#if __GNUC_PREREQ(11, 1)
+ return __builtin_thread_pointer();
+#else
+ void *__result;
+#ifdef __x86_64__
+ __asm__("mov %%fs:0, %0" : "=r"(__result));
+#else
+ __asm__("mov %%gs:0, %0" : "=r"(__result));
+#endif
+ return __result;
+#endif /* !GCC 11 */
+}
+
+static inline void unregister_glibc_rseq(void)
+{
+ /* unregister rseq */
+ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG);
+}
+#else
+static inline void unregister_glibc_rseq(void)
+{
+}
+#endif
const char *test_doc = "Check that rseq() basic C/R works";
const char *test_author = "Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>";
/* parts of code borrowed from https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */
/* some useful definitions from kernel uapi */
+#ifndef RSEQ_SIG
+
enum rseq_flags {
RSEQ_FLAG_UNREGISTER = (1 << 0),
};
@@ -37,14 +72,21 @@ struct rseq {
uint32_t flags;
} __attribute__((aligned(4 * sizeof(uint64_t))));
+#define RSEQ_SIG 0x53053053
+
+#endif
+
#ifndef __NR_rseq
#define __NR_rseq 334
#endif
/* EOF */
-static __thread volatile struct rseq __rseq_abi;
+#define RSEQ_TLS_ALLOC 0
-#define RSEQ_SIG 0x53053053
+static volatile struct rseq *rseq_ptr;
+#if RSEQ_TLS_ALLOC
+static __thread volatile struct rseq __rseq_abi;
+#endif
static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig)
{
@@ -54,27 +96,18 @@ static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags
static void register_thread(void)
{
int rc;
- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
+ unregister_glibc_rseq();
+ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG);
if (rc) {
fail("Failed to register rseq");
exit(1);
}
}
-static void unregister_thread(void)
-{
- int rc;
- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
- if (rc) {
- fail("Failed to unregister rseq");
- exit(1);
- }
-}
-
static void check_thread(void)
{
int rc;
- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
+ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG);
if (!(rc && errno == EBUSY)) {
fail("Failed to check rseq %d", rc);
exit(1);
@@ -111,8 +144,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
".popsection\n\t"
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_ptr->cpu_id),
+ [rseq_cs] "m" (rseq_ptr->rseq_cs),
/* final store input */
[v] "m" (*v),
[count] "er" (count)
@@ -132,6 +165,13 @@ int main(int argc, char *argv[])
intptr_t *cpu_data;
long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+#if RSEQ_TLS_ALLOC
+ rseq_ptr = &__rseq_abi;
+#else
+ //rseq_ptr = malloc(sizeof(struct rseq));
+ rseq_ptr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, 0, 0);
+#endif
+
test_init(argc, argv);
cpu_data = calloc(nr_cpus, sizeof(*cpu_data));
@@ -147,7 +187,7 @@ int main(int argc, char *argv[])
check_thread();
- cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start);
+ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start);
ret = rseq_addv(&cpu_data[cpu], 2, cpu);
if (ret)
fail("Failed to increment per-cpu counter");
--
2.30.0

View File

@ -0,0 +1,265 @@
From 06cb51057ce1cc31b79c6321273dfa0b4cb7f980 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:02:08 +0800
Subject: [PATCH 12/16] compel: add helpers to get/set instruction pointer
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
.../src/lib/include/uapi/asm/infect-types.h | 9 +++++----
.../src/lib/include/uapi/asm/infect-types.h | 9 +++++----
.../src/lib/include/uapi/asm/infect-types.h | 9 +++++----
.../src/lib/include/uapi/asm/infect-types.h | 9 +++++----
.../src/lib/include/uapi/asm/infect-types.h | 7 ++++---
.../src/lib/include/uapi/asm/infect-types.h | 9 +++++----
compel/include/uapi/infect.h | 6 ++++++
compel/src/lib/infect.c | 20 +++++++++++++++++++
.../criu/arch/aarch64/include/asm/types.h | 2 ++
criu/arch/arm/include/asm/types.h | 2 ++
.../criu/arch/mips/include/asm/types.h | 2 ++
.../criu/arch/ppc64/include/asm/types.h | 2 ++
.../criu/arch/s390/include/asm/types.h | 2 ++
criu/arch/x86/include/asm/types.h | 2 ++
14 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h
index f91e73d..9d4ce7e 100644
--- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h
@@ -23,10 +23,11 @@ typedef struct user_fpsimd_state user_fpregs_struct_t;
#define compel_arch_get_tls_task(ctl, tls)
#define compel_arch_get_tls_thread(tctl, tls)
-#define REG_RES(r) ((uint64_t)(r).regs[0])
-#define REG_IP(r) ((uint64_t)(r).pc)
-#define REG_SP(r) ((uint64_t)((r).sp))
-#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8])
+#define REG_RES(r) ((uint64_t)(r).regs[0])
+#define REG_IP(r) ((uint64_t)(r).pc)
+#define SET_REG_IP(r, val) ((r).pc = (val))
+#define REG_SP(r) ((uint64_t)((r).sp))
+#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8])
#define user_regs_native(pregs) true
diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h
index 159b6a9..8d32825 100644
--- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h
@@ -56,10 +56,11 @@ struct user_vfp_exc {
unsigned long fpinst2;
};
-#define REG_RES(regs) ((regs).ARM_r0)
-#define REG_IP(regs) ((regs).ARM_pc)
-#define REG_SP(regs) ((regs).ARM_sp)
-#define REG_SYSCALL_NR(regs) ((regs).ARM_r7)
+#define REG_RES(regs) ((regs).ARM_r0)
+#define REG_IP(regs) ((regs).ARM_pc)
+#define SET_REG_IP(regs, val) ((regs).ARM_pc = (val))
+#define REG_SP(regs) ((regs).ARM_sp)
+#define REG_SYSCALL_NR(regs) ((regs).ARM_r7)
#define user_regs_native(pregs) true
diff --git a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h
index 70b3f85..481566a 100644
--- a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h
@@ -56,10 +56,11 @@ static inline bool user_regs_native(user_regs_struct_t *pregs)
#define compel_arch_get_tls_task(ctl, tls)
#define compel_arch_get_tls_thread(tctl, tls)
-#define REG_RES(regs) ((regs).MIPS_v0)
-#define REG_IP(regs) ((regs).cp0_epc)
-#define REG_SP(regs) ((regs).MIPS_sp)
-#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0)
+#define REG_RES(regs) ((regs).MIPS_v0)
+#define REG_IP(regs) ((regs).cp0_epc)
+#define SET_REG_IP(regs, val) ((regs).cp0_epc = (val))
+#define REG_SP(regs) ((regs).MIPS_sp)
+#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0)
//#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall)
#define __NR(syscall, compat) __NR_##syscall
diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h
index fe6192e..bf2cc95 100644
--- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h
@@ -72,10 +72,11 @@ typedef struct {
} tm;
} user_fpregs_struct_t;
-#define REG_RES(regs) ((uint64_t)(regs).gpr[3])
-#define REG_IP(regs) ((uint64_t)(regs).nip)
-#define REG_SP(regs) ((uint64_t)(regs).gpr[1])
-#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0])
+#define REG_RES(regs) ((uint64_t)(regs).gpr[3])
+#define REG_IP(regs) ((uint64_t)(regs).nip)
+#define SET_REG_IP(regs, val) ((regs).nip = (val))
+#define REG_SP(regs) ((uint64_t)(regs).gpr[1])
+#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0])
#define user_regs_native(pregs) true
diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h
index 896d70e..87283bc 100644
--- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h
@@ -62,9 +62,10 @@ typedef struct {
uint32_t system_call;
} user_regs_struct_t;
-#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2])
-#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr)
-#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15])
+#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2])
+#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr)
+#define SET_REG_IP(r, val) ((r).prstatus.psw.addr = (val))
+#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15])
/*
* We assume that REG_SYSCALL_NR() is only used for pie code where we
* always use svc 0 with opcode in %r1.
diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h
index 34b3ad0..b35504f 100644
--- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h
+++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h
@@ -127,10 +127,11 @@ typedef struct {
typedef struct xsave_struct user_fpregs_struct_t;
-#define REG_RES(regs) get_user_reg(&regs, ax)
-#define REG_IP(regs) get_user_reg(&regs, ip)
-#define REG_SP(regs) get_user_reg(&regs, sp)
-#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax)
+#define REG_RES(regs) get_user_reg(&regs, ax)
+#define REG_IP(regs) get_user_reg(&regs, ip)
+#define SET_REG_IP(regs, val) set_user_reg(&regs, ip, val)
+#define REG_SP(regs) get_user_reg(&regs, sp)
+#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax)
#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall)
diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h
index c3d2ee6..389878e 100644
--- a/compel/include/uapi/infect.h
+++ b/compel/include/uapi/infect.h
@@ -168,4 +168,10 @@ extern unsigned long compel_task_size(void);
extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl);
extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl);
+extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl);
+extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl);
+
+void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v);
+void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v);
+
#endif
diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c
index 0fb9e71..6a13cc1 100644
--- a/compel/src/lib/infect.c
+++ b/compel/src/lib/infect.c
@@ -1686,3 +1686,23 @@ uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl)
{
return REG_SP(tctl->th.regs);
}
+
+uint64_t compel_get_leader_ip(struct parasite_ctl *ctl)
+{
+ return REG_IP(ctl->orig.regs);
+}
+
+uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl)
+{
+ return REG_IP(tctl->th.regs);
+}
+
+void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v)
+{
+ SET_REG_IP(ctl->orig.regs, v);
+}
+
+void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v)
+{
+ SET_REG_IP(tctl->th.regs, v);
+}
diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h
index c860af1..363c1ca 100644
--- a/criu/arch/aarch64/include/asm/types.h
+++ b/criu/arch/aarch64/include/asm/types.h
@@ -22,6 +22,8 @@ typedef UserAarch64RegsEntry UserRegsEntry;
#define TI_SP(core) ((core)->ti_aarch64->gpregs->sp)
+#define TI_IP(core) ((core)->ti_aarch64->gpregs->pc)
+
static inline void *decode_pointer(uint64_t v)
{
return (void *)v;
diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h
index cfcb8a1..93d2dc2 100644
--- a/criu/arch/arm/include/asm/types.h
+++ b/criu/arch/arm/include/asm/types.h
@@ -21,6 +21,8 @@ typedef UserArmRegsEntry UserRegsEntry;
#define TI_SP(core) ((core)->ti_arm->gpregs->sp)
+#define TI_IP(core) ((core)->ti_arm->gpregs->pc) /* program counter, matching compel's REG_IP (ARM_pc); ARM "ip" is r12 */
+
static inline void *decode_pointer(u64 v)
{
return (void *)(u32)v;
diff --git a/criu/arch/mips/include/asm/types.h b/criu/arch/mips/include/asm/types.h
index 237471f..2c75b6a 100644
--- a/criu/arch/mips/include/asm/types.h
+++ b/criu/arch/mips/include/asm/types.h
@@ -18,6 +18,8 @@
#define CORE_THREAD_ARCH_INFO(core) core->ti_mips
+#define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc)
+
typedef UserMipsRegsEntry UserRegsEntry;
static inline u64 encode_pointer(void *p)
diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h
index fedeff2..d60aadd 100644
--- a/criu/arch/ppc64/include/asm/types.h
+++ b/criu/arch/ppc64/include/asm/types.h
@@ -19,6 +19,8 @@ typedef UserPpc64RegsEntry UserRegsEntry;
#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64
+#define TI_IP(core) ((core)->ti_ppc64->gpregs->nip)
+
static inline void *decode_pointer(uint64_t v)
{
return (void *)v;
diff --git a/criu/arch/s390/include/asm/types.h b/criu/arch/s390/include/asm/types.h
index 7522cf2..abf12de 100644
--- a/criu/arch/s390/include/asm/types.h
+++ b/criu/arch/s390/include/asm/types.h
@@ -19,6 +19,8 @@ typedef UserS390RegsEntry UserRegsEntry;
#define CORE_THREAD_ARCH_INFO(core) core->ti_s390
+#define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr)
+
static inline u64 encode_pointer(void *p)
{
return (u64)p;
diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h
index a0a8ed9..8919d0a 100644
--- a/criu/arch/x86/include/asm/types.h
+++ b/criu/arch/x86/include/asm/types.h
@@ -28,6 +28,8 @@ static inline int core_is_compat(CoreEntry *c)
#define CORE_THREAD_ARCH_INFO(core) core->thread_info
+#define TI_IP(core) ((core)->thread_info->gpregs->ip)
+
typedef UserX86RegsEntry UserRegsEntry;
static inline u64 encode_pointer(void *p)
--
2.30.0

View File

@ -0,0 +1,248 @@
From 33abfc12b973560b3d98afdbac7554b8c0542c3d Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:04:54 +0800
Subject: [PATCH 13/16] cr-dump: fixup thread IP when inside rseq cs
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
criu/cr-dump.c | 155 +++++++++++++++++++++++++++-
criu/include/parasite.h | 2 +
criu/include/pstree.h | 1 +
3 files changed, 154 insertions(+), 4 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 91dd08a..a3f8973 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1047,11 +1047,58 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
return 0;
}
-static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
+static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs)
+{
+ int ret;
+ uint64_t addr;
+
+ /* rseq is not registered */
+ if (!rseq->rseq_abi_pointer)
+ return 0;
+
+ /*
+ * We need to cover the case when victim process was inside rseq critical section
+ * at the moment when CRIU comes and seized it. We need to determine the borders
+ * of rseq critical section at first. To achieve that we need to access thread
+ * memory and read pointer to struct rseq_cs.
+ *
+ * We have two ways to access thread memory: from the parasite and using ptrace().
+ * But in this case we can't use parasite, because if victim process returns to the
+ * execution, on the kernel side __rseq_handle_notify_resume hook will be called,
+ * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
+ * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
+ */
+ ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)),
+ sizeof(uint64_t));
+ if (ret) {
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr,
+ (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t));
+ return -1;
+ }
+
+ /* (struct rseq)->rseq_cs is NULL */
+ if (!addr)
+ return 0;
+
+ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs));
+ if (ret) {
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
+ (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int dump_thread_rseq(struct pstree_item *item, int i)
{
struct __ptrace_rseq_configuration rseq;
RseqEntry *rseqe = NULL;
int ret;
+ CoreEntry *core = item->core[i];
+ RseqEntry **rseqep = &core->thread_core->rseq_entry;
+ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
+ pid_t tid = item->threads[i].real;
/*
* If we are here it means that rseq() syscall is supported,
@@ -1076,7 +1123,8 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
return -1;
}
- pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature);
+ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
+ rseq.signature);
rseqe = xmalloc(sizeof(*rseqe));
if (!rseqe)
@@ -1088,25 +1136,118 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
rseqe->rseq_abi_size = rseq.rseq_abi_size;
rseqe->signature = rseq.signature;
+ if (read_rseq_cs(tid, &rseq, rseq_cs))
+ goto err;
+
+ /* save rseq entry to the image */
*rseqep = rseqe;
return 0;
+
+err:
+ xfree(rseqe);
+ return -1;
}
static int dump_task_rseq(pid_t pid, struct pstree_item *item)
{
int i;
+ struct rseq_cs *thread_rseq_cs;
/* if rseq() syscall isn't supported then nothing to dump */
if (!kdat.has_rseq)
return 0;
+ thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads);
+ if (!thread_rseq_cs)
+ return -1;
+
+ dmpi(item)->thread_rseq_cs = thread_rseq_cs;
+
for (i = 0; i < item->nr_threads; i++) {
- if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry))
- return -1;
+ if (dump_thread_rseq(item, i))
+ goto free_rseq;
}
return 0;
+
+free_rseq:
+ xfree(thread_rseq_cs);
+ dmpi(item)->thread_rseq_cs = NULL;
+ return -1;
+}
+
+static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr)
+{
+ return addr - rseq_cs->start_ip < rseq_cs->post_commit_offset; /* [start_ip, start_ip + post_commit_offset), safe against unsigned wrap-around */
+}
+
+static int fixup_thread_rseq(struct pstree_item *item, int i)
+{
+ CoreEntry *core = item->core[i];
+ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
+ pid_t tid = item->threads[i].real;
+
+ /* (struct rseq)->rseq_cs is NULL */
+ if (!rseq_cs->start_ip)
+ return 0;
+
+ pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
+ tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
+ rseq_cs->version, (unsigned long)TI_IP(core));
+
+ if (rseq_cs->version != 0) {
+ pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version);
+ return -1;
+ }
+
+ if (task_in_rseq(rseq_cs, TI_IP(core))) {
+ struct pid *tid = &item->threads[i];
+
+ pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
+ tid->real);
+
+ /*
+ * We need to fixup task instruction pointer from
+ * the original one (which lays inside rseq critical section)
+ * to rseq abort handler address.
+ *
+ * It's worth to mention that we need to fixup IP in CoreEntry
+ * (used when full dump/restore is performed) and also in
+ * the parasite regs storage (used if --leave-running option is used,
+ * or if dump error occured and process execution is resumed).
+ */
+ TI_IP(core) = rseq_cs->abort_ip;
+
+ if (item->pid->real == tid->real) {
+ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
+ } else {
+ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
+ }
+ }
+
+ return 0;
+}
+
+/* Run fixup_thread_rseq() on every thread of the task, then release the per-thread rseq_cs cache. */
+static int fixup_task_rseq(pid_t pid, struct pstree_item *item)
+{
+ int ret = 0;
+ int i;
+
+ if (kdat.has_ptrace_get_rseq_conf && dmpi(item)->thread_rseq_cs) {
+ for (i = 0; i < item->nr_threads; i++) {
+ if (fixup_thread_rseq(item, i)) {
+ ret = -1;
+ break;
+ }
+ }
+ }
+
+ /* dump_task_rseq() allocates the cache whenever kdat.has_rseq is set, so free it on every path */
+ xfree(dmpi(item)->thread_rseq_cs);
+ dmpi(item)->thread_rseq_cs = NULL;
+ return ret;
+}
static struct proc_pid_stat pps_buf;
@@ -1409,6 +1550,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
goto err;
}
+ ret = fixup_task_rseq(pid, item);
+ if (ret) {
+ pr_err("Fixup rseq for %d failed %d\n", pid, ret);
+ goto err;
+ }
+
if (fault_injected(FI_DUMP_EARLY)) {
pr_info("fault: CRIU sudden detach\n");
kill(getpid(), SIGKILL);
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
index 5fde809..d2a0688 100644
--- a/criu/include/parasite.h
+++ b/criu/include/parasite.h
@@ -10,6 +10,8 @@
#include <time.h>
#include <signal.h>
+#include "linux/rseq.h"
+
#include "image.h"
#include "util-pie.h"
#include "common/lock.h"
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
index c5b0fa7..458e5f9 100644
--- a/criu/include/pstree.h
+++ b/criu/include/pstree.h
@@ -63,6 +63,7 @@ struct dmp_info {
struct parasite_ctl *parasite_ctl;
struct parasite_thread_ctl **thread_ctls;
uint64_t *thread_sp;
+ struct rseq_cs *thread_rseq_cs;
/*
* Although we don't support dumping different struct creds in general,
--
2.30.0

View File

@ -0,0 +1,250 @@
From f76aa4ade354649e3291b5e7274c368740b05417 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:05:34 +0800
Subject: [PATCH 14/16] zdtm: add rseq transition test for amd64 Signed-off-by:
Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
test/zdtm/transition/Makefile | 1 +
test/zdtm/transition/rseq01.c | 208 +++++++++++++++++++
test/zdtm/transition/rseq01.desc | 1 +
3 files changed, 210 insertions(+)
create mode 100644 test/zdtm/transition/rseq01.c
create mode 100644 test/zdtm/transition/rseq01.desc
diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile
index 9388157..fae4e27 100644
--- a/test/zdtm/transition/Makefile
+++ b/test/zdtm/transition/Makefile
@@ -23,6 +23,7 @@ TST_NOFILE = \
lazy-thp \
pid_reuse \
pidfd_store_sk \
+ rseq01 \
TST_FILE = \
diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c
new file mode 100644
index 0000000..5fac5a6
--- /dev/null
+++ b/test/zdtm/transition/rseq01.c
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <syscall.h>
+
+#include "zdtmtst.h"
+
+#ifdef __has_include
+# if __has_include ("sys/rseq.h")
+# include <sys/rseq.h>
+# endif
+#endif
+
+#if defined(__x86_64__)
+
+#if defined(__x86_64__) && defined(RSEQ_SIG)
+static inline void *thread_pointer(void)
+{
+ void *result;
+ asm("mov %%fs:0, %0" : "=r"(result));
+ return result;
+}
+
+static inline void unregister_old_rseq(void)
+{
+ /* unregister rseq */
+ syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG);
+}
+#else
+static inline void unregister_old_rseq(void)
+{
+}
+#endif
+
+const char *test_doc = "rseq() transition test";
+const char *test_author = "Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>";
+
+/* parts of code borrowed from https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */
+
+/* some useful definitions from kernel uapi */
+#ifndef RSEQ_SIG
+
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+struct rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ uint64_t rseq_cs;
+ uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
+
+#define RSEQ_SIG 0x53053053
+
+#endif
+
+#ifndef __NR_rseq
+#define __NR_rseq 334
+#endif
+/* EOF */
+
+static volatile struct rseq *rseq_ptr;
+static __thread volatile struct rseq __rseq_abi;
+
+static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig)
+{
+ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
+}
+
+static void register_thread(void)
+{
+ int rc;
+ unregister_old_rseq();
+ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG);
+ if (rc) {
+ fail("Failed to register rseq");
+ exit(1);
+ }
+}
+
+static void check_thread(void)
+{
+ int rc;
+ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG);
+ if (!(rc && errno == EBUSY)) {
+ fail("Failed to check rseq %d", rc);
+ exit(1);
+ }
+}
+
+#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x))
+
+static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ double a = 10000000000000000.0;
+ double b = -1;
+ /*test_msg("enter %f %f\n", a, b);*/
+
+ /* clang-format off */
+ __asm__ __volatile__ goto(
+ ".pushsection __rseq_table, \"aw\"\n\t"
+ ".balign 32\n\t"
+ "cs_obj:\n\t"
+ /* version, flags */
+ ".long 0, 0\n\t"
+ /* start_ip, post_commit_offset, abort_ip */
+ ".quad 1f, (2f-1f), 4f\n\t"
+ ".popsection\n\t"
+ "1:\n\t"
+ "leaq cs_obj(%%rip), %%rax\n\t"
+ "movq %%rax, %[rseq_cs]\n\t"
+ "cmpl %[cpu_id], %[current_cpu_id]\n\t"
+ "jnz 4f\n\t"
+ "addq %[count], %[v]\n\t" /* final store */
+ "mov $10000000, %%rcx\n\t"
+ "fldl %[x]\n\t" /* we have st clobbered */
+ "5:\n\t"
+ "fsqrt\n\t" /* heavy instruction */
+ "dec %%rcx\n\t"
+ "jnz 5b\n\t"
+ "fstpl %[y]\n\t"
+ "2:\n\t"
+ ".pushsection __rseq_failure, \"ax\"\n\t"
+ /* Disassembler-friendly signature: nopl <sig>(%rip). */
+ ".byte 0x0f, 0xb9, 0x3d\n\t"
+ ".long 0x53053053\n\t" /* RSEQ_SIG */
+ "4:\n\t"
+ /*"fstpl %[y]\n\t"*/
+ "jmp %l[abort]\n\t"
+ /*"jmp 1b\n\t"*/
+ ".popsection\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_ptr->cpu_id),
+ [rseq_cs] "m" (rseq_ptr->rseq_cs),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "er" (count),
+ [x] "m" (a),
+ [y] "m" (b)
+ : "memory", "cc", "rax", "rcx", "st"
+ : abort
+ );
+ /* clang-format on */
+ /*test_msg("exit %f %f\n", a, b);*/
+ return 0;
+abort:
+ /*test_msg("abort %f %f\n", a, b);*/
+ return -1;
+}
+
+int main(int argc, char *argv[])
+{
+ int cpu = 0;
+ int ret;
+ intptr_t *cpu_data;
+ long nr_cpus;
+
+ rseq_ptr = &__rseq_abi;
+ memset((void *)rseq_ptr, 0, sizeof(struct rseq));
+
+ test_init(argc, argv);
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ cpu_data = calloc(nr_cpus, sizeof(*cpu_data));
+ if (!cpu_data) {
+ fail("calloc");
+ exit(EXIT_FAILURE);
+ }
+ register_thread();
+
+ test_daemon();
+
+ while (test_go()) {
+ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start);
+ ret = rseq_addv(&cpu_data[cpu], 2, cpu);
+ if (ret)
+ fail("Failed to increment per-cpu counter");
+ }
+
+ test_waitsig();
+
+ check_thread();
+ pass();
+
+ return 0;
+}
+
+#else
+
+int main(int argc, char *argv[])
+{
+ test_init(argc, argv);
+ skip("Unsupported arch");
+ return 0;
+}
+
+#endif
diff --git a/test/zdtm/transition/rseq01.desc b/test/zdtm/transition/rseq01.desc
new file mode 100644
index 0000000..0324fa3
--- /dev/null
+++ b/test/zdtm/transition/rseq01.desc
@@ -0,0 +1 @@
+{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'}
--
2.30.0

View File

@ -0,0 +1,330 @@
From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:09:44 +0800
Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may
configure rseq critical section by def
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
criu/cr-dump.c | 86 +++++++++++++++++++------------
criu/cr-restore.c | 63 ++++++++++++++++++++++
criu/include/pstree.h | 1 +
images/rseq.proto | 1 +
4 files changed, 119 insertions(+), 32 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index a3f8973..79387fb 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1047,13 +1047,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
return 0;
}
-static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs)
+static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc,
+ struct rseq_cs *rseq_cs, struct rseq *rseq)
{
int ret;
- uint64_t addr;
/* rseq is not registered */
- if (!rseq->rseq_abi_pointer)
+ if (!rseqc->rseq_abi_pointer)
return 0;
/*
@@ -1068,22 +1068,21 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
* then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
* will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
*/
- ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)),
- sizeof(uint64_t));
+ ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer),
+ sizeof(struct rseq));
if (ret) {
- pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr,
- (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t));
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq,
+ (unsigned long)(rseqc->rseq_abi_pointer), sizeof(struct rseq));
return -1;
}
- /* (struct rseq)->rseq_cs is NULL */
- if (!addr)
+ if (!rseq->rseq_cs.ptr64)
return 0;
- ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs));
+ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs.ptr64), sizeof(struct rseq_cs));
if (ret) {
pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
- (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs));
+ (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs.ptr64, sizeof(struct rseq_cs));
return -1;
}
@@ -1092,11 +1091,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
static int dump_thread_rseq(struct pstree_item *item, int i)
{
- struct __ptrace_rseq_configuration rseq;
+ struct __ptrace_rseq_configuration rseqc;
RseqEntry *rseqe = NULL;
int ret;
CoreEntry *core = item->core[i];
RseqEntry **rseqep = &core->thread_core->rseq_entry;
+ struct rseq rseq = { 0 }; /* must stay zeroed: read_rseq_cs() fills nothing in when rseq is not registered */
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
pid_t tid = item->threads[i].real;
@@ -1111,20 +1111,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
if (!kdat.has_ptrace_get_rseq_conf)
return 0;
- ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
- if (ret != sizeof(rseq)) {
+ ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc);
+ if (ret != sizeof(rseqc)) {
pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
return -1;
}
- if (rseq.flags != 0) {
+ if (rseqc.flags != 0) {
pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
- rseq.flags);
+ rseqc.flags);
return -1;
}
- pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
- rseq.signature);
+ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer,
+ rseqc.signature);
rseqe = xmalloc(sizeof(*rseqe));
if (!rseqe)
@@ -1132,13 +1132,22 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
rseq_entry__init(rseqe);
- rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
- rseqe->rseq_abi_size = rseq.rseq_abi_size;
- rseqe->signature = rseq.signature;
+ rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer;
+ rseqe->rseq_abi_size = rseqc.rseq_abi_size;
+ rseqe->signature = rseqc.signature;
- if (read_rseq_cs(tid, &rseq, rseq_cs))
+ if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq))
goto err;
+ rseqe->has_rseq_cs_pointer = true;
+ rseqe->rseq_cs_pointer = rseq.rseq_cs.ptr64;
+ pr_debug("cs pointer %llx\n", (unsigned long long)rseqe->rseq_cs_pointer);
+ /* we won't save rseq_cs to the image (only pointer),
+ * so let's combine flags from both struct rseq and struct rseq_cs
+ * (kernel does the same when interpreting RSEQ_CS_FLAG_*)
+ */
+ rseq_cs->flags |= rseq.flags;
+
/* save rseq entry to the image */
*rseqep = rseqe;
@@ -1188,11 +1197,11 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
pid_t tid = item->threads[i].real;
- /* (struct rseq)->rseq_cs is NULL */
+ /* equivalent to (struct rseq)->rseq_cs is NULL */
if (!rseq_cs->start_ip)
return 0;
- pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
+ pr_debug("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
rseq_cs->version, (unsigned long)TI_IP(core));
@@ -1204,25 +1213,38 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
if (task_in_rseq(rseq_cs, TI_IP(core))) {
struct pid *tid = &item->threads[i];
- pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
- tid->real);
-
/*
* We need to fixup task instruction pointer from
* the original one (which lays inside rseq critical section)
- * to rseq abort handler address.
+ * to rseq abort handler address. But we need to look at rseq_cs->flags
+ * (please refer to struct rseq -> flags field description).
+ * A naive way to support the flags would be to change the instruction pointer (IP)
+ * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL).
+ * Unfortunately, that alone doesn't work properly, because the kernel
+ * cleans up the rseq_cs field in struct rseq (it modifies userspace memory).
+ * So, we need to preserve the original value of the (struct rseq)->rseq_cs field in the
+ * image and restore its value before releasing threads.
*
* It's worth to mention that we need to fixup IP in CoreEntry
* (used when full dump/restore is performed) and also in
* the parasite regs storage (used if --leave-running option is used,
* or if dump error occured and process execution is resumed).
*/
- TI_IP(core) = rseq_cs->abort_ip;
- if (item->pid->real == tid->real) {
- compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
+ if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) {
+ pr_info("The %d task is in rseq critical section, but RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL is set. IP is left unchanged\n",
+ tid->real);
} else {
- compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
+ pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
+ tid->real);
+
+ TI_IP(core) = rseq_cs->abort_ip;
+
+ if (item->pid->real == tid->real) {
+ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
+ } else {
+ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
+ }
}
}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index b2bd044..864140f 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -23,6 +23,7 @@
#include "common/compiler.h"
#include "linux/mount.h"
+#include "linux/rseq.h"
#include "clone-noasan.h"
#include "cr_options.h"
@@ -779,6 +780,7 @@ static int open_cores(int pid, CoreEntry *leader_core)
{
int i, tpid;
CoreEntry **cores = NULL;
+ //RseqEntry *rseqs;
cores = xmalloc(sizeof(*cores) * current->nr_threads);
if (!cores)
@@ -812,6 +814,19 @@ static int open_cores(int pid, CoreEntry *leader_core)
}
}
+
+ pr_debug("item %p\n", (void *)current);
+
+ for (i = 0; i < current->nr_threads; i++) {
+ ThreadCoreEntry *tc = cores[i]->thread_core;
+
+ /* compatibility with older CRIU versions */
+ if (!tc->rseq_entry)
+ continue;
+
+ current->rseqe[i] = *tc->rseq_entry;
+ }
+
return 0;
err:
xfree(cores);
@@ -868,8 +883,15 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
{
unsigned args_len;
struct task_restore_args *ta;
+ RseqEntry *rseqs;
pr_info("Restoring resources\n");
+ rseqs = shmalloc(sizeof(*rseqs) * current->nr_threads);
+ if (!rseqs)
+ return -1;
+
+ current->rseqe = rseqs;
+
rst_mem_switch_to_private();
args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size());
@@ -1966,6 +1988,44 @@ static int attach_to_tasks(bool root_seized)
return 0;
}
+/* Put the rseq_cs pointers saved at dump time back into each thread's struct rseq. */
+static int restore_rseq_cs(void)
+{
+ struct pstree_item *item;
+
+ for_each_pstree_item(item) {
+ int i;
+
+ if (!task_alive(item))
+ continue;
+
+ if (item->nr_threads == 1) {
+ item->threads[0].real = item->pid->real;
+ } else if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) {
+ return -1;
+ }
+
+ for (i = 0; i < item->nr_threads; i++) {
+ pid_t pid = item->threads[i].real;
+ RseqEntry *rseqe = &item->rseqe[i];
+
+ /* no rseq registration or no saved rseq_cs pointer: nothing to write back */
+ if (!rseqe->rseq_cs_pointer || !rseqe->rseq_abi_pointer)
+ continue;
+
+ pr_debug("Restoring rseq_cs pointer 0x%llx (pid: %d)\n", (unsigned long long)rseqe->rseq_cs_pointer, pid);
+
+ if (ptrace_poke_area(pid, &rseqe->rseq_cs_pointer,
+ decode_pointer(rseqe->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t))) {
+ pr_err("Can't restore rseq_cs pointer (pid: %d)\n", pid);
+ return -1;
+ }
+ }
+ }
+
+ return 0;
+}
+
static int catch_tasks(bool root_seized, enum trace_flags *flag)
{
struct pstree_item *item;
@@ -2400,6 +2460,9 @@ skip_ns_bouncing:
if (restore_freezer_state())
pr_err("Unable to restore freezer state\n");
+ /* just before releasing threads we have to restore rseq_cs */
+ restore_rseq_cs();
+
/* Detaches from processes and they continue run through sigreturn. */
if (finalize_restore_detach())
goto out_kill_network_unlocked;
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
index 458e5f9..97bef11 100644
--- a/criu/include/pstree.h
+++ b/criu/include/pstree.h
@@ -25,6 +25,7 @@ struct pstree_item {
int nr_threads; /* number of threads */
struct pid *threads; /* array of threads */
CoreEntry **core;
+ RseqEntry *rseqe;
TaskKobjIdsEntry *ids;
union {
futex_t task_st;
diff --git a/images/rseq.proto b/images/rseq.proto
index be28004..45cb847 100644
--- a/images/rseq.proto
+++ b/images/rseq.proto
@@ -6,4 +6,5 @@ message rseq_entry {
required uint64 rseq_abi_pointer = 1;
required uint32 rseq_abi_size = 2;
required uint32 signature = 3;
+ optional uint64 rseq_cs_pointer = 4;
}
--
2.30.0

View File

@ -0,0 +1,177 @@
From bb8295ae4f1224db2236fdd3134912e093ed20d9 Mon Sep 17 00:00:00 2001
From: bb-cat <ningyu9@huawei.com>
Date: Wed, 2 Mar 2022 15:10:24 +0800
Subject: [PATCH 16/16] zdtm: add rseq02 transition test with NO_RESTART CS
flag Signed-off-by: Alexander Mikhalitsyn
<alexander.mikhalitsyn@virtuozzo.com>
---
test/zdtm/transition/Makefile | 2 +
test/zdtm/transition/rseq01.c | 61 +++++++++++++++++++-
test/zdtm/transition/rseq02.c | 1 +
test/zdtm/transition/rseq02.desc | 1 +
4 files changed, 63 insertions(+), 2 deletions(-)
create mode 120000 test/zdtm/transition/rseq02.c
create mode 120000 test/zdtm/transition/rseq02.desc
diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile
index fae4e27..378a4fc 100644
--- a/test/zdtm/transition/Makefile
+++ b/test/zdtm/transition/Makefile
@@ -24,6 +24,7 @@ TST_NOFILE = \
pid_reuse \
pidfd_store_sk \
rseq01 \
+ rseq02 \
TST_FILE = \
@@ -82,6 +83,7 @@ ptrace: LDFLAGS += -pthread
fork2: CFLAGS += -D FORK2
thread-bomb.o: CFLAGS += -pthread
thread-bomb: LDFLAGS += -pthread
+rseq02: CFLAGS += -D NOABORT
%: %.sh
cp $< $@
diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c
index 5fac5a6..25e1d61 100644
--- a/test/zdtm/transition/rseq01.c
+++ b/test/zdtm/transition/rseq01.c
@@ -53,6 +53,18 @@ enum rseq_flags {
RSEQ_FLAG_UNREGISTER = (1 << 0),
};
+enum rseq_cs_flags_bit {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+};
+
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
struct rseq {
uint32_t cpu_id_start;
uint32_t cpu_id;
@@ -104,6 +116,7 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
{
double a = 10000000000000000.0;
double b = -1;
+ uint64_t rseq_cs1, rseq_cs2;
/*test_msg("enter %f %f\n", a, b);*/
/* clang-format off */
@@ -129,6 +142,9 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
"dec %%rcx\n\t"
"jnz 5b\n\t"
"fstpl %[y]\n\t"
+ "movq %%rax, %[rseq_cs_check2]\n\t"
+ "movq %[rseq_cs], %%rax\n\t"
+ "movq %%rax, %[rseq_cs_check1]\n\t"
"2:\n\t"
".pushsection __rseq_failure, \"ax\"\n\t"
/* Disassembler-friendly signature: nopl <sig>(%rip). */
@@ -143,6 +159,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (rseq_ptr->cpu_id),
[rseq_cs] "m" (rseq_ptr->rseq_cs),
+ [rseq_cs_check1] "m" (rseq_cs1),
+ [rseq_cs_check2] "m" (rseq_cs2),
/* final store input */
[v] "m" (*v),
[count] "er" (count),
@@ -153,8 +171,20 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
);
/* clang-format on */
/*test_msg("exit %f %f\n", a, b);*/
+ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2);
+ if (rseq_cs1 != rseq_cs2) {
+ /*
+ * Reaching this point means we finished the critical section
+ * *normally* (without jumping to abort), but the kernel had cleared
+ * rseq_ptr->rseq_cs before we left the critical section
+ * and CRIU did not restore it correctly.
+ * That indicates a bug.
+ */
+ return -1;
+ }
return 0;
abort:
+ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2);
/*test_msg("abort %f %f\n", a, b);*/
return -1;
}
@@ -177,21 +207,48 @@ int main(int argc, char *argv[])
fail("calloc");
exit(EXIT_FAILURE);
}
+
register_thread();
+ /*
+ * We want to test that RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * is handled properly by CRIU, but that flag can be used
+ * only together with all the other flags set.
+ * Please refer to
+ * https://github.com/torvalds/linux/blob/master/kernel/rseq.c#L192
+ */
+#ifdef NOABORT
+ rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT |
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL |
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE;
+#endif
+
test_daemon();
while (test_go()) {
cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start);
ret = rseq_addv(&cpu_data[cpu], 2, cpu);
- if (ret)
+#ifndef NOABORT
+ /* just ignore abort */
+ ret = 0;
+#else
+ if (ret) {
fail("Failed to increment per-cpu counter");
+ break;
+ } else {
+ //test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]);
+ }
+#endif
}
test_waitsig();
check_thread();
- pass();
+
+ if (ret)
+ fail();
+ else
+ pass();
return 0;
}
diff --git a/test/zdtm/transition/rseq02.c b/test/zdtm/transition/rseq02.c
new file mode 120000
index 0000000..d564917
--- /dev/null
+++ b/test/zdtm/transition/rseq02.c
@@ -0,0 +1 @@
+rseq01.c
\ No newline at end of file
diff --git a/test/zdtm/transition/rseq02.desc b/test/zdtm/transition/rseq02.desc
new file mode 120000
index 0000000..b888f0d
--- /dev/null
+++ b/test/zdtm/transition/rseq02.desc
@@ -0,0 +1 @@
+rseq01.desc
\ No newline at end of file
--
2.30.0

View File

@ -1,6 +1,6 @@
Name: criu
Version: 3.16.1
Release: 2
Release: 3
Provides: crtools = %{version}-%{release}
Obsoletes: crtools <= 1.0-2
Summary: A tool of Checkpoint/Restore in User-space
@ -17,6 +17,21 @@ Obsoletes: %{name}-libs < %{version}-%{release}
Patch1: 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch
Patch2: 0002-mm-add-pin-memory-method-for-criu.patch
Patch3: 0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch
Patch4: 0003-kerndat-check-for-rseq-syscall-support.patch
Patch5: 0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch
Patch6: 0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch
Patch7: 0006-rseq-initial-support.patch
Patch8: 0007-zdtm-add-simple-test-for-rseq-C-R.patch
Patch9: 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch
Patch10: 0009-include-add-thread_pointer.h-from-Glibc.patch
Patch11: 0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch
Patch12: 0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch
Patch13: 0012-compel-add-helpers-to-get-set-instruction-pointer.patch
Patch14: 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch
Patch15: 0014-zdtm-add-rseq-transition-test-for-amd64.patch
Patch16: 0015-cr-dump-handle-rseq-flags-field.patch
Patch17: 0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch
%description
Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system.
@ -99,6 +114,9 @@ chmod 0755 %{buildroot}/run/%{name}/
%doc %{_mandir}/man1/{compel.1*,crit.1*,criu-ns.1*}
%changelog
* Fri Mar 4 2022 ningyu <ningyu9@huawei.com> - 3.16.1-3
- rseq c/r support
* Sat Feb 26 2022 luolongjun <luolongjuna@gmail.com> - 3.16.1-2
- add support for pin memory