see https://github.com/checkpoint-restore/criu/issues/1696 and https://github.com/checkpoint-restore/criu/pull/1706
703 lines
21 KiB
Diff
703 lines
21 KiB
Diff
From e444c089ebfb03fb2b6d69a40322d31ab33c0597 Mon Sep 17 00:00:00 2001
|
|
From: bb-cat <ningyu9@huawei.com>
|
|
Date: Wed, 2 Mar 2022 14:52:35 +0800
|
|
Subject: [PATCH 06/16] rseq: initial support TODO: 1. properly handle case
|
|
when the kernel has rseq() support but has no
|
|
ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support and user processes haven't used
|
|
rseq(). 2. properly handle "transient" states, when CRIU arrives while an rseq
|
|
critical section is executing. We need a test for this case with some "heavy" rseq + we need to
|
|
properly handle RSEQ_CS_* flags.
|
|
|
|
Fixes: #1696
|
|
|
|
Reported-by: Radostin Stoyanov <radostin@redhat.com>
|
|
Suggested-by: Florian Weimer <fweimer@redhat.com>
|
|
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
|
|
---
|
|
compel/include/uapi/ptrace.h | 16 +--
|
|
criu/cr-dump.c | 99 ++++++++++++++++
|
|
criu/cr-restore.c | 17 +++
|
|
criu/include/linux/rseq.h | 144 +++++++++++++++++++++++
|
|
criu/include/parasite.h | 7 ++
|
|
criu/include/restorer.h | 7 ++
|
|
criu/kerndat.c | 2 +-
|
|
criu/parasite-syscall.c | 11 ++
|
|
criu/pie/parasite.c | 99 ++++++++++++++++
|
|
criu/pie/restorer.c | 24 ++++
|
|
images/Makefile | 1 +
|
|
images/core.proto | 2 +
|
|
images/rseq.proto | 9 ++
|
|
13 files changed, 429 insertions(+), 9 deletions(-)
|
|
create mode 100644 criu/include/linux/rseq.h
|
|
create mode 100644 images/rseq.proto
|
|
|
|
diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h
|
|
index bfe28c7..d807a92 100644
|
|
--- a/compel/include/uapi/ptrace.h
|
|
+++ b/compel/include/uapi/ptrace.h
|
|
@@ -66,14 +66,14 @@ typedef struct {
|
|
} seccomp_metadata_t;
|
|
|
|
#ifndef PTRACE_GET_RSEQ_CONFIGURATION
|
|
-#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
|
|
-
|
|
-struct ptrace_rseq_configuration {
|
|
- __u64 rseq_abi_pointer;
|
|
- __u32 rseq_abi_size;
|
|
- __u32 signature;
|
|
- __u32 flags;
|
|
- __u32 pad;
|
|
+#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
|
|
+
|
|
+struct __ptrace_rseq_configuration {
|
|
+ uint64_t rseq_abi_pointer;
|
|
+ uint32_t rseq_abi_size;
|
|
+ uint32_t signature;
|
|
+ uint32_t flags;
|
|
+ uint32_t pad;
|
|
};
|
|
#endif
|
|
|
|
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
|
|
index f07fe6e..91dd08a 100644
|
|
--- a/criu/cr-dump.c
|
|
+++ b/criu/cr-dump.c
|
|
@@ -45,6 +45,7 @@
|
|
#include "proc_parse.h"
|
|
#include "parasite.h"
|
|
#include "parasite-syscall.h"
|
|
+#include <compel/ptrace.h>
|
|
#include "files.h"
|
|
#include "files-reg.h"
|
|
#include "shmem.h"
|
|
@@ -200,6 +201,25 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
|
|
return 0;
|
|
}
|
|
|
|
+static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq, bool has_tc_rseq_entry)
|
|
+{
|
|
+ if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf)
|
|
+ return 0;
|
|
+
|
|
+ pr_debug("%d has rseq_inited = %d\n", tid, ti_rseq->rseq_inited);
|
|
+
|
|
+ /*
|
|
+ * We have no kdat.has_ptrace_get_rseq_conf and user
|
|
+ * process has used rseq(), let's fail dump.
|
|
+ */
|
|
+ if (ti_rseq->rseq_inited) {
|
|
+ pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
struct cr_imgset *glob_imgset;
|
|
|
|
static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds)
|
|
@@ -730,6 +750,17 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
|
|
if (!ret)
|
|
ret = seccomp_dump_thread(pid, tc);
|
|
|
|
+ /*
|
|
+ * We are dumping rseq() in the dump_thread_rseq() function,
|
|
+ * *before* processes get infected (because of ptrace requests
|
|
+ * API restriction). At this point, if the kernel lacks
|
|
+ * kdat.has_ptrace_get_rseq_conf support we have to ensure
|
|
+ * that dumpable processes haven't initialized rseq() or
|
|
+ * fail dump if rseq() was used.
|
|
+ */
|
|
+ if (!ret)
|
|
+ ret = check_thread_rseq(pid, &ti->rseq, !!tc->rseq_entry);
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
@@ -1016,6 +1047,68 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
|
|
return 0;
|
|
}
|
|
|
|
+static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
|
|
+{
|
|
+ struct __ptrace_rseq_configuration rseq;
|
|
+ RseqEntry *rseqe = NULL;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * If we are here it means that rseq() syscall is supported,
|
|
+ * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported,
|
|
+ * we could just fail the dump here. But this is a bad idea, IMHO.
|
|
+ *
|
|
+ * So, we will try to detect if victim process has used rseq().
|
|
+ * See check_rseq() and check_thread_rseq() functions.
|
|
+ */
|
|
+ if (!kdat.has_ptrace_get_rseq_conf)
|
|
+ return 0;
|
|
+
|
|
+ ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
|
|
+ if (ret != sizeof(rseq)) {
|
|
+ pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ if (rseq.flags != 0) {
|
|
+ pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
|
|
+ rseq.flags);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature);
|
|
+
|
|
+ rseqe = xmalloc(sizeof(*rseqe));
|
|
+ if (!rseqe)
|
|
+ return -1;
|
|
+
|
|
+ rseq_entry__init(rseqe);
|
|
+
|
|
+ rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
|
|
+ rseqe->rseq_abi_size = rseq.rseq_abi_size;
|
|
+ rseqe->signature = rseq.signature;
|
|
+
|
|
+ *rseqep = rseqe;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int dump_task_rseq(pid_t pid, struct pstree_item *item)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ /* if rseq() syscall isn't supported then nothing to dump */
|
|
+ if (!kdat.has_rseq)
|
|
+ return 0;
|
|
+
|
|
+ for (i = 0; i < item->nr_threads; i++) {
|
|
+ if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry))
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static struct proc_pid_stat pps_buf;
|
|
|
|
static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item)
|
|
@@ -1304,6 +1397,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
|
|
goto err;
|
|
}
|
|
|
|
+ ret = dump_task_rseq(pid, item);
|
|
+ if (ret) {
|
|
+ pr_err("Dump %d rseq failed %d\n", pid, ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
parasite_ctl = parasite_infect_seized(pid, item, &vmas);
|
|
if (!parasite_ctl) {
|
|
pr_err("Can't infect (pid: %d) with parasite\n", pid);
|
|
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
|
|
index 5b645c1..b2bd044 100644
|
|
--- a/criu/cr-restore.c
|
|
+++ b/criu/cr-restore.c
|
|
@@ -2975,6 +2975,19 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
|
|
return 0;
|
|
}
|
|
|
|
+static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc)
|
|
+{
|
|
+ /* compatibility with older CRIU versions */
|
|
+ if (!tc->rseq_entry)
|
|
+ return 0;
|
|
+
|
|
+ rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer;
|
|
+ rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size;
|
|
+ rseq->signature = tc->rseq_entry->signature;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static rlim_t decode_rlim(rlim_t ival)
|
|
{
|
|
return ival == -1 ? RLIM_INFINITY : ival;
|
|
@@ -3704,6 +3717,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
|
|
thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
|
|
core_get_tls(tcore, &thread_args[i].tls);
|
|
|
|
+ ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
rst_reloc_creds(&thread_args[i], &creds_pos_next);
|
|
|
|
thread_args[i].futex_rla = tcore->thread_core->futex_rla;
|
|
diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h
|
|
new file mode 100644
|
|
index 0000000..5c1706a
|
|
--- /dev/null
|
|
+++ b/criu/include/linux/rseq.h
|
|
@@ -0,0 +1,144 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
|
|
+#ifndef _UAPI_LINUX_RSEQ_H
|
|
+#define _UAPI_LINUX_RSEQ_H
|
|
+
|
|
+/*
|
|
+ * linux/rseq.h
|
|
+ *
|
|
+ * Restartable sequences system call API
|
|
+ *
|
|
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
|
|
+ */
|
|
+
|
|
+#include <linux/types.h>
|
|
+#include <asm/byteorder.h>
|
|
+
|
|
+enum rseq_cpu_id_state {
|
|
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
|
|
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
|
|
+};
|
|
+
|
|
+enum rseq_flags {
|
|
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
|
|
+};
|
|
+
|
|
+enum rseq_cs_flags_bit {
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
|
|
+};
|
|
+
|
|
+enum rseq_cs_flags {
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
|
|
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
|
|
+};
|
|
+
|
|
+/*
|
|
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
|
|
+ * contained within a single cache-line. It is usually declared as
|
|
+ * link-time constant data.
|
|
+ */
|
|
+struct rseq_cs {
|
|
+ /* Version of this structure. */
|
|
+ __u32 version;
|
|
+ /* enum rseq_cs_flags */
|
|
+ __u32 flags;
|
|
+ __u64 start_ip;
|
|
+ /* Offset from start_ip. */
|
|
+ __u64 post_commit_offset;
|
|
+ __u64 abort_ip;
|
|
+} __attribute__((aligned(4 * sizeof(__u64))));
|
|
+
|
|
+/*
|
|
+ * struct rseq is aligned on 4 * 8 bytes to ensure it is always
|
|
+ * contained within a single cache-line.
|
|
+ *
|
|
+ * A single struct rseq per thread is allowed.
|
|
+ */
|
|
+struct rseq {
|
|
+ /*
|
|
+ * Restartable sequences cpu_id_start field. Updated by the
|
|
+ * kernel. Read by user-space with single-copy atomicity
|
|
+ * semantics. This field should only be read by the thread which
|
|
+ * registered this data structure. Aligned on 32-bit. Always
|
|
+ * contains a value in the range of possible CPUs, although the
|
|
+ * value may not be the actual current CPU (e.g. if rseq is not
|
|
+ * initialized). This CPU number value should always be compared
|
|
+ * against the value of the cpu_id field before performing a rseq
|
|
+ * commit or returning a value read from a data structure indexed
|
|
+ * using the cpu_id_start value.
|
|
+ */
|
|
+ __u32 cpu_id_start;
|
|
+ /*
|
|
+ * Restartable sequences cpu_id field. Updated by the kernel.
|
|
+ * Read by user-space with single-copy atomicity semantics. This
|
|
+ * field should only be read by the thread which registered this
|
|
+ * data structure. Aligned on 32-bit. Values
|
|
+ * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED
|
|
+ * have a special semantic: the former means "rseq uninitialized",
|
|
+ * and latter means "rseq initialization failed". This value is
|
|
+ * meant to be read within rseq critical sections and compared
|
|
+ * with the cpu_id_start value previously read, before performing
|
|
+ * the commit instruction, or read and compared with the
|
|
+ * cpu_id_start value before returning a value loaded from a data
|
|
+ * structure indexed using the cpu_id_start value.
|
|
+ */
|
|
+ __u32 cpu_id;
|
|
+ /*
|
|
+ * Restartable sequences rseq_cs field.
|
|
+ *
|
|
+ * Contains NULL when no critical section is active for the current
|
|
+ * thread, or holds a pointer to the currently active struct rseq_cs.
|
|
+ *
|
|
+ * Updated by user-space, which sets the address of the currently
|
|
+ * active rseq_cs at the beginning of assembly instruction sequence
|
|
+ * block, and set to NULL by the kernel when it restarts an assembly
|
|
+ * instruction sequence block, as well as when the kernel detects that
|
|
+ * it is preempting or delivering a signal outside of the range
|
|
+ * targeted by the rseq_cs. Also needs to be set to NULL by user-space
|
|
+ * before reclaiming memory that contains the targeted struct rseq_cs.
|
|
+ *
|
|
+ * Read and set by the kernel. Set by user-space with single-copy
|
|
+ * atomicity semantics. This field should only be updated by the
|
|
+ * thread which registered this data structure. Aligned on 64-bit.
|
|
+ */
|
|
+ union {
|
|
+ __u64 ptr64;
|
|
+#ifdef __LP64__
|
|
+ __u64 ptr;
|
|
+#else
|
|
+ struct {
|
|
+#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN)
|
|
+ __u32 padding; /* Initialized to zero. */
|
|
+ __u32 ptr32;
|
|
+#else /* LITTLE */
|
|
+ __u32 ptr32;
|
|
+ __u32 padding; /* Initialized to zero. */
|
|
+#endif /* ENDIAN */
|
|
+ } ptr;
|
|
+#endif
|
|
+ } rseq_cs;
|
|
+
|
|
+ /*
|
|
+ * Restartable sequences flags field.
|
|
+ *
|
|
+ * This field should only be updated by the thread which
|
|
+ * registered this data structure. Read by the kernel.
|
|
+ * Mainly used for single-stepping through rseq critical sections
|
|
+ * with debuggers.
|
|
+ *
|
|
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
|
|
+ * Inhibit instruction sequence block restart on preemption
|
|
+ * for this thread.
|
|
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
|
|
+ * Inhibit instruction sequence block restart on signal
|
|
+ * delivery for this thread.
|
|
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
|
|
+ * Inhibit instruction sequence block restart on migration for
|
|
+ * this thread.
|
|
+ */
|
|
+ __u32 flags;
|
|
+} __attribute__((aligned(4 * sizeof(__u64))));
|
|
+
|
|
+#endif /* _UAPI_LINUX_RSEQ_H */
|
|
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
|
|
index 8107aa4..5fde809 100644
|
|
--- a/criu/include/parasite.h
|
|
+++ b/criu/include/parasite.h
|
|
@@ -164,10 +164,17 @@ struct parasite_dump_creds {
|
|
unsigned int groups[0];
|
|
};
|
|
|
|
+struct parasite_check_rseq {
|
|
+ bool has_rseq;
|
|
+ bool has_ptrace_get_rseq_conf; /* no need to check if supported */
|
|
+ bool rseq_inited;
|
|
+};
|
|
+
|
|
struct parasite_dump_thread {
|
|
unsigned int *tid_addr;
|
|
pid_t tid;
|
|
tls_t tls;
|
|
+ struct parasite_check_rseq rseq;
|
|
stack_t sas;
|
|
int pdeath_sig;
|
|
char comm[TASK_COMM_LEN];
|
|
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
|
|
index c2ef8f0..c29d869 100644
|
|
--- a/criu/include/restorer.h
|
|
+++ b/criu/include/restorer.h
|
|
@@ -45,6 +45,12 @@ struct rst_sched_param {
|
|
int prio;
|
|
};
|
|
|
|
+struct rst_rseq_param {
|
|
+ u64 rseq_abi_pointer;
|
|
+ u32 rseq_abi_size;
|
|
+ u32 signature;
|
|
+};
|
|
+
|
|
struct restore_posix_timer {
|
|
struct str_posix_timer spt;
|
|
struct itimerspec val;
|
|
@@ -99,6 +105,7 @@ struct thread_restore_args {
|
|
struct task_restore_args *ta;
|
|
|
|
tls_t tls;
|
|
+ struct rst_rseq_param rseq;
|
|
|
|
siginfo_t *siginfo;
|
|
unsigned int siginfo_n;
|
|
diff --git a/criu/kerndat.c b/criu/kerndat.c
|
|
index 4841387..af7113a 100644
|
|
--- a/criu/kerndat.c
|
|
+++ b/criu/kerndat.c
|
|
@@ -837,7 +837,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void)
|
|
{
|
|
pid_t pid;
|
|
int len;
|
|
- struct ptrace_rseq_configuration rseq;
|
|
+ struct __ptrace_rseq_configuration rseq;
|
|
|
|
pid = fork_and_ptrace_attach(NULL);
|
|
if (pid < 0)
|
|
diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
|
|
index 7175ade..ee4fa86 100644
|
|
--- a/criu/parasite-syscall.c
|
|
+++ b/criu/parasite-syscall.c
|
|
@@ -132,6 +132,13 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c
|
|
return ce->groups ? 0 : -ENOMEM;
|
|
}
|
|
|
|
+static void init_parasite_rseq_arg(struct parasite_check_rseq *rseq)
|
|
+{
|
|
+ rseq->has_rseq = kdat.has_rseq;
|
|
+ rseq->has_ptrace_get_rseq_conf = kdat.has_ptrace_get_rseq_conf;
|
|
+ rseq->rseq_inited = false;
|
|
+}
|
|
+
|
|
int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core)
|
|
{
|
|
ThreadCoreEntry *tc = core->thread_core;
|
|
@@ -144,6 +151,8 @@ int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEn
|
|
pc = args->creds;
|
|
pc->cap_last_cap = kdat.last_cap;
|
|
|
|
+ init_parasite_rseq_arg(&args->rseq);
|
|
+
|
|
ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl);
|
|
if (ret < 0)
|
|
return ret;
|
|
@@ -197,6 +206,8 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit
|
|
|
|
compel_arch_get_tls_thread(tctl, &args->tls);
|
|
|
|
+ init_parasite_rseq_arg(&args->rseq);
|
|
+
|
|
ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD);
|
|
if (ret) {
|
|
pr_err("Can't init thread in parasite %d\n", pid);
|
|
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
|
|
index bc0a33c..e49958b 100644
|
|
--- a/criu/pie/parasite.c
|
|
+++ b/criu/pie/parasite.c
|
|
@@ -8,6 +8,8 @@
|
|
#include <sys/ioctl.h>
|
|
#include <sys/uio.h>
|
|
|
|
+#include "linux/rseq.h"
|
|
+
|
|
#include "common/config.h"
|
|
#include "int.h"
|
|
#include "types.h"
|
|
@@ -167,6 +169,7 @@ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args)
|
|
}
|
|
|
|
static int dump_creds(struct parasite_dump_creds *args);
|
|
+static int check_rseq(struct parasite_check_rseq *rseq);
|
|
|
|
static int dump_thread_common(struct parasite_dump_thread *ti)
|
|
{
|
|
@@ -197,6 +200,12 @@ static int dump_thread_common(struct parasite_dump_thread *ti)
|
|
goto out;
|
|
}
|
|
|
|
+ ret = check_rseq(&ti->rseq);
|
|
+ if (ret) {
|
|
+ pr_err("Unable to check if rseq() is initialized: %d\n", ret);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
ret = dump_creds(ti->creds);
|
|
out:
|
|
return ret;
|
|
@@ -313,6 +322,96 @@ grps_err:
|
|
return -1;
|
|
}
|
|
|
|
+static int check_rseq(struct parasite_check_rseq *rseq)
|
|
+{
|
|
+ int ret;
|
|
+ unsigned long rseq_abi_pointer;
|
|
+ unsigned long rseq_abi_size;
|
|
+ uint32_t rseq_signature;
|
|
+ void *addr;
|
|
+
|
|
+ /* no need to do hacky check if we can get all info from ptrace() */
|
|
+ if (!rseq->has_rseq || rseq->has_ptrace_get_rseq_conf)
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * We need to determine if victim process has rseq()
|
|
+ * initialized, but we have no proper kernel interface
|
|
+ * supported at this point.
|
|
+ * Our plan:
|
|
+ * 1. We know that if we call rseq() syscall and process already
|
|
+ * has current->rseq filled, then we get:
|
|
+ * -EINVAL if current->rseq != rseq || rseq_len != sizeof(*rseq),
|
|
+ * -EPERM if current->rseq_sig != sig,
|
|
+ * -EBUSY if current->rseq == rseq && rseq_len == sizeof(*rseq) &&
|
|
+ * current->rseq_sig == sig
|
|
+ * if current->rseq == NULL (rseq() wasn't used) then we go to:
|
|
+ * IS_ALIGNED(rseq ...) check, if we fail it we get -EINVAL and it
|
|
+ * will be hard to distinguish case when rseq() was initialized or not.
|
|
+ * Let's construct arguments payload
|
|
+ * with:
|
|
+ * 1. correct rseq_abi_size
|
|
+ * 2. aligned and correct rseq_abi_pointer
|
|
+ * And see what rseq() returns to us.
|
|
+ * If ret value is:
|
|
+ * 0: it means that rseq *wasn't* used and we successfully registered it,
|
|
+ * -EINVAL: it means that rseq is already initialized,
|
|
+ * so we *have* to dump it. But as we have has_ptrace_get_rseq_conf = false,
|
|
+ * we should just fail dump as it's unsafe to skip rseq() dump for processes
|
|
+ * with rseq() initialized.
|
|
+ * -EPERM or -EBUSY: should not happen as we take a fresh memory area for rseq
|
|
+ */
|
|
+ addr = (void *)sys_mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
|
+ if (addr == MAP_FAILED) {
|
|
+ pr_err("mmap() failed for struct rseq ret = %lx\n", (unsigned long)addr);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ memset(addr, 0, sizeof(struct rseq));
|
|
+
|
|
+ /* sys_mmap returns page aligned addresses */
|
|
+ rseq_abi_pointer = (unsigned long)addr;
|
|
+ rseq_abi_size = (unsigned long)sizeof(struct rseq);
|
|
+ /* it's not so important to have unique signature for us,
|
|
+ * because rseq_abi_pointer is guaranteed to be unique
|
|
+ */
|
|
+ rseq_signature = 0x12345612;
|
|
+
|
|
+ pr_info("\ttrying sys_rseq(%lx, %lx, %x, %x)\n", rseq_abi_pointer, rseq_abi_size, 0, rseq_signature);
|
|
+ ret = sys_rseq((void *)rseq_abi_pointer, rseq_abi_size, 0, rseq_signature);
|
|
+ if (ret) {
|
|
+ if (ret == -EINVAL) {
|
|
+ pr_info("\trseq is initialized in the victim\n");
|
|
+ rseq->rseq_inited = true;
|
|
+
|
|
+ ret = 0;
|
|
+ } else {
|
|
+ pr_err("\tunexpected failure of sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer,
|
|
+ rseq_abi_size, 0, rseq_signature, ret);
|
|
+
|
|
+ ret = -1;
|
|
+ }
|
|
+ } else {
|
|
+ ret = sys_rseq((void *)rseq_abi_pointer, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, rseq_signature);
|
|
+ if (ret) {
|
|
+ pr_err("\tfailed to unregister sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer,
|
|
+ rseq_abi_size, RSEQ_FLAG_UNREGISTER, rseq_signature, ret);
|
|
+
|
|
+ ret = -1;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ pr_info("\tsys_rseq succeed, let's unregister it back... ok Error\n");
|
|
+ pr_info("\trseq is non-initialized in the victim Error\n");
|
|
+ rseq->rseq_inited = false;
|
|
+ ret = 0;
|
|
+ }
|
|
+
|
|
+out:
|
|
+ sys_munmap(addr, sizeof(struct rseq));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static int fill_fds_fown(int fd, struct fd_opts *p)
|
|
{
|
|
int flags, ret;
|
|
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
|
|
index fbc89fe..368b5a0 100644
|
|
--- a/criu/pie/restorer.c
|
|
+++ b/criu/pie/restorer.c
|
|
@@ -459,6 +459,27 @@ static int restore_cpu_affinity(struct task_restore_args *args)
|
|
return 0;
|
|
}
|
|
|
|
+static int restore_rseq(struct rst_rseq_param *rseq)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ if (!rseq->rseq_abi_pointer) {
|
|
+ pr_debug("rseq: nothing to restore\n");
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, rseq->signature);
|
|
+
|
|
+ ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature);
|
|
+ if (ret) {
|
|
+ pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer,
|
|
+ (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
|
|
{
|
|
unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
|
|
@@ -583,6 +604,9 @@ static int restore_thread_common(struct thread_restore_args *args)
|
|
|
|
restore_tls(&args->tls);
|
|
|
|
+ if (restore_rseq(&args->rseq))
|
|
+ return -1;
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/images/Makefile b/images/Makefile
|
|
index 2eaeb7c..004e22e 100644
|
|
--- a/images/Makefile
|
|
+++ b/images/Makefile
|
|
@@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o
|
|
proto-obj-y += bpfmap-file.o
|
|
proto-obj-y += bpfmap-data.o
|
|
proto-obj-y += apparmor.o
|
|
+proto-obj-y += rseq.o
|
|
|
|
CFLAGS += -iquote $(obj)/
|
|
|
|
diff --git a/images/core.proto b/images/core.proto
|
|
index 39e7f32..b66230e 100644
|
|
--- a/images/core.proto
|
|
+++ b/images/core.proto
|
|
@@ -14,6 +14,7 @@ import "timer.proto";
|
|
import "creds.proto";
|
|
import "sa.proto";
|
|
import "siginfo.proto";
|
|
+import "rseq.proto";
|
|
|
|
import "opts.proto";
|
|
|
|
@@ -106,6 +107,7 @@ message thread_core_entry {
|
|
optional string comm = 13;
|
|
optional uint64 blk_sigset_extended = 14;
|
|
required thread_allowedcpus_entry allowed_cpus = 15;
|
|
+ optional rseq_entry rseq_entry = 16;
|
|
}
|
|
|
|
message task_rlimits_entry {
|
|
diff --git a/images/rseq.proto b/images/rseq.proto
|
|
new file mode 100644
|
|
index 0000000..be28004
|
|
--- /dev/null
|
|
+++ b/images/rseq.proto
|
|
@@ -0,0 +1,9 @@
|
|
+// SPDX-License-Identifier: MIT
|
|
+
|
|
+syntax = "proto2";
|
|
+
|
|
+message rseq_entry {
|
|
+ required uint64 rseq_abi_pointer = 1;
|
|
+ required uint32 rseq_abi_size = 2;
|
|
+ required uint32 signature = 3;
|
|
+}
|
|
--
|
|
2.30.0
|
|
|