rseq, virt: Retrigger RSEQ after vcpu_run()

Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that an eventually pending rseq event is not lost on the
way to user space.

This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.

It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().

This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.

Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
pull/1354/merge
Thomas Gleixner 2025-10-27 09:44:28 +01:00 committed by Ingo Molnar
parent d923739e2e
commit 83409986f4
4 changed files with 68 additions and 37 deletions

View File

@ -29,6 +29,7 @@
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>
#include "mshv_eventfd.h"
#include "mshv.h"
@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
}
} while (!vp->run.flags.intercept_suspend);
rseq_virt_userspace_exit();
return ret;
}

View File

@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void)
}
}
/*
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
* which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
* that case just to do it eventually again before returning to user space,
* the entry resume_user_mode_work() invocation is ignored as the register
* argument is NULL.
*
* After returning from guest mode, they have to invoke this function to
* re-raise TIF_NOTIFY_RESUME if necessary.
*/
static inline void rseq_virt_userspace_exit(void)
{
if (current->rseq_event_pending)
set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t)
/*
 * Stub variants compiling to empty inlines when rseq support is not
 * built in (presumably !CONFIG_RSEQ — the guarding #ifdef is outside
 * this hunk; confirm against the full header).
 */
static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }

View File

@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
int ret, sig;
bool event;
/*
* If invoked from hypervisors before entering the guest via
* resume_user_mode_work(), then @regs is a NULL pointer.
*
* resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
* it before returning from the ioctl() to user space when
* rseq_event.sched_switch is set.
*
* So it's safe to ignore here instead of pointlessly updating it
* in the vcpu_run() loop.
*/
if (!regs)
return;
if (unlikely(t->flags & PF_EXITING))
return;
/*
* If invoked from hypervisors or IO-URING, then @regs is a NULL
* pointer, so fixup cannot be done. If the syscall which led to
* this invocation was invoked inside a critical section, then it
* will either end up in this code again or a possible violation of
* a syscall inside a critical region can only be detected by the
* debug code in rseq_syscall() in a debug enabled kernel.
* Read and clear the event pending bit first. If the task
* was not preempted or migrated or a signal is on the way,
* there is no point in doing any of the heavy lifting here
* on production kernels. In that case TIF_NOTIFY_RESUME
* was raised by some other functionality.
*
* This is correct because the read/clear operation is
* guarded against scheduler preemption, which makes it CPU
* local atomic. If the task is preempted right after
* re-enabling preemption then TIF_NOTIFY_RESUME is set
* again and this function is invoked another time _before_
* the task is able to return to user mode.
*
* On a debug kernel, invoke the fixup code unconditionally
* with the result handed in to allow the detection of
* inconsistencies.
*/
if (regs) {
/*
* Read and clear the event pending bit first. If the task
* was not preempted or migrated or a signal is on the way,
* there is no point in doing any of the heavy lifting here
* on production kernels. In that case TIF_NOTIFY_RESUME
* was raised by some other functionality.
*
* This is correct because the read/clear operation is
* guarded against scheduler preemption, which makes it CPU
* local atomic. If the task is preempted right after
* re-enabling preemption then TIF_NOTIFY_RESUME is set
* again and this function is invoked another time _before_
* the task is able to return to user mode.
*
* On a debug kernel, invoke the fixup code unconditionally
* with the result handed in to allow the detection of
* inconsistencies.
*/
bool event;
scoped_guard(RSEQ_EVENT_GUARD) {
event = t->rseq_event_pending;
t->rseq_event_pending = false;
}
if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
ret = rseq_ip_fixup(regs, event);
if (unlikely(ret < 0))
goto error;
}
scoped_guard(RSEQ_EVENT_GUARD) {
event = t->rseq_event_pending;
t->rseq_event_pending = false;
}
if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
ret = rseq_ip_fixup(regs, event);
if (unlikely(ret < 0))
goto error;
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;

View File

@ -49,6 +49,7 @@
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>
#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ioctl.h>
@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = kvm_arch_vcpu_ioctl_run(vcpu);
vcpu->wants_to_run = false;
/*
* FIXME: Remove this hack once all KVM architectures
* support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
*/
rseq_virt_userspace_exit();
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}