rseq, virt: Retrigger RSEQ after vcpu_run()
Hypervisors invoke resume_user_mode_work() before entering the guest, which clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user space context available to them, so the rseq notify handler skips inspecting the critical section, but updates the CPU/MM CID values unconditionally so that the eventual pending rseq event is not lost on the way to user space.

This is a pointless exercise as the task might be rescheduled before actually returning to user space and it creates unnecessary work in the vcpu_run() loops.

It's way more efficient to ignore that invocation based on @regs == NULL and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the vcpu_run() loop before returning from the ioctl(). This ensures that a pending RSEQ update is not lost and the IDs are updated before returning to user space.

Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into a NOOP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
parent
d923739e2e
commit
83409986f4
|
|
@@ -29,6 +29,7 @@
|
|||
#include <linux/crash_dump.h>
|
||||
#include <linux/panic_notifier.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/rseq.h>
|
||||
|
||||
#include "mshv_eventfd.h"
|
||||
#include "mshv.h"
|
||||
|
|
@@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
|
|||
}
|
||||
} while (!vp->run.flags.intercept_suspend);
|
||||
|
||||
rseq_virt_userspace_exit();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
|
||||
* which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
|
||||
* that case just to do it eventually again before returning to user space,
|
||||
* the entry resume_user_mode_work() invocation is ignored as the register
|
||||
* argument is NULL.
|
||||
*
|
||||
* After returning from guest mode, they have to invoke this function to
|
||||
* re-raise TIF_NOTIFY_RESUME if necessary.
|
||||
*/
|
||||
static inline void rseq_virt_userspace_exit(void)
|
||||
{
|
||||
if (current->rseq_event_pending)
|
||||
set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
|
||||
}
|
||||
|
||||
/*
|
||||
* If parent process has a registered restartable sequences area, the
|
||||
* child inherits. Unregister rseq for a clone with CLONE_VM set.
|
||||
|
|
@@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t)
|
|||
static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
|
||||
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
|
||||
static inline void rseq_sched_switch_event(struct task_struct *t) { }
|
||||
static inline void rseq_virt_userspace_exit(void) { }
|
||||
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
|
||||
static inline void rseq_execve(struct task_struct *t) { }
|
||||
static inline void rseq_exit_to_user_mode(void) { }
|
||||
|
|
|
|||
|
|
@@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
|
|||
{
|
||||
struct task_struct *t = current;
|
||||
int ret, sig;
|
||||
bool event;
|
||||
|
||||
/*
|
||||
* If invoked from hypervisors before entering the guest via
|
||||
* resume_user_mode_work(), then @regs is a NULL pointer.
|
||||
*
|
||||
* resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
|
||||
* it before returning from the ioctl() to user space when
|
||||
* rseq_event.sched_switch is set.
|
||||
*
|
||||
* So it's safe to ignore here instead of pointlessly updating it
|
||||
* in the vcpu_run() loop.
|
||||
*/
|
||||
if (!regs)
|
||||
return;
|
||||
|
||||
if (unlikely(t->flags & PF_EXITING))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If invoked from hypervisors or IO-URING, then @regs is a NULL
|
||||
* pointer, so fixup cannot be done. If the syscall which led to
|
||||
* this invocation was invoked inside a critical section, then it
|
||||
* will either end up in this code again or a possible violation of
|
||||
* a syscall inside a critical region can only be detected by the
|
||||
* debug code in rseq_syscall() in a debug enabled kernel.
|
||||
* Read and clear the event pending bit first. If the task
|
||||
* was not preempted or migrated or a signal is on the way,
|
||||
* there is no point in doing any of the heavy lifting here
|
||||
* on production kernels. In that case TIF_NOTIFY_RESUME
|
||||
* was raised by some other functionality.
|
||||
*
|
||||
* This is correct because the read/clear operation is
|
||||
* guarded against scheduler preemption, which makes it CPU
|
||||
* local atomic. If the task is preempted right after
|
||||
* re-enabling preemption then TIF_NOTIFY_RESUME is set
|
||||
* again and this function is invoked another time _before_
|
||||
* the task is able to return to user mode.
|
||||
*
|
||||
* On a debug kernel, invoke the fixup code unconditionally
|
||||
* with the result handed in to allow the detection of
|
||||
* inconsistencies.
|
||||
*/
|
||||
if (regs) {
|
||||
/*
|
||||
* Read and clear the event pending bit first. If the task
|
||||
* was not preempted or migrated or a signal is on the way,
|
||||
* there is no point in doing any of the heavy lifting here
|
||||
* on production kernels. In that case TIF_NOTIFY_RESUME
|
||||
* was raised by some other functionality.
|
||||
*
|
||||
* This is correct because the read/clear operation is
|
||||
* guarded against scheduler preemption, which makes it CPU
|
||||
* local atomic. If the task is preempted right after
|
||||
* re-enabling preemption then TIF_NOTIFY_RESUME is set
|
||||
* again and this function is invoked another time _before_
|
||||
* the task is able to return to user mode.
|
||||
*
|
||||
* On a debug kernel, invoke the fixup code unconditionally
|
||||
* with the result handed in to allow the detection of
|
||||
* inconsistencies.
|
||||
*/
|
||||
bool event;
|
||||
|
||||
scoped_guard(RSEQ_EVENT_GUARD) {
|
||||
event = t->rseq_event_pending;
|
||||
t->rseq_event_pending = false;
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
|
||||
ret = rseq_ip_fixup(regs, event);
|
||||
if (unlikely(ret < 0))
|
||||
goto error;
|
||||
}
|
||||
scoped_guard(RSEQ_EVENT_GUARD) {
|
||||
event = t->rseq_event_pending;
|
||||
t->rseq_event_pending = false;
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
|
||||
ret = rseq_ip_fixup(regs, event);
|
||||
if (unlikely(ret < 0))
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (unlikely(rseq_update_cpu_node_id(t)))
|
||||
goto error;
|
||||
return;
|
||||
|
|
|
|||
|
|
@@ -49,6 +49,7 @@
|
|||
#include <linux/lockdep.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/rseq.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/ioctl.h>
|
||||
|
|
@@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
|
|||
r = kvm_arch_vcpu_ioctl_run(vcpu);
|
||||
vcpu->wants_to_run = false;
|
||||
|
||||
/*
|
||||
* FIXME: Remove this hack once all KVM architectures
|
||||
* support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
|
||||
*/
|
||||
rseq_virt_userspace_exit();
|
||||
|
||||
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue