/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

/*
 * NOTE(review): the header names of these nine #include directives were
 * lost (likely during extraction/minification of this file) — restore them
 * from the upstream include/linux/entry-common.h before building. They are
 * expected to cover at least ptrace, audit, seccomp, rseq, context
 * tracking and the architecture's <asm/syscall.h> — TODO confirm.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Architectures without uprobe support provide no _TIF_UPROBE flag. */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

/*
 * Union of all SYSCALL_WORK flags that require processing on the syscall
 * entry path (checked in syscall_enter_from_user_mode_work()) and on the
 * exit path (checked in syscall_exit_to_user_mode_work()) respectively.
 * Architectures can extend either set via ARCH_SYSCALL_WORK_ENTER/EXIT.
 */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
				 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)

/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif

/* Handle a syscall intercepted by syscall user dispatch (see syscall_trace_enter()). */
bool syscall_user_dispatch(struct pt_regs *regs);

/*
 * Out-of-line tracepoint helpers. trace_syscall_enter() may return a
 * modified syscall number; trace_syscall_exit() reports the return value.
 */
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);

/*
 * Feed the syscall number and its first four arguments to the audit
 * subsystem, but only when an audit context is active for this task.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

/*
 * Slow path of syscall entry: process all pending SYSCALL_WORK flags in
 * their required order. Returns the (possibly rewritten) syscall number,
 * or -1L when the syscall must be skipped.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs,
						unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		/* SYSCALL_EMU means the syscall is emulated, i.e. never run. */
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	/* The tracepoint handler may also rewrite the syscall number. */
	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	/* GNU ?: - yields 'ret' when non-zero, otherwise the syscall number. */
	return ret ? : syscall;
}

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking enter_from_user_mode(), enabling interrupts and
 * extra architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	/* Fast path: no entry work pending, run the syscall unmodified. */
	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, work);

	return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Non-instrumentable state establishment must precede everything. */
	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static __always_inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs,
					      unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			/* One-shot flag: consumed here, cleared for the next syscall. */
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/* Report to the ptracer for single-step and/or plain syscall tracing. */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}

/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* Debug check: a syscall must not return with interrupts disabled. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable_exit_to_user();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Non-instrumentable: final lockdep/RCU/context-tracking transition. */
	exit_to_user_mode();
}

#endif