bianbu-linux-6.6/kernel/entry/common.c
Oleg Nesterov bf9ad37dc8 signal, x86: Delay calling signals in atomic on RT enabled kernels
On x86_64 we must disable preemption before we enable interrupts
for stack faults, int3 and debugging, because the current task is using
a per CPU debug stack defined by the IST. If we schedule out, another task
can come in and use the same stack and cause the stack to be corrupted
and crash the kernel on return.

When CONFIG_PREEMPT_RT is enabled, spinlock_t locks become sleeping, and
one of these is the spin lock used in signal handling.

Some of the debug code (int3) causes do_trap() to send a signal.
This function calls a spinlock_t lock that has been converted to a
sleeping lock. If this happens, the above issues with the corrupted
stack is possible.

Instead of calling the signal right away, for PREEMPT_RT and x86,
the signal information is stored on the stacks task_struct and
TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume
code will send the signal when preemption is enabled.

[ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT to
  ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ]
[bigeasy: Add on 32bit as per Yang Shi, minor rewording. ]
[ tglx: Use a config option ]

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/Ygq5aBB/qMQw6aP5@linutronix.de
2022-03-04 14:58:54 +01:00

491 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
arch_check_user_regs(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit_irqoff();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
}
void noinstr enter_from_user_mode(struct pt_regs *regs)
{
__enter_from_user_mode(regs);
}
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
unsigned long args[6];
syscall_get_arguments(current, regs, args);
audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
}
}
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long work)
{
long ret = 0;
/*
* Handle Syscall User Dispatch. This must comes first, since
* the ABI here can be something that doesn't make sense for
* other syscall_work features.
*/
if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
if (syscall_user_dispatch(regs))
return -1L;
}
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
ret = arch_syscall_enter_tracehook(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
/* Do seccomp after ptrace, to catch any tracer changes. */
if (work & SYSCALL_WORK_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
}
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
syscall_enter_audit(regs, syscall);
return ret ? : syscall;
}
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
if (work & SYSCALL_WORK_ENTER)
syscall = syscall_trace_enter(regs, syscall, work);
return syscall;
}
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
return __syscall_enter_from_user_work(regs, syscall);
}
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
long ret;
__enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
ret = __syscall_enter_from_user_work(regs, syscall);
instrumentation_end();
return ret;
}
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
__enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
user_enter_irqoff();
arch_exit_to_user_mode();
lockdep_hardirqs_on(CALLER_ADDR0);
}
void noinstr exit_to_user_mode(void)
{
__exit_to_user_mode();
}
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}
#ifdef CONFIG_RT_DELAYED_SIGNALS
static inline void raise_delayed_signal(void)
{
if (unlikely(current->forced_info.si_signo)) {
force_sig_info(&current->forced_info);
current->forced_info.si_signo = 0;
}
}
#else
static inline void raise_delayed_signal(void) { }
#endif
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
while (ti_work & EXIT_TO_USER_MODE_WORK) {
local_irq_enable_exit_to_user(ti_work);
if (ti_work & _TIF_NEED_RESCHED)
schedule();
raise_delayed_signal();
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
/*
* Disable interrupts and reevaluate the work flags as they
* might have changed while interrupts and preemption was
* enabled above.
*/
local_irq_disable_exit_to_user();
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
return ti_work;
}
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work = read_thread_flags();
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
/* Ensure that the address limit is intact and no locks are held */
addr_limit_user_check();
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
}
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
* instruction has been already reported in syscall_enter_from_user_mode().
*/
static inline bool report_single_step(unsigned long work)
{
if (work & SYSCALL_WORK_SYSCALL_EMU)
return false;
return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
bool step;
/*
* If the syscall was rolled back due to syscall user dispatching,
* then the tracers below are not invoked for the same reason as
* the entry side was not invoked in syscall_trace_enter(): The ABI
* of these syscalls is unknown.
*/
if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
if (unlikely(current->syscall_dispatch.on_dispatch)) {
current->syscall_dispatch.on_dispatch = false;
return;
}
}
audit_syscall_exit(regs);
if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
trace_sys_exit(regs, syscall_get_return_value(current, regs));
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
arch_syscall_exit_tracehook(regs, step);
}
/*
* Syscall specific exit to user mode preparation. Runs with interrupts
* enabled.
*/
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
unsigned long nr = syscall_get_nr(current, regs);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
local_irq_enable();
}
rseq_syscall(regs);
/*
* Do one-time syscall specific work. If these work items are
* enabled, we want to run them exactly once per syscall exit with
* interrupts enabled.
*/
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
}
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
exit_to_user_mode_prepare(regs);
}
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
__syscall_exit_to_user_mode_work(regs);
}
__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
__exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
__enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
__exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
.exit_rcu = false,
};
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
return ret;
}
/*
* If this entry hit the idle task invoke rcu_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
* terminate a grace period, if and only if the timer interrupt is
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
rcu_irq_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
ret.exit_rcu = true;
return ret;
}
/*
* If RCU is watching then RCU only wants to check whether it needs
* to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
* already contains a warning when RCU is not watching, so no point
* in having another one here.
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
return ret;
}
void irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
if (need_resched())
preempt_schedule_irq();
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
lockdep_assert_irqs_disabled();
/* Check whether this returns to user mode */
if (user_mode(regs)) {
irqentry_exit_to_user_mode(regs);
} else if (!regs_irqs_disabled(regs)) {
/*
* If RCU was not watching on entry this needs to be done
* carefully and needs the same ordering of lockdep/tracing
* and RCU as the return to user mode path.
*/
if (state.exit_rcu) {
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
rcu_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
if (IS_ENABLED(CONFIG_PREEMPTION)) {
#ifdef CONFIG_PREEMPT_DYNAMIC
static_call(irqentry_exit_cond_resched)();
#else
irqentry_exit_cond_resched();
#endif
}
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
rcu_irq_exit();
}
}
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
irqentry_state_t irq_state;
irq_state.lockdep = lockdep_hardirqs_enabled();
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
rcu_nmi_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
return irq_state;
}
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
instrumentation_begin();
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
}
instrumentation_end();
rcu_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
__nmi_exit();
}