diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 2 | ||||
-rw-r--r-- | kernel/audit.h | 2 | ||||
-rw-r--r-- | kernel/auditsc.c | 12 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 6 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 6 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 10 | ||||
-rw-r--r-- | kernel/compat.c | 23 | ||||
-rw-r--r-- | kernel/events/core.c | 3 | ||||
-rw-r--r-- | kernel/extable.c | 45 | ||||
-rw-r--r-- | kernel/irq/chip.c | 2 | ||||
-rw-r--r-- | kernel/locking/test-ww_mutex.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 10 | ||||
-rw-r--r-- | kernel/sched/wait.c | 85 | ||||
-rw-r--r-- | kernel/seccomp.c | 321 | ||||
-rw-r--r-- | kernel/sysctl_binary.c | 21 | ||||
-rw-r--r-- | kernel/trace/trace.c | 19 | ||||
-rw-r--r-- | kernel/trace/trace.h | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_mmiotrace.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 15 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 2 |
21 files changed, 484 insertions, 107 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 5b1284370367..5e72af29ab73 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) if (file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + __kernel_write(file, &ac, sizeof(acct_t), &pos); file_end_write(file); } out: diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..9b110ae17ee3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -182,7 +182,7 @@ struct audit_context { mqd_t mqdes; size_t msg_len; unsigned int msg_prio; - struct timespec abs_timeout; + struct timespec64 abs_timeout; } mq_sendrecv; struct { int oflag; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index aac1a41f82bd..ecc23e25c9eb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, - context->mq_sendrecv.abs_timeout.tv_sec, + (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: @@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) + const struct timespec64 *abs_timeout) { struct audit_context *context = current->audit_context; - struct timespec *p = &context->mq_sendrecv.abs_timeout; + struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) - memcpy(p, abs_timeout, sizeof(struct timespec)); + memcpy(p, abs_timeout, sizeof(*p)); else - memset(p, 0, sizeof(struct timespec)); + memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e093d9a2c4dd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err = -EINVAL; u64 cost; - int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; + /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), __alignof__(unsigned long)); @@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void dev_map_free(struct bpf_map *map) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(&map_idr_lock); + spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); if (do_idr_lock) - spin_unlock_bh(&map_idr_lock); + spin_unlock_irqrestore(&map_idr_lock, flags); else __release(&map_idr_lock); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 477b6932c3c1..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2292,7 +2292,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } @@ -4204,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, diff --git a/kernel/compat.c b/kernel/compat.c index 6f0a0e723a06..772e038d04d9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int compat_convert_timespec(struct timespec __user **kts, - const void __user *cts) -{ - struct timespec ts; - struct timespec __user *uts; - - if (!cts || COMPAT_USE_64BIT_TIME) { - *kts = (struct timespec __user *)cts; - return 0; - } - - uts = compat_alloc_user_space(sizeof(ts)); - if (!uts) - return -EFAULT; - if (compat_get_timespec(&ts, cts)) - return -EFAULT; - if (copy_to_user(uts, &ts, sizeof(ts))) - return -EFAULT; - - *kts = uts; - return 0; -} - int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { struct compat_itimerval v32; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. @@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f51b7b6d2451..6fc89fd93824 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 39f56c870051..0e4cd64ad2c0 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -362,7 +362,7 @@ static int *get_random_order(int count) int *order; int n, r, tmp; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1250e4bd4b85..0c44c7b42e6d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -882,6 +882,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1015,6 +1020,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index d6afed6d0752..98feab7933c7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); +/* + * Scan threshold to break wait queue walk. + * This allows a waker to take a break from holding the + * wait queue lock during the wait queue walk. + */ +#define WAITQUEUE_WALK_BREAK_CNT 64 /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just @@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) +static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key, + wait_queue_entry_t *bookmark) { wait_queue_entry_t *curr, *next; + int cnt = 0; + + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { + curr = list_next_entry(bookmark, entry); + + list_del(&bookmark->entry); + bookmark->flags = 0; + } else + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); - list_for_each_entry_safe(curr, next, &wq_head->head, entry) { + if (&curr->entry == &wq_head->head) + return nr_exclusive; + + list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - int ret = curr->func(curr, mode, wake_flags, key); + int ret; + + if (flags & WQ_FLAG_BOOKMARK) + continue; + + ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; + + if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && + (&next->entry != &wq_head->head)) { + bookmark->flags = WQ_FLAG_BOOKMARK; + list_add_tail(&bookmark->entry, &next->entry); + break; + } + } + return nr_exclusive; +} + +static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + unsigned long flags; + wait_queue_entry_t bookmark; + + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; - - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); @@ -104,16 +155,23 @@ EXPORT_SYMBOL(__wake_up); */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); +void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, + unsigned int mode, void *key, wait_queue_entry_t *bookmark) +{ + __wake_up_common(wq_head, mode, 1, 0, key, bookmark); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); + /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -134,7 +192,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ if (unlikely(!wq_head)) @@ -143,9 +200,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..c24579dfa7a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -17,11 +17,13 @@ #include <linux/audit.h> #include <linux/compat.h> #include <linux/coredump.h> +#include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> #include <linux/slab.h> #include <linux/syscalls.h> +#include <linux/sysctl.h> #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include <asm/syscall.h> @@ -42,6 +44,7 @@ * get/put helpers should be used when accessing an instance * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @@ -57,6 +60,7 @@ */ struct seccomp_filter { refcount_t usage; + bool log; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters + * @match: stores struct seccomp_filter that resulted in the return value, + * unless filter returned SECCOMP_RET_ALLOW, in which case it will + * be unchanged. * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(const struct seccomp_data *sd) +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) +static u32 seccomp_run_filters(const struct seccomp_data *sd, + struct seccomp_filter **match) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL; + return SECCOMP_RET_KILL_PROCESS; if (!sd) { populate_seccomp_data(&sd_local); @@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; + *match = f; + } } return ret; } @@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Set log flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_LOG) + filter->log = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -514,6 +529,65 @@ static void seccomp_send_sigsys(int syscall, int reason) } #endif /* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL_PROCESS (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 1) +#define SECCOMP_LOG_TRAP (1 << 2) +#define SECCOMP_LOG_ERRNO (1 << 3) +#define SECCOMP_LOG_TRACE (1 << 4) +#define SECCOMP_LOG_LOG (1 << 5) +#define SECCOMP_LOG_ALLOW (1 << 6) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | + SECCOMP_LOG_KILL_THREAD | + SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | + SECCOMP_LOG_TRACE | + SECCOMP_LOG_LOG; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, + bool requested) +{ + bool log = false; + + switch (action) { + case SECCOMP_RET_ALLOW: + break; + case SECCOMP_RET_TRAP: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; + break; + case SECCOMP_RET_ERRNO: + log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; + break; + case SECCOMP_RET_TRACE: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; + break; + case SECCOMP_RET_LOG: + log = seccomp_actions_logged & SECCOMP_LOG_LOG; + break; + case SECCOMP_RET_KILL_THREAD: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; + break; + case SECCOMP_RET_KILL_PROCESS: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; + } + + /* + * Force an audit message to be emitted when the action is RET_KILL_*, + * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is + * allowed to be logged by the admin. + */ + if (log) + return __audit_seccomp(syscall, signr, action); + + /* + * Let the audit subsystem decide if the action should be audited based + * on whether the current task itself is being audited. + */ + return audit_seccomp(syscall, signr, action); +} + /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit @@ -539,7 +613,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -566,6 +640,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_filter *match = NULL; int data; /* @@ -574,9 +649,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); - filter_ret = seccomp_run_filters(sd); + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; - action = filter_ret & SECCOMP_RET_ACTION; + action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: @@ -637,14 +712,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_LOG: + seccomp_log(this_syscall, 0, action, true); + return 0; + case SECCOMP_RET_ALLOW: + /* + * Note that the "match" filter will always be NULL for + * this action since SECCOMP_RET_ALLOW is the starting + * state in seccomp_run_filters(). + */ return 0; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_KILL_PROCESS: default: - audit_seccomp(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (get_nr_threads(current) == 1) { + if (action == SECCOMP_RET_KILL_PROCESS || + get_nr_threads(current) == 1) { siginfo_t info; /* Show the original registers in the dump. */ @@ -653,13 +739,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - do_exit(SIGSYS); + if (action == SECCOMP_RET_KILL_PROCESS) + do_group_exit(SIGSYS); + else + do_exit(SIGSYS); } unreachable(); skip: - audit_seccomp(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action, match ? match->log : false); return -1; } #else @@ -794,6 +883,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags, } #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ + u32 action; + + if (copy_from_user(&action, uaction, sizeof(action))) + return -EFAULT; + + switch (action) { + case SECCOMP_RET_KILL_PROCESS: + case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + case SECCOMP_RET_LOG: + case SECCOMP_RET_ALLOW: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) @@ -805,6 +917,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); + case SECCOMP_GET_ACTION_AVAIL: + if (flags != 0) + return -EINVAL; + + return seccomp_get_action_avail(uargs); default: return -EINVAL; } @@ -922,3 +1039,185 @@ out: return ret; } #endif + +#ifdef CONFIG_SYSCTL + +/* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" +#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" +#define SECCOMP_RET_TRAP_NAME "trap" +#define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_LOG_NAME "log" +#define SECCOMP_RET_ALLOW_NAME "allow" + +static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_PROCESS_NAME " " + SECCOMP_RET_KILL_THREAD_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " + SECCOMP_RET_ALLOW_NAME; + +struct seccomp_log_name { + u32 log; + const char *name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, + { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, + { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, + { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, + { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, + { } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, + u32 actions_logged) +{ + const struct seccomp_log_name *cur; + bool append_space = false; + + for (cur = seccomp_log_names; cur->name && size; cur++) { + ssize_t ret; + + if (!(actions_logged & cur->log)) + continue; + + if (append_space) { + ret = strscpy(names, " ", size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } else + append_space = true; + + ret = strscpy(names, cur->name, size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } + + return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, + const char *name) +{ + const struct seccomp_log_name *cur; + + for (cur = seccomp_log_names; cur->name; cur++) { + if (!strcmp(cur->name, name)) { + *action_logged = cur->log; + return true; + } + } + + return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ + char *name; + + *actions_logged = 0; + while ((name = strsep(&names, " ")) && *name) { + u32 action_logged = 0; + + if (!seccomp_action_logged_from_name(&action_logged, name)) + return false; + + *actions_logged |= action_logged; + } + + return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(names, 0, sizeof(names)); + + if (!write) { + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged)) + return -EINVAL; + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) { + u32 actions_logged; + + if (!seccomp_actions_logged_from_names(&actions_logged, + table.data)) + return -EINVAL; + + if (actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; + + seccomp_actions_logged = actions_logged; + } + + return 0; +} + +static struct ctl_path seccomp_sysctl_path[] = { + { .procname = "kernel", }, + { .procname = "seccomp", }, + { } +}; + +static struct ctl_table seccomp_sysctl_table[] = { + { + .procname = "actions_avail", + .data = (void *) &seccomp_actions_avail, + .maxlen = sizeof(seccomp_actions_avail), + .mode = 0444, + .proc_handler = proc_dostring, + }, + { + .procname = "actions_logged", + .mode = 0644, + .proc_handler = seccomp_actions_logged_handler, + }, + { } +}; + +static int __init seccomp_sysctl_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); + if (!hdr) + pr_warn("seccomp: sysctl registration failed\n"); + else + kmemleak_not_leak(hdr); + + return 0; +} + +device_initcall(seccomp_sysctl_init) + +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 02e1859f2ca8..58ea8c03662e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1016,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1029,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1057,8 +1059,9 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1087,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1100,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1120,8 +1124,9 @@ static ssize_t bin_uuid(struct file *file, if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; uuid_t uuid; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1154,8 +1159,9 @@ static ssize_t bin_dn_node_address(struct file *file, char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1188,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file, __le16 dnaddr; char buf[15]; int len; + loff_t pos = 0; result = -EINVAL; if (newlen != sizeof(dnaddr)) @@ -1201,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - result = kernel_write(file, buf, len, 0); + result = kernel_write(file, buf, len, &pos); if (result < 0) goto out; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; @@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,8 @@ struct tracer { #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 181e139a8057..61e7f0678d33 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps, int pos = ps->lasterr_pos; char *buf, *pbuf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = .close = mmio_close, .read = mmio_read, .print_line = mmio_print_line, + .noboot = true, }; __init static int init_mmio_trace(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c4eef20301c..696afe72d3b1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -565,7 +565,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; - unsigned long args[sys_data->nb_args]; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; } param; int i; |