diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-16 11:30:10 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-16 11:30:10 -0800 |
commit | e994cc240a3b75744c33ca9b8d74f71f0fcd8852 (patch) | |
tree | 10809f00d4cbb97bff138301b21edfacf8b129af | |
parent | ba1d41a55e4d07c7b27ee2f6e7cf5b5348849261 (diff) | |
parent | 2c07343abd8932200a45ff7b10950e71081e9e77 (diff) |
Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull seccomp updates from Kees Cook:
"The major change here is finally gaining seccomp constant-action
bitmaps, which internally reduces the seccomp overhead for many
real-world syscall filters to O(1), as discussed at Plumbers this
year.
- Improve seccomp performance via constant-action bitmaps (YiFei Zhu
& Kees Cook)
- Fix bogus __user annotations (Jann Horn)
- Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"
* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
selftests/seccomp: Update kernel config
seccomp: Remove bogus __user annotations
seccomp/cache: Report cache data through /proc/pid/seccomp_cache
xtensa: Enable seccomp architecture tracking
sh: Enable seccomp architecture tracking
s390: Enable seccomp architecture tracking
riscv: Enable seccomp architecture tracking
powerpc: Enable seccomp architecture tracking
parisc: Enable seccomp architecture tracking
csky: Enable seccomp architecture tracking
arm: Enable seccomp architecture tracking
arm64: Enable seccomp architecture tracking
selftests/seccomp: Compare bitmap vs filter overhead
x86: Enable seccomp architecture tracking
seccomp/cache: Add "emulator" to check if filter is constant allow
seccomp/cache: Lookup syscall allowlist bitmap for fast path
-rw-r--r-- | arch/Kconfig | 17 | ||||
-rw-r--r-- | arch/arm/include/asm/Kbuild | 1 | ||||
-rw-r--r-- | arch/arm/include/asm/seccomp.h | 11 | ||||
-rw-r--r-- | arch/arm64/include/asm/seccomp.h | 9 | ||||
-rw-r--r-- | arch/csky/include/asm/Kbuild | 1 | ||||
-rw-r--r-- | arch/csky/include/asm/seccomp.h | 11 | ||||
-rw-r--r-- | arch/parisc/include/asm/Kbuild | 1 | ||||
-rw-r--r-- | arch/parisc/include/asm/seccomp.h | 22 | ||||
-rw-r--r-- | arch/powerpc/include/asm/seccomp.h | 23 | ||||
-rw-r--r-- | arch/riscv/include/asm/seccomp.h | 10 | ||||
-rw-r--r-- | arch/s390/include/asm/seccomp.h | 9 | ||||
-rw-r--r-- | arch/sh/include/asm/seccomp.h | 10 | ||||
-rw-r--r-- | arch/x86/include/asm/seccomp.h | 20 | ||||
-rw-r--r-- | arch/xtensa/include/asm/Kbuild | 1 | ||||
-rw-r--r-- | arch/xtensa/include/asm/seccomp.h | 11 | ||||
-rw-r--r-- | fs/proc/base.c | 6 | ||||
-rw-r--r-- | include/linux/seccomp.h | 7 | ||||
-rw-r--r-- | kernel/seccomp.c | 296 | ||||
-rw-r--r-- | tools/testing/selftests/seccomp/config | 1 | ||||
-rw-r--r-- | tools/testing/selftests/seccomp/seccomp_benchmark.c | 151 | ||||
-rw-r--r-- | tools/testing/selftests/seccomp/settings | 2 |
21 files changed, 590 insertions, 30 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 96992b01d806..d4bdc19ed3ad 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER - secure_computing return value is checked and a return value of -1 results in the system call being skipped immediately. - seccomp syscall wired up + - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE, + SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If + COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too. config SECCOMP prompt "Enable seccomp to safely execute untrusted bytecode" @@ -514,6 +517,20 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config SECCOMP_CACHE_DEBUG + bool "Show seccomp filter cache status in /proc/pid/seccomp_cache" + depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR + depends on PROC_FS + help + This enables the /proc/pid/seccomp_cache interface to monitor + seccomp cache data. The file format is subject to change. Reading + the file requires CAP_SYS_ADMIN. + + This option is for debugging only. Enabling presents the risk that + an adversary may be able to infer the seccomp filter logic. + + If unsure, say N. + config HAVE_ARCH_STACKLEAK bool help diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 383635b68763..4a0848aef207 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -4,7 +4,6 @@ generic-y += extable.h generic-y += flat.h generic-y += local64.h generic-y += parport.h -generic-y += seccomp.h generated-y += mach-types.h generated-y += unistd-nr.h diff --git a/arch/arm/include/asm/seccomp.h b/arch/arm/include/asm/seccomp.h new file mode 100644 index 000000000000..e9ad0f37d2ba --- /dev/null +++ b/arch/arm/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include <asm-generic/seccomp.h> + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "arm" + +#endif /* _ASM_SECCOMP_H */ diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h index c36387170936..30256233788b 100644 --- a/arch/arm64/include/asm/seccomp.h +++ b/arch/arm64/include/asm/seccomp.h @@ -19,4 +19,13 @@ #include <asm-generic/seccomp.h> +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64 +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "aarch64" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM +# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "arm" +#endif + #endif /* _ASM_SECCOMP_H */ diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 64876e59e2ef..93372255984d 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -4,6 +4,5 @@ generic-y += gpio.h generic-y += kvm_para.h generic-y += local64.h generic-y += qrwlock.h -generic-y += seccomp.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/csky/include/asm/seccomp.h b/arch/csky/include/asm/seccomp.h new file mode 100644 index 000000000000..d33e758126fb --- /dev/null +++ b/arch/csky/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include <asm-generic/seccomp.h> + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "csky" + +#endif /* _ASM_SECCOMP_H */ diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index e3ee5c0bfe80..f16c4db80116 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h generic-y += kvm_para.h generic-y += local64.h generic-y += mcs_spinlock.h -generic-y += seccomp.h generic-y += user.h diff --git a/arch/parisc/include/asm/seccomp.h b/arch/parisc/include/asm/seccomp.h new file mode 100644 index 000000000000..b058b2220322 --- /dev/null +++ b/arch/parisc/include/asm/seccomp.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include <asm-generic/seccomp.h> + +#ifdef CONFIG_64BIT +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "parisc64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "parisc" +# endif +#else /* !CONFIG_64BIT */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "parisc" +#endif + +#endif /* _ASM_SECCOMP_H */ diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h index 51209f6071c5..ac2033f134f0 100644 --- a/arch/powerpc/include/asm/seccomp.h +++ b/arch/powerpc/include/asm/seccomp.h @@ -8,4 +8,27 @@ #include <asm-generic/seccomp.h> +#ifdef __LITTLE_ENDIAN__ +#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE +#define __SECCOMP_ARCH_LE_NAME "le" +#else +#define __SECCOMP_ARCH_LE 0 +#define __SECCOMP_ARCH_LE_NAME +#endif + +#ifdef CONFIG_PPC64 +# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME +# endif +#else /* !CONFIG_PPC64 */ +# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME +#endif + #endif /* _ASM_POWERPC_SECCOMP_H */ diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h index bf7744ee3b3d..c7ee6a3507be 100644 --- a/arch/riscv/include/asm/seccomp.h +++ b/arch/riscv/include/asm/seccomp.h @@ -7,4 +7,14 @@ #include <asm-generic/seccomp.h> +#ifdef CONFIG_64BIT +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "riscv64" +#else /* !CONFIG_64BIT */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "riscv32" +#endif + #endif /* _ASM_SECCOMP_H */ diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 795bbe0d7ca6..71d46f0ba97b 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -16,4 +16,13 @@ #include <asm-generic/seccomp.h> +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "s390x" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "s390" +#endif + #endif /* _ASM_S390_SECCOMP_H */ diff --git a/arch/sh/include/asm/seccomp.h b/arch/sh/include/asm/seccomp.h index 54111e4d32b8..d4578395fd66 100644 --- a/arch/sh/include/asm/seccomp.h +++ b/arch/sh/include/asm/seccomp.h @@ -8,4 +8,14 @@ #define __NR_seccomp_exit __NR_exit #define __NR_seccomp_sigreturn __NR_rt_sigreturn +#ifdef CONFIG_CPU_LITTLE_ENDIAN +#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE +#else +#define __SECCOMP_ARCH_LE 0 +#endif + +#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE) +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "sh" + #endif /* __ASM_SECCOMP_H */ diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 2bd1338de236..fef16e398161 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -16,6 +16,26 @@ #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn #endif +#ifdef CONFIG_X86_64 +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "x86_64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 +# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ia32" +# endif +/* + * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support + * caching them and they are treated as out of range syscalls, which will + * always pass through the BPF filter. + */ +#else /* !CONFIG_X86_64 */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ia32" +#endif + #include <asm-generic/seccomp.h> #endif /* _ASM_X86_SECCOMP_H */ diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index c59c42a1221a..9718e9593564 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h generic-y += param.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += seccomp.h generic-y += user.h diff --git a/arch/xtensa/include/asm/seccomp.h b/arch/xtensa/include/asm/seccomp.h new file mode 100644 index 000000000000..f1cb6b0a9e1f --- /dev/null +++ b/arch/xtensa/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include <asm-generic/seccomp.h> + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "xtensa" + +#endif /* _ASM_SECCOMP_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index e71040b68991..b3422cda2a91 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 47763f3999f7..0c564e5d40ff 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task, return -EINVAL; } #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +struct seq_file; + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +#endif #endif /* _LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15f47fc11d13..952dc1c90229 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,6 +143,38 @@ struct notification { struct list_head notifications; }; +#ifdef SECCOMP_ARCH_NATIVE +/** + * struct action_cache - per-filter cache of seccomp actions per + * arch/syscall pair + * + * @allow_native: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * native architecture. + * @allow_compat: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * compat architecture. + */ +struct action_cache { + DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); +#ifdef SECCOMP_ARCH_COMPAT + DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); +#endif +}; +#else +struct action_cache { }; + +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + return false; +} + +static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * struct seccomp_filter - container for seccomp BPF programs * @@ -159,6 +191,7 @@ struct notification { * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. + * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -180,6 +213,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; @@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) return 0; } +#ifdef SECCOMP_ARCH_NATIVE +static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, + size_t bitmap_size, + int syscall_nr) +{ + if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) + return false; + syscall_nr = array_index_nospec(syscall_nr, bitmap_size); + + return test_bit(syscall_nr, bitmap); +} + +/** + * seccomp_cache_check_allow - lookup seccomp cache + * @sfilter: The seccomp filter + * @sd: The seccomp data to lookup the cache with + * + * Returns true if the seccomp_data is cached and allowed. + */ +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + int syscall_nr = sd->nr; + const struct action_cache *cache = &sfilter->cache; + +#ifndef SECCOMP_ARCH_COMPAT + /* A native-only architecture doesn't need to check sd->arch. */ + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); +#else + if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); + if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) + return seccomp_cache_check_allow_bitmap(cache->allow_compat, + SECCOMP_ARCH_COMPAT_NR, + syscall_nr); +#endif /* SECCOMP_ARCH_COMPAT */ + + WARN_ON_ONCE(true); + return false; +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters @@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; + if (seccomp_cache_check_allow(f, sd)) + return SECCOMP_RET_ALLOW; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; + /* We are effectively holding the siglock by not having any sighand. */ + WARN_ON(tsk->sighand != NULL); + /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; __seccomp_filter_release(orig); @@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) + true; +#else + false; +#endif if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -609,6 +700,148 @@ out: return filter; } +#ifdef SECCOMP_ARCH_NATIVE +/** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs + * @sd: The seccomp data to check against, only syscall number and arch + * number are considered constant. + */ +static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + struct seccomp_data *sd) +{ + unsigned int reg_value = 0; + unsigned int pc; + bool op_res; + + if (WARN_ON_ONCE(!fprog)) + return false; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; + u16 code = insn->code; + u32 k = insn->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + switch (k) { + case offsetof(struct seccomp_data, nr): + reg_value = sd->nr; + break; + case offsetof(struct seccomp_data, arch): + reg_value = sd->arch; + break; + default: + /* can't optimize (non-constant value load) */ + return false; + } + break; + case BPF_RET | BPF_K: + /* reached return with constant values only, check allow */ + return k == SECCOMP_RET_ALLOW; + case BPF_JMP | BPF_JA: + pc += insn->k; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + switch (BPF_OP(code)) { + case BPF_JEQ: + op_res = reg_value == k; + break; + case BPF_JGE: + op_res = reg_value >= k; + break; + case BPF_JGT: + op_res = reg_value > k; + break; + case BPF_JSET: + op_res = !!(reg_value & k); + break; + default: + /* can't optimize (unknown jump) */ + return false; + } + + pc += op_res ? insn->jt : insn->jf; + break; + case BPF_ALU | BPF_AND | BPF_K: + reg_value &= k; + break; + default: + /* can't optimize (unknown insn) */ + return false; + } + } + + /* ran off the end of the filter?! */ + WARN_ON(1); + return false; +} + +static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, + void *bitmap, const void *bitmap_prev, + size_t bitmap_size, int arch) +{ + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; + struct seccomp_data sd; + int nr; + + if (bitmap_prev) { + /* The new filter must be as restrictive as the last. */ + bitmap_copy(bitmap, bitmap_prev, bitmap_size); + } else { + /* Before any filters, all syscalls are always allowed. */ + bitmap_fill(bitmap, bitmap_size); + } + + for (nr = 0; nr < bitmap_size; nr++) { + /* No bitmap change: not a cacheable action. */ + if (!test_bit(nr, bitmap)) + continue; + + sd.nr = nr; + sd.arch = arch; + + /* No bitmap change: continue to always allow. */ + if (seccomp_is_const_allow(fprog, &sd)) + continue; + + /* + * Not a cacheable action: always run filters. + * atomic clear_bit() not needed, filter not visible yet. + */ + __clear_bit(nr, bitmap); + } +} + +/** + * seccomp_cache_prepare - emulate the filter to find cachable syscalls + * @sfilter: The seccomp filter + * + * Returns 0 if successful or -errno if error occurred. + */ +static void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + struct action_cache *cache = &sfilter->cache; + const struct action_cache *cache_prev = + sfilter->prev ? &sfilter->prev->cache : NULL; + + seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, + cache_prev ? cache_prev->allow_native : NULL, + SECCOMP_ARCH_NATIVE_NR, + SECCOMP_ARCH_NATIVE); + +#ifdef SECCOMP_ARCH_COMPAT + seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, + cache_prev ? cache_prev->allow_compat : NULL, + SECCOMP_ARCH_COMPAT_NR, + SECCOMP_ARCH_COMPAT); +#endif /* SECCOMP_ARCH_COMPAT */ +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags, * task reference. */ filter->prev = current->seccomp.filter; + seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(¤t->seccomp.filter_count); @@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int read_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { char names[sizeof(seccomp_actions_avail)]; @@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, return proc_dostring(&table, 0, buffer, lenp, ppos); } -static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int write_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; @@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void) device_initcall(seccomp_sysctl_init) #endif /* CONFIG_SYSCTL */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ +static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, + const void *bitmap, size_t bitmap_size) +{ + int nr; + + for (nr = 0; nr < bitmap_size; nr++) { + bool cached = test_bit(nr, bitmap); + char *status = cached ? "ALLOW" : "FILTER"; + + seq_printf(m, "%s %d %s\n", name, nr, status); + } +} + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct seccomp_filter *f; + unsigned long flags; + + /* + * We don't want some sandboxed process to know what their seccomp + * filters consist of. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + + if (!lock_task_sighand(task, &flags)) + return -ESRCH; + + f = READ_ONCE(task->seccomp.filter); + if (!f) { + unlock_task_sighand(task, &flags); + return 0; + } + + /* prevent filter from being freed while we are printing it */ + __get_seccomp_filter(f); + unlock_task_sighand(task, &flags); + + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, + f->cache.allow_native, + SECCOMP_ARCH_NATIVE_NR); + +#ifdef SECCOMP_ARCH_COMPAT + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, + f->cache.allow_compat, + SECCOMP_ARCH_COMPAT_NR); +#endif /* SECCOMP_ARCH_COMPAT */ + + __put_seccomp_filter(f); + return 0; +} +#endif /* CONFIG_SECCOMP_CACHE_DEBUG */ diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config index 64c19d8eba79..ad431a5178fb 100644 --- a/tools/testing/selftests/seccomp/config +++ b/tools/testing/selftests/seccomp/config @@ -1,3 +1,4 @@ +CONFIG_PID_NS=y CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y CONFIG_USER_NS=y diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 91f5a89cadac..fcc806585266 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -4,12 +4,16 @@ */ #define _GNU_SOURCE #include <assert.h> +#include <limits.h> +#include <stdbool.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> +#include <sys/param.h> #include <sys/prctl.h> #include <sys/syscall.h> #include <sys/types.h> @@ -70,18 +74,74 @@ unsigned long long calibrate(void) return samples * seconds; } +bool approx(int i_one, int i_two) +{ + double one = i_one, one_bump = one * 0.01; + double two = i_two, two_bump = two * 0.01; + + one_bump = one + MAX(one_bump, 2.0); + two_bump = two + MAX(two_bump, 2.0); + + /* Equal to, or within 1% or 2 digits */ + if (one == two || + (one > two && one <= two_bump) || + (two > one && two <= one_bump)) + return true; + return false; +} + +bool le(int i_one, int i_two) +{ + if (i_one <= i_two) + return true; + return false; +} + +long compare(const char *name_one, const char *name_eval, const char *name_two, + unsigned long long one, bool (*eval)(int, int), unsigned long long two) +{ + bool good; + + printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, + (long long)one, name_eval, (long long)two); + if (one > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); + return 1; + } + if (two > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); + return 1; + } + + good = eval(one, two); + printf("%s\n", good ? "✔️" : "❌"); + + return good ? 0 : 1; +} + int main(int argc, char *argv[]) { + struct sock_filter bitmap_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog bitmap_prog = { + .len = (unsigned short)ARRAY_SIZE(bitmap_filter), + .filter = bitmap_filter, + }; struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; - long ret; - unsigned long long samples; - unsigned long long native, filter1, filter2; + + long ret, bits; + unsigned long long samples, calc; + unsigned long long native, filter1, filter2, bitmap1, bitmap2; + unsigned long long entry, per_filter1, per_filter2; printf("Current BPF sysctl settings:\n"); system("sysctl net.core.bpf_jit_enable"); @@ -101,35 +161,82 @@ int main(int argc, char *argv[]) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); assert(ret == 0); - /* One filter */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + /* One filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); - filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); + bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); + + /* Second filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - if (filter1 == native) - printf("No overhead measured!? Try running again with more samples.\n"); + bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); - /* Two filters */ + /* Third filter, can no longer be converted to bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); - filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); - - /* Calculations */ - printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", - filter1 - native); + filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); - printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", - filter2 - native); + /* Fourth filter, can not be converted to bitmap because of filter 3 */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - printf("Estimated seccomp per-filter overhead: %llu ns\n", - filter2 - filter1); + filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); + + /* Estimations */ +#define ESTIMATE(fmt, var, what) do { \ + var = (what); \ + printf("Estimated " fmt ": %llu ns\n", var); \ + if (var > INT_MAX) \ + goto more_samples; \ + } while (0) + + ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, + bitmap1 - native); + ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc, + bitmap2 - native); + ESTIMATE("total seccomp overhead for 3 full filters", calc, + filter1 - native); + ESTIMATE("total seccomp overhead for 4 full filters", calc, + filter2 - native); + ESTIMATE("seccomp entry overhead", entry, + bitmap1 - native - (bitmap2 - bitmap1)); + ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1, + filter2 - filter1); + ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, + (filter2 - native - entry) / 4); + + printf("Expectations:\n"); + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); + bits = compare("native", "≤", "1 filter", native, le, filter1); + if (bits) + goto more_samples; + + ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", + per_filter1, approx, per_filter2); + + bits = compare("1 bitmapped", "≈", "2 bitmapped", + bitmap1 - native, approx, bitmap2 - native); + if (bits) { + printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); + goto out; + } - printf("Estimated seccomp entry overhead: %llu ns\n", - filter1 - native - (filter2 - filter1)); + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); + ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", + entry + (per_filter1 * 4) + native, approx, filter2); + if (ret == 0) + goto out; +more_samples: + printf("Saw unexpected benchmark result. Try running again with more samples?\n"); +out: return 0; } diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings index ba4d85f74cd6..6091b45d226b 100644 --- a/tools/testing/selftests/seccomp/settings +++ b/tools/testing/selftests/seccomp/settings @@ -1 +1 @@ -timeout=90 +timeout=120 |