summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/arm/include/asm/unistd.h1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/include/asm/unistd.h3
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/ia32/sys_ia32.c12
-rw-r--r--arch/x86/include/asm/unistd.h1
-rw-r--r--arch/xtensa/include/asm/unistd.h1
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--include/linux/sched/task.h17
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/sched.h16
-rw-r--r--kernel/fork.c191
-rw-r--r--kernel/sys_ni.c2
17 files changed, 218 insertions, 41 deletions
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 9fb00973c608..3676e82cf95c 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -37,6 +37,7 @@
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
/*
* Unimplemented (or alternatively implemented) syscalls
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 81e6e1817c45..6da7dc4d79cc 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -448,3 +448,4 @@
432 common fsmount sys_fsmount
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
+435 common clone3 sys_clone3
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index ede7b88d4f15..2629a68b8724 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,10 +38,11 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 435
+#define __NR_compat_syscalls 436
#endif
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
#ifndef __COMPAT_SYSCALL_NR
#include <uapi/asm/unistd.h>
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 52415923e08f..94ab29cf4f00 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -877,6 +877,8 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
__SYSCALL(__NR_fspick, sys_fspick)
#define __NR_pidfd_open 434
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
+#define __NR_clone3 435
+__SYSCALL(__NR_clone3, sys_clone3)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index ad706f83c755..09b0cd7dab0a 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
432 common fsmount sys_fsmount
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
+435 common clone3 sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 43e4429a5272..c00019abd076 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -439,3 +439,4 @@
432 i386 fsmount sys_fsmount __ia32_sys_fsmount
433 i386 fspick sys_fspick __ia32_sys_fspick
434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
+435 i386 clone3 sys_clone3 __ia32_sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1bee0a77fdd3..c29976eca4a8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -356,6 +356,7 @@
432 common fsmount __x64_sys_fsmount
433 common fspick __x64_sys_fspick
434 common pidfd_open __x64_sys_pidfd_open
+435 common clone3 __x64_sys_clone3/ptregs
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..64a6c952091e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
unsigned long, newsp, int __user *, parent_tidptr,
unsigned long, tls_val, int __user *, child_tidptr)
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
- tls_val);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls_val,
+ };
+
+ return _do_fork(&args);
}
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 146859efd83c..097589753fec 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -54,5 +54,6 @@
# define __ARCH_WANT_SYS_FORK
# define __ARCH_WANT_SYS_VFORK
# define __ARCH_WANT_SYS_CLONE
+# define __ARCH_WANT_SYS_CLONE3
#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h
index 30af4dc3ce7b..b52236245e51 100644
--- a/arch/xtensa/include/asm/unistd.h
+++ b/arch/xtensa/include/asm/unistd.h
@@ -3,6 +3,7 @@
#define _XTENSA_UNISTD_H
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
#include <uapi/asm/unistd.h>
#define __ARCH_WANT_NEW_STAT
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 782b81945ccc..25f4de729a6d 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -405,3 +405,4 @@
432 common fsmount sys_fsmount
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
+435 common clone3 sys_clone3
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..109a0df5af39 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,26 @@
*/
#include <linux/sched.h>
+#include <linux/uaccess.h>
struct task_struct;
struct rusage;
union thread_union;
+/* All the bits taken by the old clone syscall. */
+#define CLONE_LEGACY_FLAGS 0xffffffffULL
+
+struct kernel_clone_args {
+ u64 flags;
+ int __user *pidfd;
+ int __user *child_tid;
+ int __user *parent_tid;
+ int exit_signal;
+ unsigned long stack;
+ unsigned long stack_size;
+ unsigned long tls;
+};
+
/*
* This serializes "schedule()" and also protects
* the run-queue from deletions/modifications (but
@@ -73,7 +88,7 @@ extern void do_group_exit(int);
extern void exit_files(struct task_struct *);
extern void exit_itimers(struct signal_struct *);
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(struct kernel_clone_args *kargs);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 699aed6674a0..b01d54a5732e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -68,6 +68,7 @@ struct sigaltstack;
struct rseq;
union bpf_attr;
struct io_uring_params;
+struct clone_args;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -850,6 +851,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
int __user *, unsigned long);
#endif
#endif
+
+asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
+
asmlinkage long sys_execve(const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e5684a4512c0..9acfff0cd153 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -846,9 +846,11 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
__SYSCALL(__NR_fspick, sys_fspick)
#define __NR_pidfd_open 434
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
+#define __NR_clone3 435
+__SYSCALL(__NR_clone3, sys_clone3)
#undef __NR_syscalls
-#define __NR_syscalls 435
+#define __NR_syscalls 436
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 617bb59aa8ba..b3105ac1381a 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
#ifndef _UAPI_LINUX_SCHED_H
#define _UAPI_LINUX_SCHED_H
+#include <linux/types.h>
+
/*
* cloning flags:
*/
@@ -32,6 +34,20 @@
#define CLONE_IO 0x80000000 /* Clone io context */
/*
+ * Arguments for the clone3 syscall
+ */
+struct clone_args {
+ __aligned_u64 flags;
+ __aligned_u64 pidfd;
+ __aligned_u64 child_tid;
+ __aligned_u64 parent_tid;
+ __aligned_u64 exit_signal;
+ __aligned_u64 stack;
+ __aligned_u64 stack_size;
+ __aligned_u64 tls;
+};
+
+/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
diff --git a/kernel/fork.c b/kernel/fork.c
index 187c02ce534c..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1768,20 +1768,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
- unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls,
- int node)
+ int node,
+ struct kernel_clone_args *args)
{
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
+ u64 clone_flags = args->flags;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1831,14 +1827,11 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PIDFD) {
/*
- * - CLONE_PARENT_SETTID is useless for pidfds and also
- * parent_tidptr is used to return pidfds.
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
* - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags &
- (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
return ERR_PTR(-EINVAL);
}
@@ -1871,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
@@ -2031,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+ retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+ args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -2066,7 +2060,7 @@ static __latent_entropy struct task_struct *copy_process(
}
get_pid(pid); /* held by pidfile now */
- retval = put_user(pidfd, parent_tidptr);
+ retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
@@ -2109,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
- p->exit_signal = (clone_flags & CSIGNAL);
+ p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -2322,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
- cpu_to_node(cpu));
+ struct kernel_clone_args args = {
+ .flags = CLONE_VM,
+ };
+
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
init_idle_pids(task);
init_idle(task, cpu);
@@ -2343,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long _do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
@@ -2365,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
@@ -2374,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
@@ -2391,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
+ put_user(nr, args->parent_tid);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
@@ -2423,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
int __user *parent_tidptr,
int __user *child_tidptr)
{
- return _do_fork(clone_flags, stack_start, stack_size,
- parent_tidptr, child_tidptr, 0);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = stack_start,
+ .stack_size = stack_size,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2433,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ };
+
+ return _do_fork(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2452,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VFORK | CLONE_VM,
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2481,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls,
+ };
+
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+ return -EINVAL;
+
+ return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+ struct clone_args __user *uargs,
+ size_t size)
+{
+ struct clone_args args;
+
+ if (unlikely(size > PAGE_SIZE))
+ return -E2BIG;
+
+ if (unlikely(size < sizeof(struct clone_args)))
+ return -EINVAL;
+
+ if (unlikely(!access_ok(uargs, size)))
+ return -EFAULT;
+
+ if (size > sizeof(struct clone_args)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uargs + sizeof(struct clone_args);
+ end = (void __user *)uargs + size;
+
+ for (; addr < end; addr++) {
+ if (get_user(val, addr))
+ return -EFAULT;
+ if (val)
+ return -E2BIG;
+ }
+
+ size = sizeof(struct clone_args);
+ }
+
+ if (copy_from_user(&args, uargs, size))
+ return -EFAULT;
+
+ *kargs = (struct kernel_clone_args){
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ };
+
+ return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+ /*
+ * All lower bits of the flag word are taken.
+ * Verify that no other unknown flags are passed along.
+ */
+ if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ return false;
+
+ /*
+ * - make the CLONE_DETACHED bit reuseable for clone3
+ * - make the CSIGNAL bits reuseable for clone3
+ */
+ if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ return false;
+
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+ kargs->exit_signal)
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+ int err;
+
+ struct kernel_clone_args kargs;
+
+ err = copy_clone_args_from_user(&kargs, uargs, size);
+ if (err)
+ return err;
+
+ if (!clone3_args_valid(&kargs))
+ return -EINVAL;
+
+ return _do_fork(&kargs);
}
#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d9ae5ea6caf..34b76895b81e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -137,6 +137,8 @@ COND_SYSCALL(capset);
/* kernel/exit.c */
/* kernel/fork.c */
+/* __ARCH_WANT_SYS_CLONE3 */
+COND_SYSCALL(clone3);
/* kernel/futex.c */
COND_SYSCALL(futex);