diff options
-rw-r--r-- | fs/binfmt_elf.c | 215 | ||||
-rw-r--r-- | fs/binfmt_elf_fdpic.c | 20 | ||||
-rw-r--r-- | fs/binfmt_misc.c | 386 | ||||
-rw-r--r-- | include/linux/binfmts.h | 10 | ||||
-rw-r--r-- | include/linux/mm.h | 3 | ||||
-rw-r--r-- | include/linux/user_namespace.h | 8 | ||||
-rw-r--r-- | include/uapi/linux/elf.h | 2 | ||||
-rw-r--r-- | kernel/user.c | 13 | ||||
-rw-r--r-- | kernel/user_namespace.c | 3 | ||||
-rw-r--r-- | mm/mmap.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 5 |
11 files changed, 443 insertions, 228 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7b3d2d491407..5397b552fbeb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) -static int set_brk(unsigned long start, unsigned long end, int prot) -{ - start = ELF_PAGEALIGN(start); - end = ELF_PAGEALIGN(end); - if (end > start) { - /* - * Map the last of the bss segment. - * If the header is requesting these pages to be - * executable, honour that (ppc32 needs this). - */ - int error = vm_brk_flags(start, end - start, - prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - return error; - } - current->mm->start_brk = current->mm->brk = end; - return 0; -} - -/* We need to explicitly zero any fractional pages - after the data section (i.e. bss). This would - contain the junk from the file that should not - be in memory +/* + * We need to explicitly zero any trailing portion of the page that follows + * p_filesz when it ends before the page ends (e.g. bss), otherwise this + * memory will contain the junk from the file that should not be present. */ -static int padzero(unsigned long elf_bss) +static int padzero(unsigned long address) { unsigned long nbyte; - nbyte = ELF_PAGEOFFSET(elf_bss); + nbyte = ELF_PAGEOFFSET(address); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; - if (clear_user((void __user *) elf_bss, nbyte)) + if (clear_user((void __user *)address, nbyte)) return -EFAULT; } return 0; @@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, return 0; } +/* + * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" + * into memory at "addr". (Note that p_filesz is rounded up to the + * next page, so any extra bytes from the file must be wiped.) + */ static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +/* + * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" + * into memory at "addr". Memory from "p_filesz" through "p_memsz" + * rounded up to the next page is zeroed. + */ +static unsigned long elf_load(struct file *filep, unsigned long addr, + const struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) +{ + unsigned long zero_start, zero_end; + unsigned long map_addr; + + if (eppnt->p_filesz) { + map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); + if (BAD_ADDR(map_addr)) + return map_addr; + if (eppnt->p_memsz > eppnt->p_filesz) { + zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_filesz; + zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + + /* + * Zero the end of the last mapped page but ignore + * any errors if the segment isn't writable. + */ + if (padzero(zero_start) && (prot & PROT_WRITE)) + return -EFAULT; + } + } else { + map_addr = zero_start = ELF_PAGESTART(addr); + zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + } + if (eppnt->p_memsz > eppnt->p_filesz) { + /* + * Map the last of the segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error; + + zero_start = ELF_PAGEALIGN(zero_start); + zero_end = ELF_PAGEALIGN(zero_end); + + error = vm_brk_flags(zero_start, zero_end - zero_start, + prot & PROT_EXEC ? VM_EXEC : 0); + if (error) + map_addr = error; + } + return map_addr; +} + + static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; @@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; - unsigned long last_bss = 0, elf_bss = 0; - int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; - map_addr = elf_map(interpreter, load_addr + vaddr, + map_addr = elf_load(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size); total_size = 0; error = map_addr; @@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, error = -ENOMEM; goto out; } - - /* - * Find the end of the file mapping for this phdr, and - * keep track of the largest address we see for this. - */ - k = load_addr + eppnt->p_vaddr + eppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; - - /* - * Do the same thing for the memory mapping - between - * elf_bss and last_bss is the bss section. - */ - k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) { - last_bss = k; - bss_prot = elf_prot; - } } } - /* - * Now fill out the bss section: first pad the last page from - * the file up to the page boundary, and zero it from elf_bss - * up to the end of the page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out; - } - /* - * Next, align both the file and mem bss up to the page size, - * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk_flags() below. - */ - elf_bss = ELF_PAGEALIGN(elf_bss); - last_bss = ELF_PAGEALIGN(last_bss); - /* Finally, if there is still more bss to allocate, do it. */ - if (last_bss > elf_bss) { - error = vm_brk_flags(elf_bss, last_bss - elf_bss, - bss_prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - goto out; - } - error = load_addr; out: return error; @@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; - unsigned long elf_bss, elf_brk; - int bss_prot = 0; + unsigned long elf_brk; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1020,7 +1015,6 @@ out_free_interp: if (retval < 0) goto out_free_dentry; - elf_bss = 0; elf_brk = 0; start_code = ~0UL; @@ -1040,33 +1034,6 @@ out_free_interp: if (elf_ppnt->p_type != PT_LOAD) continue; - if (unlikely (elf_brk > elf_bss)) { - unsigned long nbyte; - - /* There was a PT_LOAD segment with p_memsz > p_filesz - before this one. Map anonymous pages, if needed, - and clear the area. */ - retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias, - bss_prot); - if (retval) - goto out_free_dentry; - nbyte = ELF_PAGEOFFSET(elf_bss); - if (nbyte) { - nbyte = ELF_MIN_ALIGN - nbyte; - if (nbyte > elf_brk - elf_bss) - nbyte = elf_brk - elf_bss; - if (clear_user((void __user *)elf_bss + - load_bias, nbyte)) { - /* - * This bss-zeroing can fail if the ELF - * file specifies odd protections. So - * we don't check the return value - */ - } - } - } - elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); @@ -1162,7 +1129,7 @@ out_free_interp: } } - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR_VALUE(error) ? @@ -1210,40 +1177,24 @@ out_free_interp: k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) { - bss_prot = elf_prot; + if (k > elf_brk) elf_brk = k; - } } e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; - elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; - /* Calling set_brk effectively mmaps the pages that we need - * for the bss and break sections. We must do this before - * mapping in the interpreter, to make sure it doesn't wind - * up getting placed where the bss needs to go. - */ - retval = set_brk(elf_bss, elf_brk, bss_prot); - if (retval) - goto out_free_dentry; - if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { - retval = -EFAULT; /* Nobody gets to see this, but.. */ - goto out_free_dentry; - } + current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, @@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file) { struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; - unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; @@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file) eppnt++; /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, - ELF_PAGESTART(eppnt->p_vaddr), - (eppnt->p_filesz + - ELF_PAGEOFFSET(eppnt->p_vaddr)), + error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr), + eppnt, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED_NOREPLACE | MAP_PRIVATE, - (eppnt->p_offset - - ELF_PAGEOFFSET(eppnt->p_vaddr))); + 0); + if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; - elf_bss = eppnt->p_vaddr + eppnt->p_filesz; - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_free_ph; - } - - len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr); - bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr); - if (bss > len) { - error = vm_brk(len, bss - len); - if (error) - goto out_free_ph; - } error = 0; out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 206812ce544a..fefc642541cb 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -899,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); seg = loadmap->segs; for (loop = 0; loop < loadmap->nsegs; loop++, seg++) - kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", + kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]", loop, - seg->addr, seg->addr + seg->p_memsz - 1, - seg->p_vaddr, seg->p_memsz); + (unsigned long long) seg->addr, + (unsigned long long) seg->addr + seg->p_memsz - 1, + (unsigned long long) seg->p_vaddr, + (unsigned long long) seg->p_memsz); return 0; @@ -1081,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp); - kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", - loop, phdr->p_memsz + disp, prot, flags, - phdr->p_offset - disp, maddr); + kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx", + loop, (unsigned long long) phdr->p_memsz + disp, + prot, flags, (unsigned long long) phdr->p_offset - disp, + maddr); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1145,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, #else if (excess > 0) { - kdebug("clear[%d] ad=%lx sz=%lx", - loop, maddr + phdr->p_filesz, excess); + kdebug("clear[%d] ad=%llx sz=%lx", loop, + (unsigned long long) maddr + phdr->p_filesz, + excess); if (clear_user((void *) maddr + phdr->p_filesz, excess)) return -EFAULT; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 5d2be9b0a0a5..68fa225f89e5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -40,9 +40,6 @@ enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; -static LIST_HEAD(entries); -static int enabled = 1; - enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) #define MISC_FMT_OPEN_BINARY (1UL << 30) @@ -60,12 +57,10 @@ typedef struct { char *name; struct dentry *dentry; struct file *interp_file; + refcount_t users; /* sync removal with load_misc_binary() */ } Node; -static DEFINE_RWLOCK(entries_lock); static struct file_system_type bm_fs_type; -static struct vfsmount *bm_mnt; -static int entry_count; /* * Max length of the register string. Determined by: @@ -82,19 +77,24 @@ static int entry_count; */ #define MAX_REGISTER_LENGTH 1920 -/* - * Check if we support the binfmt - * if we do, return the node, else NULL - * locking is done in load_misc_binary +/** + * search_binfmt_handler - search for a binary handler for @bprm + * @misc: handle to binfmt_misc instance + * @bprm: binary for which we are looking for a handler + * + * Search for a binary type handler for @bprm in the list of registered binary + * type handlers. + * + * Return: binary type list entry on success, NULL on failure */ -static Node *check_file(struct linux_binprm *bprm) +static Node *search_binfmt_handler(struct binfmt_misc *misc, + struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); - struct list_head *l; + Node *e; /* Walk all the registered handlers. */ - list_for_each(l, &entries) { - Node *e = list_entry(l, Node, list); + list_for_each_entry(e, &misc->entries, list) { char *s; int j; @@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm) if (j == e->size) return e; } + return NULL; } +/** + * get_binfmt_handler - try to find a binary type handler + * @misc: handle to binfmt_misc instance + * @bprm: binary for which we are looking for a handler + * + * Try to find a binfmt handler for the binary type. If one is found take a + * reference to protect against removal via bm_{entry,status}_write(). + * + * Return: binary type list entry on success, NULL on failure + */ +static Node *get_binfmt_handler(struct binfmt_misc *misc, + struct linux_binprm *bprm) +{ + Node *e; + + read_lock(&misc->entries_lock); + e = search_binfmt_handler(misc, bprm); + if (e) + refcount_inc(&e->users); + read_unlock(&misc->entries_lock); + return e; +} + +/** + * put_binfmt_handler - put binary handler node + * @e: node to put + * + * Free node syncing with load_misc_binary() and defer final free to + * load_misc_binary() in case it is using the binary type handler we were + * requested to remove. + */ +static void put_binfmt_handler(Node *e) +{ + if (refcount_dec_and_test(&e->users)) { + if (e->flags & MISC_FMT_OPEN_FILE) + filp_close(e->interp_file, NULL); + kfree(e); + } +} + +/** + * load_binfmt_misc - load the binfmt_misc of the caller's user namespace + * + * To be called in load_misc_binary() to load the relevant struct binfmt_misc. + * If a user namespace doesn't have its own binfmt_misc mount it can make use + * of its ancestor's binfmt_misc handlers. This mimicks the behavior of + * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where + * available to all user and user namespaces on the system. + * + * Return: the binfmt_misc instance of the caller's user namespace + */ +static struct binfmt_misc *load_binfmt_misc(void) +{ + const struct user_namespace *user_ns; + struct binfmt_misc *misc; + + user_ns = current_user_ns(); + while (user_ns) { + /* Pairs with smp_store_release() in bm_fill_super(). */ + misc = smp_load_acquire(&user_ns->binfmt_misc); + if (misc) + return misc; + + user_ns = user_ns->parent; + } + + return &init_binfmt_misc; +} + /* * the loader itself */ @@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - int retval; + int retval = -ENOEXEC; + struct binfmt_misc *misc; - retval = -ENOEXEC; - if (!enabled) + misc = load_binfmt_misc(); + if (!misc->enabled) return retval; - /* to keep locking time low, we copy the interpreter string */ - read_lock(&entries_lock); - fmt = check_file(bprm); - if (fmt) - dget(fmt->dentry); - read_unlock(&entries_lock); + fmt = get_binfmt_handler(misc, bprm); if (!fmt) return retval; @@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = 0; ret: - dput(fmt->dentry); + + /* + * If we actually put the node here all concurrent calls to + * load_misc_binary() will have finished. We also know + * that for the refcount to be zero someone must have concurently + * removed the binary type handler from the list and it's our job to + * free it. + */ + put_binfmt_handler(fmt); + return retval; } @@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count) err = -ENOMEM; memsize = sizeof(Node) + count + 8; - e = kmalloc(memsize, GFP_KERNEL); + e = kmalloc(memsize, GFP_KERNEL_ACCOUNT); if (!e) goto out; @@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask) { int i; - char *masked = kmalloc(e->size, GFP_KERNEL); + char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", @@ -552,30 +627,109 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) return inode; } +/** + * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode + * @inode: inode of the relevant binfmt_misc instance + * + * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can + * be done without any memory barriers because we are guaranteed that + * user_ns->binfmt_misc is fully initialized. It was fully initialized when the + * binfmt_misc mount was first created. + * + * Return: struct binfmt_misc of the relevant binfmt_misc instance + */ +static struct binfmt_misc *i_binfmt_misc(struct inode *inode) +{ + return inode->i_sb->s_user_ns->binfmt_misc; +} + +/** + * bm_evict_inode - cleanup data associated with @inode + * @inode: inode to which the data is attached + * + * Cleanup the binary type handler data associated with @inode if a binary type + * entry is removed or the filesystem is unmounted and the super block is + * shutdown. + * + * If the ->evict call was not caused by a super block shutdown but by a write + * to remove the entry or all entries via bm_{entry,status}_write() the entry + * will have already been removed from the list. We keep the list_empty() check + * to make that explicit. +*/ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if (e && e->flags & MISC_FMT_OPEN_FILE) - filp_close(e->interp_file, NULL); - clear_inode(inode); - kfree(e); + + if (e) { + struct binfmt_misc *misc; + + misc = i_binfmt_misc(inode); + write_lock(&misc->entries_lock); + if (!list_empty(&e->list)) + list_del_init(&e->list); + write_unlock(&misc->entries_lock); + put_binfmt_handler(e); + } } -static void kill_node(Node *e) +/** + * unlink_binfmt_dentry - remove the dentry for the binary type handler + * @dentry: dentry associated with the binary type handler + * + * Do the actual filesystem work to remove a dentry for a registered binary + * type handler. Since binfmt_misc only allows simple files to be created + * directly under the root dentry of the filesystem we ensure that we are + * indeed passed a dentry directly beneath the root dentry, that the inode + * associated with the root dentry is locked, and that it is a regular file we + * are asked to remove. + */ +static void unlink_binfmt_dentry(struct dentry *dentry) { - struct dentry *dentry; + struct dentry *parent = dentry->d_parent; + struct inode *inode, *parent_inode; - write_lock(&entries_lock); - list_del_init(&e->list); - write_unlock(&entries_lock); + /* All entries are immediate descendants of the root dentry. */ + if (WARN_ON_ONCE(dentry->d_sb->s_root != parent)) + return; - dentry = e->dentry; - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); + /* We only expect to be called on regular files. */ + inode = d_inode(dentry); + if (WARN_ON_ONCE(!S_ISREG(inode->i_mode))) + return; + + /* The parent inode must be locked. */ + parent_inode = d_inode(parent); + if (WARN_ON_ONCE(!inode_is_locked(parent_inode))) + return; + + if (simple_positive(dentry)) { + dget(dentry); + simple_unlink(parent_inode, dentry); + d_delete(dentry); + dput(dentry); + } +} + +/** + * remove_binfmt_handler - remove a binary type handler + * @misc: handle to binfmt_misc instance + * @e: binary type handler to remove + * + * Remove a binary type handler from the list of binary type handlers and + * remove its associated dentry. This is called from + * binfmt_{entry,status}_write(). In the future, we might want to think about + * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's + * to use writes to files in order to delete binary type handlers. But it has + * worked for so long that it's not a pressing issue. + */ +static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) +{ + write_lock(&misc->entries_lock); + list_del_init(&e->list); + write_unlock(&misc->entries_lock); + unlink_binfmt_dentry(e->dentry); } /* /<entry> */ @@ -602,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct dentry *root; - Node *e = file_inode(file)->i_private; + struct inode *inode = file_inode(file); + Node *e = inode->i_private; int res = parse_command(buffer, count); switch (res) { @@ -617,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 3: /* Delete this handler. */ - root = file_inode(file)->i_sb->s_root; - inode_lock(d_inode(root)); + inode = d_inode(inode->i_sb->s_root); + inode_lock(inode); + /* + * In order to add new element or remove elements from the list + * via bm_{entry,register,status}_write() inode_lock() on the + * root inode must be held. + * The lock is exclusive ensuring that the list can't be + * modified. Only load_misc_binary() can access but does so + * read-only. So we only need to take the write lock when we + * actually remove the entry from the list. + */ if (!list_empty(&e->list)) - kill_node(e); + remove_binfmt_handler(i_binfmt_misc(inode), e); - inode_unlock(d_inode(root)); + inode_unlock(inode); break; default: return res; @@ -647,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, struct inode *inode; struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; + struct binfmt_misc *misc; int err = 0; struct file *f = NULL; @@ -656,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { + const struct cred *old_cred; + + /* + * Now that we support unprivileged binfmt_misc mounts make + * sure we use the credentials that the register @file was + * opened with to also open the interpreter. Before that this + * didn't matter much as only a privileged process could open + * the register file. + */ + old_cred = override_creds(file->f_cred); f = open_exec(e->interpreter); + revert_creds(old_cred); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); @@ -682,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, if (!inode) goto out2; - err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); - if (err) { - iput(inode); - inode = NULL; - goto out2; - } - + refcount_set(&e->users, 1); e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; d_instantiate(dentry, inode); - write_lock(&entries_lock); - list_add(&e->list, &entries); - write_unlock(&entries_lock); + misc = i_binfmt_misc(inode); + write_lock(&misc->entries_lock); + list_add(&e->list, &misc->entries); + write_unlock(&misc->entries_lock); err = 0; out2: @@ -723,35 +893,50 @@ static const struct file_operations bm_register_operations = { static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled\n" : "disabled\n"; + struct binfmt_misc *misc; + char *s; + misc = i_binfmt_misc(file_inode(file)); + s = misc->enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { + struct binfmt_misc *misc; int res = parse_command(buffer, count); - struct dentry *root; + Node *e, *next; + struct inode *inode; + misc = i_binfmt_misc(file_inode(file)); switch (res) { case 1: /* Disable all handlers. */ - enabled = 0; + misc->enabled = false; break; case 2: /* Enable all handlers. */ - enabled = 1; + misc->enabled = true; break; case 3: /* Delete all handlers. */ - root = file_inode(file)->i_sb->s_root; - inode_lock(d_inode(root)); + inode = d_inode(file_inode(file)->i_sb->s_root); + inode_lock(inode); - while (!list_empty(&entries)) - kill_node(list_first_entry(&entries, Node, list)); + /* + * In order to add new element or remove elements from the list + * via bm_{entry,register,status}_write() inode_lock() on the + * root inode must be held. + * The lock is exclusive ensuring that the list can't be + * modified. Only load_misc_binary() can access but does so + * read-only. So we only need to take the write lock when we + * actually remove the entry from the list. + */ + list_for_each_entry_safe(e, next, &misc->entries, list) + remove_binfmt_handler(misc, e); - inode_unlock(d_inode(root)); + inode_unlock(inode); break; default: return res; @@ -768,32 +953,100 @@ static const struct file_operations bm_status_operations = { /* Superblock handling */ +static void bm_put_super(struct super_block *sb) +{ + struct user_namespace *user_ns = sb->s_fs_info; + + sb->s_fs_info = NULL; + put_user_ns(user_ns); +} + static const struct super_operations s_ops = { .statfs = simple_statfs, .evict_inode = bm_evict_inode, + .put_super = bm_put_super, }; static int bm_fill_super(struct super_block *sb, struct fs_context *fc) { int err; + struct user_namespace *user_ns = sb->s_user_ns; + struct binfmt_misc *misc; static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; + if (WARN_ON(user_ns != current_user_ns())) + return -EINVAL; + + /* + * Lazily allocate a new binfmt_misc instance for this namespace, i.e. + * do it here during the first mount of binfmt_misc. We don't need to + * waste memory for every user namespace allocation. It's likely much + * more common to not mount a separate binfmt_misc instance than it is + * to mount one. + * + * While multiple superblocks can exist they are keyed by userns in + * s_fs_info for binfmt_misc. Hence, the vfs guarantees that + * bm_fill_super() is called exactly once whenever a binfmt_misc + * superblock for a userns is created. This in turn lets us conclude + * that when a binfmt_misc superblock is created for the first time for + * a userns there's no one racing us. Therefore we don't need any + * barriers when we dereference binfmt_misc. + */ + misc = user_ns->binfmt_misc; + if (!misc) { + /* + * If it turns out that most user namespaces actually want to + * register their own binary type handler and therefore all + * create their own separate binfm_misc mounts we should + * consider turning this into a kmem cache. + */ + misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); + if (!misc) + return -ENOMEM; + + INIT_LIST_HEAD(&misc->entries); + rwlock_init(&misc->entries_lock); + + /* Pairs with smp_load_acquire() in load_binfmt_misc(). */ + smp_store_release(&user_ns->binfmt_misc, misc); + } + + /* + * When the binfmt_misc superblock for this userns is shutdown + * ->enabled might have been set to false and we don't reinitialize + * ->enabled again in put_super() as someone might already be mounting + * binfmt_misc again. It also would be pointless since by the time + * ->put_super() is called we know that the binary type list for this + * bintfmt_misc mount is empty making load_misc_binary() return + * -ENOEXEC independent of whether ->enabled is true. Instead, if + * someone mounts binfmt_misc for the first time or again we simply + * reset ->enabled to true. + */ + misc->enabled = true; + err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; } +static void bm_free(struct fs_context *fc) +{ + if (fc->s_fs_info) + put_user_ns(fc->s_fs_info); +} + static int bm_get_tree(struct fs_context *fc) { - return get_tree_single(fc, bm_fill_super); + return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns)); } static const struct fs_context_operations bm_context_ops = { + .free = bm_free, .get_tree = bm_get_tree, }; @@ -812,6 +1065,7 @@ static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, + .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("binfmt_misc"); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8d51f69f9f5e..70f97f685bff 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -90,6 +90,16 @@ struct linux_binfmt { #endif } __randomize_layout; +#if IS_ENABLED(CONFIG_BINFMT_MISC) +struct binfmt_misc { + struct list_head entries; + rwlock_t entries_lock; + bool enabled; +} __randomize_layout; + +extern struct binfmt_misc init_binfmt_misc; +#endif + extern void __register_binfmt(struct linux_binfmt *fmt, int insert); /* Registration of default binfmt handlers */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 19fc73b02c9f..116c28c51468 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3308,8 +3308,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif -/* These take the mm semaphore themselves */ -extern int __must_check vm_brk(unsigned long, unsigned long); +/* This takes the mm semaphore itself */ extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 45f09bec02c4..6030a8235617 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -65,6 +65,10 @@ enum rlimit_type { UCOUNT_RLIMIT_COUNTS, }; +#if IS_ENABLED(CONFIG_BINFMT_MISC) +struct binfmt_misc; +#endif + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -102,6 +106,10 @@ struct user_namespace { struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; + +#if IS_ENABLED(CONFIG_BINFMT_MISC) + struct binfmt_misc *binfmt_misc; +#endif } __randomize_layout; struct ucounts { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 9b731976ce2f..9417309b7230 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -140,7 +140,7 @@ typedef __s64 Elf64_Sxword; #define ELF64_ST_BIND(x) ELF_ST_BIND(x) #define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) -typedef struct dynamic { +typedef struct { Elf32_Sword d_tag; union { Elf32_Sword d_val; diff --git a/kernel/user.c b/kernel/user.c index d667debeafd6..03cedc366dc9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -18,8 +18,18 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> #include <linux/proc_ns.h> +#if IS_ENABLED(CONFIG_BINFMT_MISC) +struct binfmt_misc init_binfmt_misc = { + .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), + .enabled = true, + .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), +}; +EXPORT_SYMBOL_GPL(init_binfmt_misc); +#endif + /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? @@ -67,6 +77,9 @@ struct user_namespace init_user_ns = { .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif +#if IS_ENABLED(CONFIG_BINFMT_MISC) + .binfmt_misc = &init_binfmt_misc, +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d8e47bed3f1..d52a894ecf57 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } +#if IS_ENABLED(CONFIG_BINFMT_MISC) + kfree(ns->binfmt_misc); +#endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); diff --git a/mm/mmap.c b/mm/mmap.c index 9e018d8dd7d6..853489ca05ef 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3194,12 +3194,6 @@ limits_failed: } EXPORT_SYMBOL(vm_brk_flags); -int vm_brk(unsigned long addr, unsigned long len) -{ - return vm_brk_flags(addr, len, 0); -} -EXPORT_SYMBOL(vm_brk); - /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { diff --git a/mm/nommu.c b/mm/nommu.c index 7f9e9e5a0e12..23c43c208f2b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1531,11 +1531,6 @@ void exit_mmap(struct mm_struct *mm) mmap_write_unlock(mm); } -int vm_brk(unsigned long addr, unsigned long len) -{ - return -ENOMEM; -} - /* * expand (or shrink) an existing mapping, potentially moving it at the same * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |