From 12b9d301ff73122aebd78548fa4c04ca69ed78fe Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 29 Sep 2022 12:29:33 +0800 Subject: proc/vmcore: fix potential memory leak in vmcore_init() Patch series "Some minor cleanup patches resent". The first three patches trivial clean up patches. And for the patch "kexec: replace crash_mem_range with range", I got a ibm-p9wr ppc64le system to test, it works well. This patch (of 4): elfcorehdr_alloc() allocates a memory chunk for elfcorehdr_addr with kzalloc(). If is_vmcore_usable() returns false, elfcorehdr_addr is a predefined value. If parse_crash_elf_headers() gets some error and returns a negetive value, the elfcorehdr_addr should be released with elfcorehdr_free(). Fix it by calling elfcorehdr_free() when parse_crash_elf_headers() fails. Link: https://lkml.kernel.org/r/20220929042936.22012-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220929042936.22012-2-bhe@redhat.com Signed-off-by: Jianglei Nie Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Benjamin Herrenschmidt Cc: Chen Lifu Cc: "Eric W . Biederman" Cc: Li Chen Cc: Michael Ellerman Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: ye xingchen Cc: Zeal Robot Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..74747571d58e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } -- cgit v1.2.3-58-ga151 From f1f1f2569901ec5b9d425f2e91c09a0e320768f3 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Thu, 22 Sep 2022 15:40:26 -0700 Subject: proc: report open files as size in stat() for /proc/pid/fd Many monitoring tools include open file count as a metric. Currently the only way to get this number is to enumerate the files in /proc/pid/fd. The problem with the current approach is that it does many things people generally don't care about when they need one number for a metric. In our tests for cadvisor, which reports open file counts per cgroup, we observed that reading the number of open files is slow. Out of 35.23% of CPU time spent in `proc_readfd_common`, we see 29.43% spent in `proc_fill_cache`, which is responsible for filling dentry info. Some of this extra time is spinlock contention, but it's a contention for the lock we don't want to take to begin with. We considered putting the number of open files in /proc/pid/status. Unfortunately, counting the number of fds involves iterating the open_files bitmap, which has a linear complexity in proportion with the number of open files (bitmap slots really, but it's close). We don't want to make /proc/pid/status any slower, so instead we put this info in /proc/pid/fd as a size member of the stat syscall result. Previously the reported number was zero, so there's very little risk of breaking anything, while still providing a somewhat logical way to count the open files with a fallback if it's zero. RFC for this patch included iterating open fds under RCU. Thanks to Frank Hofmann for the suggestion to use the bitmap instead. Previously: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 0 Blocks: 0 IO Block: 1024 directory ``` With this patch: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 65 Blocks: 0 IO Block: 1024 directory ``` Correctness check: ``` $ sudo ls /proc/1/fd | wc -l 65 ``` I added the docs for /proc//fd while I'm at it. [ivan@cloudflare.com: use bitmap_weight() to count the bits] Link: https://lkml.kernel.org/r/20221018045844.37697-1-ivan@cloudflare.com [akpm@linux-foundation.org: include linux/bitmap.h for bitmap_weight()] [ivan@cloudflare.com: return errno from proc_fd_getattr() instead of setting negative size] Link: https://lkml.kernel.org/r/20221024173140.30673-1-ivan@cloudflare.com Link: https://lkml.kernel.org/r/20220922224027.59266-1-ivan@cloudflare.com Signed-off-by: Ivan Babrou Cc: Alexey Dobriyan Cc: Al Viro Cc: Christoph Anton Mitterer Cc: David Hildenbrand Cc: David Laight Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 17 ++++++++++++++ fs/proc/fd.c | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'fs/proc') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 898c99eae8e4..ec6cfdf1796a 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -47,6 +47,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 3.10 /proc//timerslack_ns - Task timerslack value 3.11 /proc//patch_state - Livepatch patch operation state 3.12 /proc//arch_status - Task architecture specific information + 3.13 /proc//fd - List of symlinks to open files 4 Configuring procfs 4.1 Mount options @@ -2149,6 +2150,22 @@ AVX512_elapsed_ms the task is unlikely an AVX512 user, but depends on the workload and the scheduling scenario, it also could be a false negative mentioned above. +3.13 /proc//fd - List of symlinks to open files +------------------------------------------------------- +This directory contains symbolic links which represent open files +the process is maintaining. Example output:: + + lr-x------ 1 root root 64 Sep 20 17:53 0 -> /dev/null + l-wx------ 1 root root 64 Sep 20 17:53 1 -> /dev/null + lrwx------ 1 root root 64 Sep 20 17:53 10 -> 'socket:[12539]' + lrwx------ 1 root root 64 Sep 20 17:53 11 -> 'socket:[12540]' + lrwx------ 1 root root 64 Sep 20 17:53 12 -> 'socket:[12542]' + +The number of open files for the process is stored in 'size' member +of stat() output for /proc//fd for fast access. +------------------------------------------------------- + + Chapter 4: Configuring procfs ============================= diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; -- cgit v1.2.3-58-ga151 From 941baf6febaa88c057192084ca280d976c7c7239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 8 Sep 2022 21:21:54 +0300 Subject: proc: give /proc/cmdline size Most /proc files don't have length (in fstat sense). This leads to inefficiencies when reading such files with APIs commonly found in modern programming languages. They open file, then fstat descriptor, get st_size == 0 and either assume file is empty or start reading without knowing target size. cat(1) does OK because it uses large enough buffer by default. But naive programs copy-pasted from SO aren't: let mut f = std::fs::File::open("/proc/cmdline").unwrap(); let mut buf: Vec = Vec::new(); f.read_to_end(&mut buf).unwrap(); will result in openat(AT_FDCWD, "/proc/cmdline", O_RDONLY|O_CLOEXEC) = 3 statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address) statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_BASIC_STATS|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0444, stx_size=0, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "BOOT_IMAGE=(hd3,gpt2)/vmlinuz-5.", 32) = 32 read(3, "19.6-100.fc35.x86_64 root=/dev/m", 32) = 32 read(3, "apper/fedora_localhost--live-roo"..., 64) = 64 read(3, "ocalhost--live-swap rd.lvm.lv=fe"..., 128) = 116 read(3, "", 12) open/stat is OK, lseek looks silly but there are 3 unnecessary reads because Rust starts with 32 bytes per Vec and grows from there. In case of /proc/cmdline, the length is known precisely. Make variables readonly while I'm at it. P.S.: I tried to scp /proc/cpuinfo today and got empty file but this is separate story. Link: https://lkml.kernel.org/r/YxoywlbM73JJN3r+@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/cmdline.c | 6 +++++- include/linux/init.h | 1 + init/main.c | 7 +++++-- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); diff --git a/include/linux/init.h b/include/linux/init.h index 077d7f93b402..2e96756fe1ff 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -143,6 +143,7 @@ struct file_system_type; extern int do_one_initcall(initcall_t fn); extern char __initdata boot_command_line[]; extern char *saved_command_line; +extern unsigned int saved_command_line_len; extern unsigned int reset_devices; /* used by init/main.c */ diff --git a/init/main.c b/init/main.c index aa21add5f7c5..d213371cc067 100644 --- a/init/main.c +++ b/init/main.c @@ -145,7 +145,8 @@ void (*__initdata late_time_init)(void); /* Untouched command line saved by arch-specific code. */ char __initdata boot_command_line[COMMAND_LINE_SIZE]; /* Untouched saved command line (eg. for /proc) */ -char *saved_command_line; +char *saved_command_line __ro_after_init; +unsigned int saved_command_line_len __ro_after_init; /* Command line for parameter parsing */ static char *static_command_line; /* Untouched extra command line */ @@ -667,6 +668,8 @@ static void __init setup_command_line(char *command_line) strcpy(saved_command_line + len, extra_init_args); } } + + saved_command_line_len = strlen(saved_command_line); } /* @@ -1379,7 +1382,7 @@ static void __init do_initcall_level(int level, char *command_line) static void __init do_initcalls(void) { int level; - size_t len = strlen(saved_command_line) + 1; + size_t len = saved_command_line_len + 1; char *command_line; command_line = kzalloc(len, GFP_KERNEL); -- cgit v1.2.3-58-ga151