From 2b75869bba676c248d8d25ae6d2bd9221dfffdb6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 13 Oct 2014 16:41:28 +1100 Subject: sysfs/kernfs: allow attributes to request write buffer be pre-allocated. md/raid allows metadata management to be performed in user-space. A various times, particularly on device failure, the metadata needs to be updated before further writes can be permitted. This means that the user-space program which updates metadata much not block on writeout, and so must not allocate memory. mlockall(MCL_CURRENT|MCL_FUTURE) and pre-allocation can avoid all memory allocation issues for user-memory, but that does not help kernel memory. Several kernel objects can be pre-allocated. e.g. files opened before any writes to the array are permitted. However some kernel allocation happens in places that cannot be pre-allocated. In particular, writes to sysfs files (to tell md that it can now allow writes to the array) allocate a buffer using GFP_KERNEL. This patch allows attributes to be marked as "PREALLOC". In that case the maximal buffer is allocated when the file is opened, and then used on each write instead of allocating a new buffer. As the same buffer is now shared for all writes on the same file description, the mutex is extended to cover full use of the buffer including the copy_from_user(). The new __ATTR_PREALLOC() 'or's a new flag in to the 'mode', which is inspected by sysfs_add_file_mode_ns() to determine if the file should be marked as requiring prealloc. Despite the comment, we *do* use ->seq_show together with ->prealloc in this patch. The next patch fixes that. Signed-off-by: NeilBrown Reviewed-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 4429d6d9217f..70186e2e692a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -106,7 +106,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) const struct kernfs_ops *ops; /* - * @of->mutex nests outside active ref and is just to ensure that + * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); @@ -194,7 +194,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, return -ENOMEM; /* - * @of->mutex nests outside active ref and is just to ensure that + * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); @@ -278,19 +278,16 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, len = min_t(size_t, count, PAGE_SIZE); } - buf = kmalloc(len + 1, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -299,19 +296,27 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_unlock; + } + buf[len] = '\0'; /* guarantee string termination */ + ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len > 0) *ppos += len; + +out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -685,6 +690,14 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) */ of->atomic_write_len = ops->atomic_write_len; + if (ops->prealloc) { + int len = of->atomic_write_len ?: PAGE_SIZE; + of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); + error = -ENOMEM; + if (!of->prealloc_buf) + goto err_free; + } + /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access @@ -715,6 +728,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) err_close: seq_release(inode, file); err_free: + kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); @@ -728,6 +742,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) kernfs_put_open_node(kn, of); seq_release(inode, filp); + kfree(of->prealloc_buf); kfree(of); return 0; -- cgit v1.2.3-58-ga151 From 4ef67a8c95f32ed0c8c6ed5fe01d1dd16358350e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 14 Oct 2014 16:57:26 +1100 Subject: sysfs/kernfs: make read requests on pre-alloc files use the buffer. To match the previous patch which used the pre-alloc buffer for writes, this patch causes reads to use the same buffer. This is not strictly necessary as the current seq_read() will allocate on first read, so user-space can trigger the required pre-alloc. But consistency is valuable. The read function is somewhat simpler than seq_read() and, for example, does not support reading from an offset into the file: reads must be at the start of the file. As seq_read() does not use the prealloc buffer, ->seq_show is incompatible with ->prealloc and caused an EINVAL return from open(). sysfs code which calls into kernfs always chooses the correct function. As the buffer is shared with writes and other reads, the mutex is extended to cover the copy_to_user. Signed-off-by: NeilBrown Reviewed-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 30 +++++++++++++++++++++--------- fs/sysfs/file.c | 32 ++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 13 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 70186e2e692a..697390ea47b8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -189,13 +189,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, const struct kernfs_ops *ops; char *buf; - buf = kmalloc(len, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* - * @of->mutex nests outside active ref and is primarily to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -210,21 +213,22 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len < 0) - goto out_free; + goto out_unlock; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_free; + goto out_unlock; } *ppos += len; + out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -690,6 +694,14 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) */ of->atomic_write_len = ops->atomic_write_len; + error = -EINVAL; + /* + * ->seq_show is incompatible with ->prealloc, + * as seq_read does its own allocation. + * ->read must be used instead. + */ + if (ops->prealloc && ops->seq_show) + goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 4ad3721a991c..dfe928a9540f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -102,6 +102,22 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, return battr->read(of->file, kobj, battr, buf, pos, count); } +/* kernfs read callback for regular sysfs files with pre-alloc */ +static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; + + /* + * If buf != of->prealloc_buf, we don't know how + * large it is, so cannot safely pass it to ->show + */ + if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + return 0; + return ops->show(kobj, of->kn->priv, buf); +} + /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) @@ -184,13 +200,18 @@ static const struct kernfs_ops sysfs_file_kfops_rw = { .write = sysfs_kf_write, }; +static const struct kernfs_ops sysfs_prealloc_kfops_ro = { + .read = sysfs_kf_read, + .prealloc = true, +}; + static const struct kernfs_ops sysfs_prealloc_kfops_wo = { .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_rw = { - .seq_show = sysfs_kf_seq_show, + .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true, }; @@ -238,9 +259,12 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, ops = &sysfs_prealloc_kfops_rw; else ops = &sysfs_file_kfops_rw; - } else if (sysfs_ops->show) - ops = &sysfs_file_kfops_ro; - else if (sysfs_ops->store) { + } else if (sysfs_ops->show) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_ro; + else + ops = &sysfs_file_kfops_ro; + } else if (sysfs_ops->store) { if (mode & SYSFS_PREALLOC) ops = &sysfs_prealloc_kfops_wo; else -- cgit v1.2.3-58-ga151