diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-08 10:26:08 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-08 10:26:08 -0800 |
commit | c604110e662a54568073a03176402b624e740310 (patch) | |
tree | 3121f7a3e57d9cff898029245ad94048a655c792 | |
parent | 1ab33c03145d0f6c345823fc2da935d9a1a9e9fc (diff) | |
parent | dd8f87f21dc3da2eaf46e7401173f935b90b13a8 (diff) |
Merge tag 'vfs-6.8.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull misc vfs updates from Christian Brauner:
"This contains the usual miscellaneous features, cleanups, and fixes
for vfs and individual fses.
Features:
- Add Jan Kara as VFS reviewer
- Show correct device and inode numbers in proc/<pid>/maps for vma
files on stacked filesystems. This is now easily doable thanks to
the backing file work from the last cycles. This comes with
selftests
Cleanups:
- Remove a redundant might_sleep() from wait_on_inode()
- Initialize pointer with NULL, not 0
- Clarify comment on access_override_creds()
- Rework and simplify eventfd_signal() and eventfd_signal_mask()
helpers
- Process aio completions in batches to avoid needless wakeups
- Completely decouple struct mnt_idmap from namespaces. We now only
keep the actual idmapping around and don't stash references to
namespaces
- Reformat maintainer entries to indicate that a given subsystem
belongs to fs/
- Simplify fput() for files that were never opened
- Get rid of various pointless file helpers
- Rename various file helpers
- Rename struct file members after SLAB_TYPESAFE_BY_RCU switch from
last cycle
- Make relatime_need_update() return bool
- Use GFP_KERNEL instead of GFP_USER when allocating superblocks
- Replace deprecated ida_simple_*() calls with their current ida_*()
counterparts
Fixes:
- Fix comments on user namespace id mapping helpers. They aren't
kernel doc comments so they shouldn't be using /**
- s/Retuns/Returns/g in various places
- Add missing parameter documentation on can_move_mount_beneath()
- Rename i_mapping->private_data to i_mapping->i_private_data
- Fix a false-positive lockdep warning in pipe_write() for watch
queues
- Improve __fget_files_rcu() code generation to improve performance
- Only notify writer that pipe resizing has finished after setting
pipe->max_usage otherwise writers are never notified that the pipe
has been resized and hang
- Fix some kernel docs in hfsplus
- s/passs/pass/g in various places
- Fix kernel docs in ntfs
- Fix kcalloc() arguments order reported by gcc 14
- Fix uninitialized value in reiserfs"
* tag 'vfs-6.8.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (36 commits)
reiserfs: fix uninit-value in comp_keys
watch_queue: fix kcalloc() arguments order
ntfs: dir.c: fix kernel-doc function parameter warnings
fs: fix doc comment typo fs tree wide
selftests/overlayfs: verify device and inode numbers in /proc/pid/maps
fs/proc: show correct device and inode numbers in /proc/pid/maps
eventfd: Remove usage of the deprecated ida_simple_xx() API
fs: super: use GFP_KERNEL instead of GFP_USER for super block allocation
fs/hfsplus: wrapper.c: fix kernel-doc warnings
fs: add Jan Kara as reviewer
fs/inode: Make relatime_need_update return bool
pipe: wakeup wr_wait after setting max_usage
file: remove __receive_fd()
file: stop exposing receive_fd_user()
fs: replace f_rcuhead with f_task_work
file: remove pointless wrapper
file: s/close_fd_get_file()/file_close_fd()/g
Improve __fget_files_rcu() code generation (and thus __fget_light())
file: massage cleanup of files that failed to open
fs/pipe: Fix lockdep false-positive in watchqueue pipe_write()
...
83 files changed, 784 insertions, 456 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index a7c4cf8201e0..5898fcb8640b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8103,6 +8103,7 @@ F: include/trace/events/fs_dax.h FILESYSTEMS (VFS and infrastructure) M: Alexander Viro <viro@zeniv.linux.org.uk> M: Christian Brauner <brauner@kernel.org> +R: Jan Kara <jack@suse.cz> L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/* @@ -8123,6 +8124,16 @@ F: fs/exportfs/ F: fs/fhandle.c F: include/linux/exportfs.h +FILESYSTEMS [IDMAPPED MOUNTS] +M: Christian Brauner <brauner@kernel.org> +M: Seth Forshee <sforshee@kernel.org> +L: linux-fsdevel@vger.kernel.org +S: Maintained +F: Documentation/filesystems/idmappings.rst +F: fs/mnt_idmapping.c +F: include/linux/mnt_idmapping.* +F: tools/testing/selftests/mount_setattr/ + FILESYSTEMS [IOMAP] M: Christian Brauner <brauner@kernel.org> R: Darrick J. Wong <djwong@kernel.org> @@ -10202,16 +10213,6 @@ S: Maintained W: https://github.com/o2genum/ideapad-slidebar F: drivers/input/misc/ideapad_slidebar.c -IDMAPPED MOUNTS -M: Christian Brauner <brauner@kernel.org> -M: Seth Forshee <sforshee@kernel.org> -L: linux-fsdevel@vger.kernel.org -S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git -F: Documentation/filesystems/idmappings.rst -F: include/linux/mnt_idmapping.* -F: tools/testing/selftests/mount_setattr/ - IDT VersaClock 5 CLOCK DRIVER M: Luca Ceresoli <luca@lucaceresoli.net> S: Maintained diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e4..4943f6b2bbee 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h if (!eventfd) return HV_STATUS_INVALID_PORT_ID; - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index e53fad915a62..523bb6df5ac9 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) if (ret < 0 && ret != -ENOTCONN) return false; } else { - eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + eventfd_signal(evtchnfd->deliver.eventfd.ctx); } *r = 0; diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 9711e8fc979d..3a89644f087c 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2044,7 +2044,7 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 notifier_event->events_mask |= event_mask; if (notifier_event->eventfd) - eventfd_signal(notifier_event->eventfd, 1); + eventfd_signal(notifier_event->eventfd); mutex_unlock(¬ifier_event->lock); } diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 92128aae2d06..7658103ba760 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1921,7 +1921,7 @@ static void binder_deferred_fd_close(int fd) if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); - twcb->file = close_fd_get_file(fd); + twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index dd7a783d53b5..e73f88050f08 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -1872,7 +1872,7 @@ static irqreturn_t dfl_irq_handler(int irq, void *arg) { struct eventfd_ctx *trigger = arg; - eventfd_signal(trigger, 1); + eventfd_signal(trigger); return IRQ_HANDLED; } diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 01da6789d044..b9cc62982196 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -1365,7 +1365,7 @@ static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence, struct syncobj_eventfd_entry *entry = container_of(cb, struct syncobj_eventfd_entry, fence_cb); - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } @@ -1388,13 +1388,13 @@ syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, entry->fence = fence; if (entry->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) { - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } else { ret = dma_fence_add_callback(fence, &entry->fence_cb, syncobj_eventfd_entry_fence_func); if (ret == -ENOENT) { - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } } diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index de3f5903d1a7..c8e7dfc9f791 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -422,7 +422,7 @@ static void init_irq_map(struct intel_gvt_irq *irq) #define MSI_CAP_DATA(offset) (offset + 8) #define MSI_CAP_EN 0x1 -static int inject_virtual_interrupt(struct intel_vgpu *vgpu) +static void inject_virtual_interrupt(struct intel_vgpu *vgpu) { unsigned long offset = vgpu->gvt->device_info.msi_cap_offset; u16 control, data; @@ -434,10 +434,10 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu) /* Do not generate MSI if MSIEN is disabled */ if (!(control & MSI_CAP_EN)) - return 0; + return; if (WARN(control & GENMASK(15, 1), "only support one MSI format\n")) - return -EINVAL; + return; trace_inject_msi(vgpu->id, addr, data); @@ -451,10 +451,9 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu) * returned and don't inject interrupt into guest. */ if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) - return -ESRCH; - if (vgpu->msi_trigger && eventfd_signal(vgpu->msi_trigger, 1) != 1) - return -EFAULT; - return 0; + return; + if (vgpu->msi_trigger) + eventfd_signal(vgpu->msi_trigger); } static void propagate_event(struct intel_gvt_irq *irq, diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8ba53edf2311..869369cb5b5f 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2498,7 +2498,7 @@ static void dispatch_event_fd(struct list_head *fd_list, list_for_each_entry_rcu(item, fd_list, xa_list) { if (item->eventfd) - eventfd_signal(item->eventfd, 1); + eventfd_signal(item->eventfd); else deliver_event(item, data); } diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index ac69b7f361f5..7eb74711ac96 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -184,7 +184,7 @@ static irqreturn_t irq_handler(void *private) { struct eventfd_ctx *ev_ctx = private; - eventfd_signal(ev_ctx, 1); + eventfd_signal(ev_ctx); return IRQ_HANDLED; } diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index d3f3a611f95b..38c176cf6295 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -115,7 +115,7 @@ static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, /* Notify the guest if more CRWs are on our queue */ if (!list_empty(&private->crw) && private->crw_trigger) - eventfd_signal(private->crw_trigger, 1); + eventfd_signal(private->crw_trigger); return ret; } diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 43601816ea4e..bfb35cfce1ef 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -112,7 +112,7 @@ void vfio_ccw_sch_io_todo(struct work_struct *work) private->state = VFIO_CCW_STATE_IDLE; if (private->io_trigger) - eventfd_signal(private->io_trigger, 1); + eventfd_signal(private->io_trigger); } void vfio_ccw_crw_todo(struct work_struct *work) @@ -122,7 +122,7 @@ void vfio_ccw_crw_todo(struct work_struct *work) private = container_of(work, struct vfio_ccw_private, crw_work); if (!list_empty(&private->crw) && private->crw_trigger) - eventfd_signal(private->crw_trigger, 1); + eventfd_signal(private->crw_trigger); } /* diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index cba4971618ff..ea532a8a4a0c 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -421,7 +421,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private, case VFIO_IRQ_SET_DATA_NONE: { if (*ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; } case VFIO_IRQ_SET_DATA_BOOL: @@ -432,7 +432,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private, return -EFAULT; if (trigger && *ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; } case VFIO_IRQ_SET_DATA_EVENTFD: @@ -612,7 +612,7 @@ static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count) "Relaying device request to user (#%u)\n", count); - eventfd_signal(private->req_trigger, 1); + eventfd_signal(private->req_trigger); } else if (count == 0) { dev_notice(dev, "No device request channel registered, blocked until released by user\n"); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 4db538a55192..542b5be73a6a 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1794,7 +1794,7 @@ static void vfio_ap_mdev_request(struct vfio_device *vdev, unsigned int count) "Relaying device request to user (#%u)\n", count); - eventfd_signal(matrix_mdev->req_trigger, 1); + eventfd_signal(matrix_mdev->req_trigger); } else if (count == 0) { dev_notice(dev, "No device request registered, blocked until released by user\n"); diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index efe3e3b85769..fdd0fc7b8f25 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work) io_data->kiocb->ki_complete(io_data->kiocb, ret); if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) - eventfd_signal(io_data->ffs->ffs_eventfd, 1); + eventfd_signal(io_data->ffs->ffs_eventfd); if (io_data->read) kfree(io_data->to_free); @@ -2738,7 +2738,7 @@ static void __ffs_event_add(struct ffs_data *ffs, ffs->ev.types[ffs->ev.count++] = type; wake_up_locked(&ffs->ev.waitq); if (ffs->ffs_eventfd) - eventfd_signal(ffs->ffs_eventfd, 1); + eventfd_signal(ffs->ffs_eventfd); } static void ffs_event_add(struct ffs_data *ffs, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0ddd4b8abecb..1d24da79c399 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -493,7 +493,7 @@ static void vduse_vq_kick(struct vduse_virtqueue *vq) goto unlock; if (vq->kickfd) - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); else vq->kicked = true; unlock: @@ -911,7 +911,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev, eventfd_ctx_put(vq->kickfd); vq->kickfd = ctx; if (vq->ready && vq->kicked && vq->kickfd) { - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); vq->kicked = false; } spin_unlock(&vq->kick_lock); @@ -960,7 +960,7 @@ static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq) spin_lock_irq(&vq->irq_lock); if (vq->ready && vq->cb.trigger) { - eventfd_signal(vq->cb.trigger, 1); + eventfd_signal(vq->cb.trigger); signal = true; } spin_unlock_irq(&vq->irq_lock); @@ -1157,7 +1157,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, fput(f); break; } - ret = receive_fd(f, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index c51229fccbd6..d62fbfff20b8 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -54,7 +54,7 @@ static irqreturn_t vfio_fsl_mc_irq_handler(int irq_num, void *arg) { struct vfio_fsl_mc_irq *mc_irq = (struct vfio_fsl_mc_irq *)arg; - eventfd_signal(mc_irq->trigger, 1); + eventfd_signal(mc_irq->trigger); return IRQ_HANDLED; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1929103ee59a..1cbc990d42e0 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -443,7 +443,7 @@ static int vfio_pci_core_runtime_resume(struct device *dev) */ down_write(&vdev->memory_lock); if (vdev->pm_wake_eventfd_ctx) { - eventfd_signal(vdev->pm_wake_eventfd_ctx, 1); + eventfd_signal(vdev->pm_wake_eventfd_ctx); __vfio_pci_runtime_pm_exit(vdev); } up_write(&vdev->memory_lock); @@ -1883,7 +1883,7 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger, 1); + eventfd_signal(vdev->req_trigger); } else if (count == 0) { pci_warn(pdev, "No device request channel registered, blocked until released by user\n"); @@ -2302,7 +2302,7 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + eventfd_signal(vdev->err_trigger); mutex_unlock(&vdev->igate); diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index cbb4bcbfbf83..237beac83809 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -94,7 +94,7 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused) ctx = vfio_irq_ctx_get(vdev, 0); if (WARN_ON_ONCE(!ctx)) return; - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } } @@ -342,7 +342,7 @@ static irqreturn_t vfio_msihandler(int irq, void *arg) { struct eventfd_ctx *trigger = arg; - eventfd_signal(trigger, 1); + eventfd_signal(trigger); return IRQ_HANDLED; } @@ -689,11 +689,11 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, if (!ctx) continue; if (flags & VFIO_IRQ_SET_DATA_NONE) { - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t *bools = data; if (bools[i - start]) - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } } return 0; @@ -707,7 +707,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, if (flags & VFIO_IRQ_SET_DATA_NONE) { if (*ctx) { if (count) { - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); } else { eventfd_ctx_put(*ctx); *ctx = NULL; @@ -722,7 +722,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, trigger = *(uint8_t *)data; if (trigger && *ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c index 665197caed89..61a1bfb68ac7 100644 --- a/drivers/vfio/platform/vfio_platform_irq.c +++ b/drivers/vfio/platform/vfio_platform_irq.c @@ -155,7 +155,7 @@ static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id) spin_unlock_irqrestore(&irq_ctx->lock, flags); if (ret == IRQ_HANDLED) - eventfd_signal(irq_ctx->trigger, 1); + eventfd_signal(irq_ctx->trigger); return ret; } @@ -164,7 +164,7 @@ static irqreturn_t vfio_irq_handler(int irq, void *dev_id) { struct vfio_platform_irq *irq_ctx = dev_id; - eventfd_signal(irq_ctx->trigger, 1); + eventfd_signal(irq_ctx->trigger); return IRQ_HANDLED; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index da7ec77cdaff..173beda74b38 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -178,7 +178,7 @@ static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) struct eventfd_ctx *call_ctx = vq->call_ctx.ctx; if (call_ctx) - eventfd_signal(call_ctx, 1); + eventfd_signal(call_ctx); return IRQ_HANDLED; } @@ -189,7 +189,7 @@ static irqreturn_t vhost_vdpa_config_cb(void *private) struct eventfd_ctx *config_ctx = v->config_ctx; if (config_ctx) - eventfd_signal(config_ctx, 1); + eventfd_signal(config_ctx); return IRQ_HANDLED; } diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e0c181ad17e3..045f666b4f12 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2248,7 +2248,7 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, len -= l; if (!len) { if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); return 0; } } @@ -2271,7 +2271,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) log_used(vq, (used - (void __user *)vq->used), sizeof vq->used->flags); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return 0; } @@ -2289,7 +2289,7 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq) log_used(vq, (used - (void __user *)vq->used), sizeof *vhost_avail_event(vq)); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return 0; } @@ -2715,7 +2715,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, log_used(vq, offsetof(struct vring_used, idx), sizeof vq->used->idx); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return r; } @@ -2763,7 +2763,7 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { /* Signal the Guest tell them we used something up. */ if (vq->call_ctx.ctx && vhost_notify(dev, vq)) - eventfd_signal(vq->call_ctx.ctx, 1); + eventfd_signal(vq->call_ctx.ctx); } EXPORT_SYMBOL_GPL(vhost_signal); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f60d5f7bef94..9e942fcda5c3 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -249,7 +249,7 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \ - eventfd_signal((vq)->error_ctx, 1);\ + eventfd_signal((vq)->error_ctx);\ } while (0) enum { diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c index ac4037e9f947..4e845c6ca0b5 100644 --- a/drivers/virt/acrn/ioeventfd.c +++ b/drivers/virt/acrn/ioeventfd.c @@ -223,7 +223,7 @@ static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client, mutex_lock(&client->vm->ioeventfds_lock); p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type); if (p) - eventfd_signal(p->eventfd, 1); + eventfd_signal(p->eventfd); mutex_unlock(&client->vm->ioeventfds_lock); return 0; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 0eb337a8ec0f..35b6e306026a 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1147,7 +1147,7 @@ static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && ioreq->size == kioeventfd->addr_len && (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { - eventfd_signal(kioeventfd->eventfd, 1); + eventfd_signal(kioeventfd->eventfd); state = STATE_IORESP_READY; break; } @@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; - inode->i_mapping->private_data = ctx; + inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", @@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx) /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; - spin_lock(&i_mapping->private_lock); - i_mapping->private_data = NULL; + spin_lock(&i_mapping->i_private_lock); + i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&i_mapping->private_lock); + spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } @@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, rc = 0; - /* mapping->private_lock here protects against the kioctx teardown. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; + /* mapping->i_private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->i_private_lock); + ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; @@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, out_unlock: mutex_unlock(&ctx->ring_lock); out: - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return rc; } #else @@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } +struct aio_waiter { + struct wait_queue_entry w; + size_t min_nr; +}; + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned tail, pos, head; + unsigned tail, pos, head, avail; unsigned long flags; /* @@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); + + avail = tail > head + ? tail - head + : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1166,7 +1175,7 @@ static void aio_complete(struct aio_kiocb *iocb) * from IRQ context. */ if (iocb->ki_eventfd) - eventfd_signal(iocb->ki_eventfd, 1); + eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test @@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->wait)) { + struct aio_waiter *curr, *next; + unsigned long flags; + + spin_lock_irqsave(&ctx->wait.lock, flags); + list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) + if (avail >= curr->min_nr) { + list_del_init_careful(&curr->w.entry); + wake_up_process(curr->w.private); + } + spin_unlock_irqrestore(&ctx->wait.lock, flags); + } } static inline void iocb_put(struct aio_kiocb *iocb) @@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { - long ret = 0; + struct hrtimer_sleeper t; + struct aio_waiter w; + long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. @@ -1306,12 +1327,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + aio_read_events(ctx, min_nr, nr, event, &ret); + if (until == 0 || ret < 0 || ret >= min_nr) + return ret; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (until != KTIME_MAX) { + hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); + } + + init_wait(&w.w); + + while (1) { + unsigned long nr_got = ret; + + w.min_nr = min_nr - ret; + + ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); + if (!ret2 && !t.task) + ret2 = -ETIME; + + if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) + break; + + if (nr_got == ret) + schedule(); + } + + finish_wait(&ctx->wait, &w.w); + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + return ret; } diff --git a/fs/attr.c b/fs/attr.c index bdf5deb06ea9..5a13f0c8495f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f724c54fc8e..b6ff6f320198 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -875,7 +875,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * will not race with any other ebs. */ if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) @@ -1741,16 +1741,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start++; continue; } @@ -1764,7 +1764,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1816,9 +1816,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -1829,16 +1829,16 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) * crashing the machine for something we can survive anyway. */ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -3062,7 +3062,7 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (PagePrivate(page)) { subpage = (struct btrfs_subpage *)page->private; @@ -3085,14 +3085,14 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3116,7 +3116,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag detach_page_private(page); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3139,7 +3139,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ @@ -3520,7 +3520,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * allocate memory with i_private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3541,10 +3541,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, goto free_eb; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); @@ -3564,7 +3564,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * Thus needs no special handling in error path. */ btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; @@ -4569,12 +4569,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared page private, as if we have * released all ebs in the page, the page private should be cleared now. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } @@ -4590,9 +4590,9 @@ int try_release_extent_buffer(struct page *page) * We need to make sure nobody is changing page->private, as we rely on * page->private as the pointer to extent buffer. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 1; } @@ -4607,10 +4607,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..2347cf15278b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -200,7 +200,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; atomic_inc(&subpage->eb_refs); @@ -215,7 +215,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; ASSERT(atomic_read(&subpage->eb_refs)); diff --git a/fs/buffer.c b/fs/buffer.c index 967f34b70aa8..5ffc44ab4854 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync); * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. + * i_private_lock. * - * Hack idea: for the blockdev mapping, private_lock contention + * Hack idea: for the blockdev mapping, i_private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. + * succeeds, there is no need to take i_private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -204,7 +204,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->private_lock); + spin_lock(&bd_mapping->i_private_lock); head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << bd_inode->i_blkbits); } out_unlock: - spin_unlock(&bd_mapping->private_lock); + spin_unlock(&bd_mapping->i_private_lock); folio_put(folio); out: return ret; @@ -467,25 +467,25 @@ EXPORT_SYMBOL(mark_buffer_async_write); * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. + * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space + * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. + * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, + * mapping->i_private_list will always be protected by the backing blockdev's + * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. + * ->i_private_list must be from the same address_space: the blockdev's. * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) + * address_spaces which do not place buffers at ->i_private_list via these + * utility functions are free to use i_private_lock and i_private_list for + * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The @@ -508,7 +508,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); */ /* - * The buffer's backing address_space's private_lock must be held + * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { @@ -519,7 +519,7 @@ static void __remove_assoc_queue(struct buffer_head *bh) int inode_has_buffers(struct inode *inode) { - return !list_empty(&inode->i_data.private_list); + return !list_empty(&inode->i_data.i_private_list); } /* @@ -561,7 +561,7 @@ repeat: * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * - * Starts I/O against the buffers at mapping->private_list, and waits upon + * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). @@ -570,13 +570,13 @@ repeat: */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->private_data; + struct address_space *buffer_mapping = mapping->i_private_data; - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); + return fsync_buffers_list(&buffer_mapping->i_private_lock, + &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); @@ -673,17 +673,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); - if (!mapping->private_data) { - mapping->private_data = buffer_mapping; + if (!mapping->i_private_data) { + mapping->i_private_data = buffer_mapping; } else { - BUG_ON(mapping->private_data != buffer_mapping); + BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -706,7 +706,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * We use private_lock to lock against try_to_free_buffers while using the + * We use i_private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. * @@ -718,7 +718,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) struct buffer_head *head; bool newly_dirty; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -734,7 +734,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) */ folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); @@ -827,7 +827,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); @@ -851,7 +851,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. */ @@ -859,13 +859,13 @@ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); @@ -882,10 +882,10 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { @@ -894,7 +894,7 @@ int remove_inode_buffers(struct inode *inode) } __remove_assoc_queue(bh); } - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } return ret; } @@ -1064,11 +1064,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->private_lock); + spin_lock(&inode->i_mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, (sector_t)index << sizebits, size); - spin_unlock(&inode->i_mapping->private_lock); + spin_unlock(&inode->i_mapping->i_private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; failed: @@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh) if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * block_dirty_folio() via private_lock. try_to_free_buffers + * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *create_empty_buffers(struct folio *folio, @@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh); tail->b_this_page = head; - spin_lock(&folio->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { @@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh != head); } folio_attach_private(folio, head); - spin_unlock(&folio->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return head; } @@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) if (!folio_buffers(folio)) continue; /* - * We use folio lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ @@ -2883,7 +2883,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's i_private_lock. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio @@ -2894,7 +2894,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with - * private_lock. + * i_private_lock. * * try_to_free_buffers() is non-blocking. */ @@ -2946,7 +2946,7 @@ bool try_to_free_buffers(struct folio *folio) goto out; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* @@ -2959,13 +2959,13 @@ bool try_to_free_buffers(struct folio *folio) * the folio's buffers clean. We discover that here and clean * the folio also. * - * private_lock must be held over this entire operation in order + * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) folio_cancel_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; @@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; - void *saddr = 0; + void *saddr = NULL; int ret = 0; if (!zero_edge) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 20533266ade6..60456263a338 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, loff_t offset = iocb->ki_pos; const loff_t end = offset + count; struct dio *dio; - struct dio_submit sdio = { 0, }; + struct dio_submit sdio = { NULL, }; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); diff --git a/fs/eventfd.c b/fs/eventfd.c index 33a918f9566c..ad8186d47ba7 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,7 +43,17 @@ struct eventfd_ctx { int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) +/** + * eventfd_signal_mask - Increment the event counter + * @ctx: [in] Pointer to the eventfd context. + * @mask: [in] poll mask + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + */ +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; @@ -56,45 +66,23 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) - return 0; + return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; - if (ULLONG_MAX - ctx->count < n) - n = ULLONG_MAX - ctx->count; - ctx->count += n; + if (ctx->count < ULLONG_MAX) + ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); - - return n; -} - -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. - */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) -{ - return eventfd_signal_mask(ctx, n, 0); } -EXPORT_SYMBOL_GPL(eventfd_signal); +EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) - ida_simple_remove(&eventfd_ida, ctx->id); + ida_free(&eventfd_ida, ctx->id); kfree(ctx); } @@ -407,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..0558c8c986d4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, @@ -3213,7 +3213,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? */ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } diff --git a/fs/file.c b/fs/file.c index 5fb0b146e79e..3b683b9101d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); /** - * pick_file - return file associatd with fd + * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * + * Doesn't take a separate reference count. + * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ -static struct file *pick_file(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) return NULL; @@ -660,7 +664,7 @@ int close_fd(unsigned fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; @@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); @@ -795,26 +799,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) return 0; } -/* - * See close_fd_get_file() below, this variant assumes current->files->file_lock - * is held. - */ -struct file *__close_fd_get_file(unsigned int fd) -{ - return pick_file(current->files, fd); -} - -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file. +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. + * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -struct file *close_fd_get_file(unsigned int fd) +struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; @@ -959,31 +958,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; + unsigned long nospec_mask; - if (unlikely(fd >= fdt->max_fds)) - return NULL; - - fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* - * Ok, we have a file pointer. However, because we do - * this all locklessly under RCU, we may be racing with - * that file being closed. - * - * Such a race can take two forms: - * - * (a) the file ref already went down to zero and the - * file hasn't been reused yet or the file count - * isn't zero but the file has already been reused. + * fdentry points to the 'fd' offset, or fdt->fd[0]. + * Loading from fdt->fd[0] is always safe, because the + * array always exists. */ - file = __get_file_rcu(fdentry); + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ + file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; - if (unlikely(IS_ERR(file))) + /* + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. + * + * We need to confirm it by incrementing the refcount + * and then check the lookup again. + * + * atomic_long_inc_not_zero() gives us a full memory + * barrier. We only really need an 'acquire' one to + * protect the loads below, but we don't have that. + */ + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) continue; /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes @@ -991,7 +1004,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } @@ -1128,13 +1142,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ - if (atomic_read_acquire(&files->count) == 1) { + if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask); + file = __fget_files(files, fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; @@ -1282,7 +1296,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1296,7 +1310,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1321,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) __receive_sock(file); return new_fd; } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1336,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..3ba764d73fc9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -75,18 +75,6 @@ static inline void file_free(struct file *f) } } -void release_empty_file(struct file *f) -{ - WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); - if (atomic_long_dec_and_test(&f->f_count)) { - security_file_free(f); - put_cred(f->f_cred); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - kmem_cache_free(filp_cachep, f); - } -} - /* * Return the total number of open files in the system */ @@ -419,7 +407,7 @@ static void delayed_fput(struct work_struct *unused) static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_rcuhead)); + __fput(container_of(work, struct file, f_task_work)); } /* @@ -445,9 +433,13 @@ void fput(struct file *file) if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_rcuhead, ____fput); - if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d6bf1f8c25dc..d8b619ed2f1e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1213,7 +1213,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..00ce89bdf32c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b791adf02e5..b0cb70400996 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,8 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @op: direction of I/O - * @op_flags: request op flags + * @opf: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -43,6 +42,8 @@ struct hfsplus_wd { * that starts at the rounded-down address. As long as the data was * read using hfsplus_submit_bio() and the same buffer is used things * will work correctly. + * + * Returns: %0 on success else -errno code */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..05609ab15cbc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. */ - resv_map = (struct resv_map *)(&inode->i_data)->private_data; + resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); @@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); - inode->i_mapping->private_data = resv_map; + inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: diff --git a/fs/inode.c b/fs/inode.c index f238d987dec9..6cdb017f45c6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -209,7 +209,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, @@ -398,8 +398,8 @@ static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); + INIT_LIST_HEAD(&mapping->i_private_list); + spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } @@ -620,7 +620,7 @@ void clear_inode(struct inode *inode) * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); - BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); @@ -1836,37 +1836,37 @@ EXPORT_SYMBOL(bmap); * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, +static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) - return 1; + return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) - return 1; + return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) - return 1; + return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) - return 1; + return true; /* * Good, we can skip the atime update: */ - return 0; + return false; } /** @@ -2404,7 +2404,7 @@ EXPORT_SYMBOL(inode_init_owner); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..a7469ddba9b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,7 +94,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); -void release_empty_file(struct file *f); static inline void file_put_write_access(struct file *file) { @@ -180,7 +179,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern struct file *__close_fd_get_file(unsigned int fd); +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 57d1dedf3f8f..64c5205e2b5e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -9,8 +9,16 @@ #include "internal.h" +/* + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, + * never from raw values. These are just internal helpers. + */ +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } + struct mnt_idmap { - struct user_namespace *owner; + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; refcount_t count; }; @@ -20,25 +28,11 @@ struct mnt_idmap { * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /** - * check_fsmapping - check whether an mount idmapping is allowed - * @idmap: idmap of the relevent mount - * @sb: super block of the filesystem - * - * Return: true if @idmap is allowed, false if not. - */ -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb) -{ - return idmap->owner != sb->s_user_ns; -} - -/** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * @@ -53,26 +47,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns) } /** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} - -/** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping @@ -81,8 +55,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kuid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() @@ -94,13 +68,12 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, - struct user_namespace *fs_userns, - kuid_t kuid) + struct user_namespace *fs_userns, + kuid_t kuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); @@ -108,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); @@ -121,8 +94,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid); * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() @@ -136,9 +109,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); @@ -146,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); @@ -165,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) @@ -193,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) @@ -228,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid) #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); +static int copy_mnt_idmap(struct uid_gid_map *map_from, + struct uid_gid_map *map_to) +{ + struct uid_gid_extent *forward, *reverse; + u32 nr_extents = READ_ONCE(map_from->nr_extents); + /* Pairs with smp_wmb() when writing the idmapping. */ + smp_rmb(); + + /* + * Don't blindly copy @map_to into @map_from if nr_extents is + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we + * read @nr_extents someone could have written an idmapping and + * then we might end up with inconsistent data. So just don't do + * anything at all. + */ + if (nr_extents == 0) + return 0; + + /* + * Here we know that nr_extents is greater than zero which means + * a map has been written. Since idmappings can't be changed + * once they have been written we know that we can safely copy + * from @map_to into @map_from. + */ + + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + *map_to = *map_from; + return 0; + } + + forward = kmemdup(map_from->forward, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!forward) + return -ENOMEM; + + reverse = kmemdup(map_from->reverse, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!reverse) { + kfree(forward); + return -ENOMEM; + } + + /* + * The idmapping isn't exposed anywhere so we don't need to care + * about ordering between extent pointers and @nr_extents + * initialization. + */ + map_to->forward = forward; + map_to->reverse = reverse; + map_to->nr_extents = nr_extents; + return 0; +} + +static void free_mnt_idmap(struct mnt_idmap *idmap) +{ + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->uid_map.forward); + kfree(idmap->uid_map.reverse); + } + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->gid_map.forward); + kfree(idmap->gid_map.reverse); + } + kfree(idmap); +} + struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; + int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); - idmap->owner = get_user_ns(mnt_userns); refcount_set(&idmap->count, 1); + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); + if (!ret) + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); + if (ret) { + free_mnt_idmap(idmap); + idmap = ERR_PTR(ret); + } return idmap; } @@ -267,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..faae721e4d63 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -2467,7 +2467,7 @@ static int handle_lookup_down(struct nameidata *nd) return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); @@ -2522,7 +2522,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, return retval; } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { @@ -3158,7 +3158,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -3646,7 +3646,7 @@ static int do_open(struct nameidata *nd, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, @@ -3785,10 +3785,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - if (unlikely(file->f_mode & FMODE_OPENED)) - fput(file); - else - release_empty_file(file); + fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -3954,7 +3951,7 @@ EXPORT_SYMBOL(user_path_create); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -4080,7 +4077,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4161,7 +4158,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) @@ -4290,7 +4287,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) @@ -4443,7 +4440,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) @@ -4535,7 +4532,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..78366f114515 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3026,6 +3026,7 @@ static inline bool path_overmounted(const struct path *path) * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount + * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. @@ -4288,7 +4289,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) + if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b664caea8b4e..7248705faef4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return req; } @@ -769,13 +769,13 @@ static void nfs_inode_add_request(struct nfs_page *req) * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); folio_set_private(folio); folio->private = req; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. @@ -796,13 +796,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(folio && !folio_test_swapcache(folio))) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index f861f3a0bf5c..2ead36dfa2a3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -214,7 +214,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, /* * The page may not be locked, eg if called from try_to_unmap_one() */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -230,7 +230,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..548f3b51aa5f 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = { * * If the page does not have buffers, we create them and set them uptodate. * The page may not be locked which is why we need to handle the buffers under - * the mapping->private_lock. Once the buffers are marked dirty we no longer + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer * need the lock since try_to_free_buffers() does not free dirty buffers. */ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { @@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; @@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { break; set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 4596c90e7b7c..629723a8d712 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) /** * ntfs_dir_fsync - sync a directory to disk * @filp: directory to be synced - * @dentry: dentry describing the directory to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and diff --git a/fs/open.c b/fs/open.c index 3494a9cd8046..954d8fcbb635 100644 --- a/fs/open.c +++ b/fs/open.c @@ -442,7 +442,8 @@ static const struct cred *access_override_creds(void) * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous - * cred accesses will keep things non-RCY. + * cred accesses will keep things non-racy to avoid RCU + * freeing. */ override_cred->non_rcu = 1; @@ -1574,7 +1575,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) int retval; struct file *file; - file = close_fd_get_file(fd); + file = file_close_fd(fd); if (!file) return -EBADF; diff --git a/fs/pipe.c b/fs/pipe.c index 804a7d789452..8d9286a1f2e8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) bool was_empty = false; bool wake_next_writer = false; + /* + * Reject writing to watch queue pipes before the point where we lock + * the pipe. + * Otherwise, lockdep would be unhappy if the caller already has another + * pipe locked. + * If we had to support locking a normal pipe and a notification pipe at + * the same time, we could set up lockdep annotations for that, but + * since we don't actually need that, it's simpler to just bail here. + */ + if (pipe_has_watch_queue(pipe)) + return -EXDEV; + /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; @@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (pipe_has_watch_queue(pipe)) { - ret = -EXDEV; - goto out; - } - /* * If it wasn't empty we try to merge new data into * the last buffer. @@ -1317,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + if (!pipe_has_watch_queue(pipe)) { + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; + } + spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ @@ -1368,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) if (ret < 0) goto out_revert_acct; - pipe->max_usage = nr_slots; - pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a05fe94970ce..e1af20893ebe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..1801e409a061 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = NULL; if (file) { - struct inode *inode = file_inode(vma->vm_file); + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2138ee7d271d..5faf702f8d15 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, INITIALIZE_PATH(path); int item_len = 0; int tb_init = 0; - struct cpu_key cpu_key; + struct cpu_key cpu_key = {}; int retval; int quota_cut_bytes = 0; diff --git a/fs/stat.c b/fs/stat.c index f721d26ec3f7..8a8f7e97a742 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -41,7 +41,7 @@ * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) diff --git a/fs/super.c b/fs/super.c index 076392396e72..6fe482371633 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,7 @@ static void destroy_unused_super(struct super_block *s) static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { - struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index b9d83652c097..e32bee4345fb 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -58,15 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { - return -ENOSYS; -} - -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, - unsigned mask) -{ - return -ENOSYS; } static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) @@ -92,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) #endif +static inline void eventfd_signal(struct eventfd_ctx *ctx) +{ + eventfd_signal_mask(ctx, 0); +} + #endif /* _LINUX_EVENTFD_H */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index bc4c3287a65e..78c8326d74ae 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -83,12 +83,17 @@ struct dentry; static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) { - fd = array_index_nospec(fd, fdt->max_fds); - return rcu_dereference_raw(fdt->fd[fd]); - } - return NULL; + unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); + struct file *needs_masking; + + /* + * 'mask' is zero for an out-of-bounds fd, all ones for ok. + * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. + * + * Accessing fdt->fd[0] is ok, but needs masking of the result. + */ + needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); + return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) @@ -114,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern struct file *close_fd_get_file(unsigned int fd); +extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); diff --git a/include/linux/file.h b/include/linux/file.h index 6e9099d29343..6834a29338c4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,18 +96,8 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); -extern int receive_fd(struct file *file, unsigned int o_flags); - -static inline int receive_fd_user(struct file *file, int __user *ufd, - unsigned int o_flags) -{ - if (ufd == NULL) - return -EFAULT; - return __receive_fd(file, ufd, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..3d58376ed39e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -484,9 +484,9 @@ struct address_space { unsigned long flags; struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; - void *private_data; + spinlock_t i_private_lock; + struct list_head i_private_list; + void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) */ struct file { union { + /* fput() uses task work when closing and freeing file (default). */ + struct callback_head f_task_work; + /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; @@ -2523,20 +2525,28 @@ struct file *backing_file_open(const struct path *user_path, int flags, struct path *backing_file_user_path(struct file *f); /* - * file_user_path - get the path to display for memory mapped file - * * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying - * filesystem. When the mapped file path is displayed to user (e.g. via - * /proc/<pid>/maps), this helper should be used to get the path to display - * to the user, which is the path of the fd that user has requested to map. + * filesystem. When the mapped file path and inode number are displayed to + * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the + * path and inode number to display to the user, which is the path of the fd + * that user has requested to map and the inode number that would be returned + * by fstat() on that same fd. */ +/* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } +/* Get the inode whose inode number to display in /proc/<pid>/maps */ +static inline const struct inode *file_user_inode(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return d_inode(backing_file_user_path(f)->dentry); + return file_inode(f); +} static inline struct file *file_clone_open(struct file *file) { diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b8da2db4ecd2..cd4d5c8781f5 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb); - #endif /* _LINUX_MNT_IDMAPPING_H */ diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb..415a7ca2b882 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -17,6 +17,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct uid_gid_map; typedef struct { uid_t val; @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c..6d0a14f7019d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { - might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } diff --git a/include/net/scm.h b/include/net/scm.h index e8c76b4be2fe..cf68acec4d70 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,6 +5,7 @@ #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> +#include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> @@ -208,5 +209,13 @@ static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, scm_destroy_cred(scm); } +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) +{ + if (!ufd) + return -EFAULT; + return receive_fd(f, ufd, flags); +} + #endif /* __LINUX_NET_SCM_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9626a363f121..9839016f82ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -561,7 +561,7 @@ static void io_eventfd_ops(struct rcu_head *rcu) int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free @@ -597,7 +597,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) goto out; if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fb73adb89067..74fc22461f48 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -241,7 +241,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - file = __close_fd_get_file(close->fd); + file = file_close_fd_locked(files, close->fd); spin_unlock(&files->file_lock); if (!file) goto err; diff --git a/kernel/pid.c b/kernel/pid.c index 6500ef956f2f..b52b10865454 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = receive_fd(file, O_CLOEXEC); + ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..aca7b437882e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn */ list_del_init(&addfd->list); if (!addfd->setfd) - fd = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index eabe8bcc7042..ce4d99df5f0e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -231,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -/** +/* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ @@ -241,7 +241,7 @@ struct idmap_key { u32 count; /* == 0 unless used with map_id_range_down() */ }; -/** +/* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ @@ -271,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e) return 1; } -/** +/* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -288,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou sizeof(struct uid_gid_extent), cmp_map_id); } -/** +/* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -332,12 +332,12 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } -/** +/* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -358,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) return NULL; } -/** +/* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) sizeof(struct uid_gid_extent), cmp_map_id); } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; @@ -770,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } -/** +/* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -839,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b) return 0; } -/** +/* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 778b4056700f..03b90d7d2175 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; ret = -ENOMEM; - pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6feb3e0630d1..c466551e2fd9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1141,7 +1141,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ - return (struct resv_map *)(&inode->i_data)->private_data; + return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 651b86ff03dd..73692cd8c142 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4379,7 +4379,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * only one element of the array here. */ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) - eventfd_signal(t->entries[i].eventfd, 1); + eventfd_signal(t->entries[i].eventfd); /* i = current_threshold + 1 */ i++; @@ -4391,7 +4391,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * only one element of the array here. */ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) - eventfd_signal(t->entries[i].eventfd, 1); + eventfd_signal(t->entries[i].eventfd); /* Update current_threshold */ t->current_threshold = i - 1; @@ -4431,7 +4431,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) spin_lock(&memcg_oom_lock); list_for_each_entry(ev, &memcg->oom_notify, list) - eventfd_signal(ev->eventfd, 1); + eventfd_signal(ev->eventfd); spin_unlock(&memcg_oom_lock); return 0; @@ -4650,7 +4650,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, /* already in OOM ? */ if (memcg->under_oom) - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); spin_unlock(&memcg_oom_lock); return 0; @@ -4942,7 +4942,7 @@ static void memcg_event_remove(struct work_struct *work) event->unregister_event(memcg, event->eventfd); /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); + eventfd_signal(event->eventfd); eventfd_ctx_put(event->eventfd); kfree(event); diff --git a/mm/migrate.c b/mm/migrate.c index 397f2a6e34cb..36c011a36b71 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -753,7 +753,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, recheck_buffers: busy = false; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); bh = head; do { if (atomic_read(&bh->b_count)) { @@ -767,7 +767,7 @@ recheck_buffers: rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -794,7 +794,7 @@ recheck_buffers: rc = MIGRATEPAGE_SUCCESS; unlock_buffers: if (check_refs) - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head; do { unlock_buffer(bh); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 22c6689d9302..bd5183dfd879 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -169,7 +169,7 @@ static bool vmpressure_event(struct vmpressure *vmpr, continue; if (level < ev->level) continue; - eventfd_signal(ev->efd, 1); + eventfd_signal(ev->efd); ret = true; } mutex_unlock(&vmpr->events_lock); diff --git a/net/compat.c b/net/compat.c index 6564720f32b7..485db8ee9b28 100644 --- a/net/compat.c +++ b/net/compat.c @@ -297,7 +297,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) int err = 0, i; for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } diff --git a/net/core/scm.c b/net/core/scm.c index 7dc47c17d863..db3f7cd519c2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -325,7 +325,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 69ba0281f9e0..2284b3751240 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -234,10 +234,10 @@ static void mtty_trigger_interrupt(struct mdev_state *mdev_state) if (is_msi(mdev_state)) { if (mdev_state->msi_evtfd) - eventfd_signal(mdev_state->msi_evtfd, 1); + eventfd_signal(mdev_state->msi_evtfd); } else if (is_intx(mdev_state)) { if (mdev_state->intx_evtfd && !mdev_state->intx_mask) { - eventfd_signal(mdev_state->intx_evtfd, 1); + eventfd_signal(mdev_state->intx_evtfd); mdev_state->intx_mask = true; } } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 8247a7c69c36..27f9f679ed15 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -26,6 +26,7 @@ TARGETS += filesystems TARGETS += filesystems/binderfs TARGETS += filesystems/epoll TARGETS += filesystems/fat +TARGETS += filesystems/overlayfs TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/overlayfs/.gitignore b/tools/testing/selftests/filesystems/overlayfs/.gitignore new file mode 100644 index 000000000000..52ae618fdd98 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +dev_in_maps diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile new file mode 100644 index 000000000000..56b2b48a765b --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_PROGS := dev_in_maps + +CFLAGS := -Wall -Werror + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c new file mode 100644 index 000000000000..e19ab0e85709 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <inttypes.h> +#include <unistd.h> +#include <stdio.h> + +#include <linux/unistd.h> +#include <linux/types.h> +#include <linux/mount.h> +#include <sys/syscall.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <sys/mman.h> +#include <sched.h> +#include <fcntl.h> + +#include "../../kselftest.h" +#include "log.h" + +static int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} + +static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +static int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags); +} + +static long get_file_dev_and_inode(void *addr, struct statx *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) + return pr_perror("fopen(/proc/self/maps)"); + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) + return pr_perror("unable to parse: %s", buf); + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + return pr_err("unable to find the mapping"); +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd == -1) + return pr_perror("fsopen(tmpfs)"); + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) + return pr_perror("FSCONFIG_CMD_CREATE"); + + tmpfs = sys_fsmount(fsfd, 0, 0); + if (tmpfs == -1) + return pr_perror("fsmount"); + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. */ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) + return pr_perror("move_mount"); + close(tmpfs); + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) + return pr_perror("mkdir"); + + fsfd = sys_fsopen("overlay", 0); + if (fsfd == -1) + return pr_perror("fsopen(overlay)"); + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) + return pr_perror("fsconfig"); + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) + return pr_perror("fsconfig"); + ovl = sys_fsmount(fsfd, 0, 0); + if (ovl == -1) + return pr_perror("fsmount"); + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int test(void) +{ + struct statx stx, mstx; + int ovl, fd; + void *addr; + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) + return pr_perror("openat"); + + addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + return pr_perror("mmap"); + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (statx(fd, "", AT_EMPTY_PATH | AT_STATX_SYNC_AS_STAT, STATX_INO, &stx)) + return pr_perror("statx"); + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) + return pr_fail("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + + ksft_test_result_pass("devices are matched\n"); + return 0; +} + +int main(int argc, char **argv) +{ + int fsfd; + + fsfd = sys_fsopen("overlay", 0); + if (fsfd == -1) { + ksft_test_result_skip("unable to create overlay mount\n"); + return 1; + } + close(fsfd); + + /* Create a new mount namespace to not care about cleaning test mounts. */ + if (unshare(CLONE_NEWNS) == -1) { + ksft_test_result_skip("unable to create a new mount namespace\n"); + return 1; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("mount"); + return 1; + } + + ksft_set_plan(1); + + if (test()) + return 1; + + ksft_exit_pass(); + return 0; +} diff --git a/tools/testing/selftests/filesystems/overlayfs/log.h b/tools/testing/selftests/filesystems/overlayfs/log.h new file mode 100644 index 000000000000..db64df2a8483 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/log.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTEST_TIMENS_LOG_H__ +#define __SELFTEST_TIMENS_LOG_H__ + +#define pr_msg(fmt, lvl, ...) \ + ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \ + lvl, __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + ({ \ + ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_fail(fmt, ...) \ + ({ \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__) + +#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 89912a17f5d5..c0e230f4c3e9 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -61,7 +61,7 @@ static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, srcu_read_lock_held(&resampler->kvm->irq_srcu)) - eventfd_signal(irqfd->resamplefd, 1); + eventfd_signal(irqfd->resamplefd); } /* @@ -786,7 +786,7 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; - eventfd_signal(p->eventfd, 1); + eventfd_signal(p->eventfd); return 0; } |