diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 10:07:42 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 10:07:42 -0700 |
commit | 84da111de0b4be15bd500deff773f5116f39f7be (patch) | |
tree | 76b5796f8258397bf7a3926b742a89166a8501ef /drivers/infiniband | |
parent | 227c3e9eb5cf3552c2cc83225df6d14adb05f8e8 (diff) | |
parent | 62974fc389b364d8af70e044836362222bd3ae53 (diff) |
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This is more cleanup and consolidation of the hmm APIs and the very
strongly related mmu_notifier interfaces. Many places across the tree
using these interfaces are touched in the process. Beyond that a
cleanup to the page walker API and a few memremap related changes
round out the series:
- General improvement of hmm_range_fault() and related APIs, more
documentation, bug fixes from testing, API simplification &
consolidation, and unused API removal
- Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE,
and make them internal kconfig selects
- Hoist a lot of code related to mmu notifier attachment out of
drivers by using a refcount get/put attachment idiom and remove the
convoluted mmu_notifier_unregister_no_release() and related APIs.
- General API improvement for the migrate_vma API and revision of its
only user in nouveau
- Annotate mmu_notifiers with lockdep and sleeping region debugging
Two series unrelated to HMM or mmu_notifiers came along due to
dependencies:
- Allow pagemap's memremap_pages family of APIs to work without
providing a struct device
- Make walk_page_range() and related use a constant structure for
function pointers"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
libnvdimm: Enable unit test infrastructure compile checks
mm, notifier: Catch sleeping/blocking for !blockable
kernel.h: Add non_block_start/end()
drm/radeon: guard against calling an unpaired radeon_mn_unregister()
csky: add missing brackets in a macro for tlb.h
pagewalk: use lockdep_assert_held for locking validation
pagewalk: separate function pointers from iterator data
mm: split out a new pagewalk.h header from mm.h
mm/mmu_notifiers: annotate with might_sleep()
mm/mmu_notifiers: prime lockdep
mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
mm/hmm: hmm_range_fault() infinite loop
mm/hmm: hmm_range_fault() NULL pointer bug
mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
mm/mmu_notifiers: remove unregister_no_release
RDMA/odp: remove ib_ucontext from ib_umem
RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
RDMA/mlx5: Use ib_umem_start instead of umem.address
...
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/Kconfig | 1 | ||||
-rw-r--r-- | drivers/infiniband/core/device.c | 1 | ||||
-rw-r--r-- | drivers/infiniband/core/umem.c | 54 | ||||
-rw-r--r-- | drivers/infiniband/core/umem_odp.c | 524 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs_cmd.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs_main.c | 1 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 9 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 13 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 38 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 88 |
10 files changed, 344 insertions, 390 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 85e103b147cc..b44b1c322ec8 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -55,6 +55,7 @@ config INFINIBAND_ON_DEMAND_PAGING bool "InfiniBand on-demand paging support" depends on INFINIBAND_USER_MEM select MMU_NOTIFIER + select INTERVAL_TREE default y ---help--- On demand paging support for the InfiniBand subsystem. diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ea8661a00651..b5631b8a0397 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2562,6 +2562,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, init_port); + SET_DEVICE_OP(dev_ops, invalidate_range); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 56553668256f..41f9e268e3fb 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -184,9 +184,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * If access flags indicate ODP memory, avoid pinning. Instead, stores - * the mm for future page fault handling in conjunction with MMU notifiers. - * * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -231,36 +228,19 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - if (access & IB_ACCESS_ON_DEMAND) { - umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - umem->is_odp = 1; - } else { - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - } + if (access & IB_ACCESS_ON_DEMAND) + return ERR_PTR(-EOPNOTSUPP); - umem->context = context; + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->ibdev = context->device; umem->length = size; umem->address = addr; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); - if (access & IB_ACCESS_ON_DEMAND) { - if (WARN_ON_ONCE(!context->invalidate_range)) { - ret = -EINVAL; - goto umem_kfree; - } - - ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); - if (ret) - goto umem_kfree; - return umem; - } - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; @@ -346,15 +326,6 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void __ib_umem_release_tail(struct ib_umem *umem) -{ - mmdrop(umem->owning_mm); - if (umem->is_odp) - kfree(to_ib_umem_odp(umem)); - else - kfree(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release @@ -363,17 +334,14 @@ void ib_umem_release(struct ib_umem *umem) { if (!umem) return; + if (umem->is_odp) + return ib_umem_odp_release(to_ib_umem_odp(umem)); - if (umem->is_odp) { - ib_umem_odp_release(to_ib_umem_odp(umem)); - __ib_umem_release_tail(umem); - return; - } - - __ib_umem_release(umem->context->device, umem, 1); + __ib_umem_release(umem->ibdev, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); - __ib_umem_release_tail(umem); + mmdrop(umem->owning_mm); + kfree(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c0e15db34680..9aebe9ce8b07 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -39,44 +39,14 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hugetlb.h> -#include <linux/interval_tree_generic.h> +#include <linux/interval_tree.h> #include <linux/pagemap.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> #include <rdma/ib_umem_odp.h> -/* - * The ib_umem list keeps track of memory regions for which the HW - * device request to receive notification when the related memory - * mapping is changed. - * - * ib_umem_lock protects the list. - */ - -static u64 node_start(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_start(umem_odp); -} - -/* Note that the representation of the intervals in the interval tree - * considers the ending point as contained in the interval, while the - * function ib_umem_end returns the first address which is not contained - * in the umem. - */ -static u64 node_last(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_end(umem_odp) - 1; -} - -INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, - node_start, node_last, static, rbt_ib_umem) +#include "uverbs.h" static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { @@ -104,31 +74,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); } -static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, - u64 start, u64 end, void *cookie) -{ - /* - * Increase the number of notifiers running, to - * prevent any further fault handling on this MR. - */ - ib_umem_notifier_start_account(umem_odp); - complete_all(&umem_odp->notifier_completion); - umem_odp->umem.context->invalidate_range( - umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - return 0; -} - static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + struct rb_node *node; down_read(&per_mm->umem_rwsem); - if (per_mm->active) - rbt_ib_umem_for_each_in_range( - &per_mm->umem_tree, 0, ULLONG_MAX, - ib_umem_notifier_release_trampoline, true, NULL); + if (!per_mm->mn.users) + goto out; + + for (node = rb_first_cached(&per_mm->umem_tree); node; + node = rb_next(node)) { + struct ib_umem_odp *umem_odp = + rb_entry(node, struct ib_umem_odp, interval_tree.rb); + + /* + * Increase the number of notifiers running, to prevent any + * further fault handling on this MR. + */ + ib_umem_notifier_start_account(umem_odp); + complete_all(&umem_odp->notifier_completion); + umem_odp->umem.ibdev->ops.invalidate_range( + umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + } + +out: up_read(&per_mm->umem_rwsem); } @@ -136,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); + item->umem.ibdev->ops.invalidate_range(item, start, end); return 0; } @@ -152,10 +125,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - if (!per_mm->active) { + if (!per_mm->mn.users) { up_read(&per_mm->umem_rwsem); /* - * At this point active is permanently set and visible to this + * At this point users is permanently zero and visible to this * CPU without a lock, that fact is relied on to skip the unlock * in range_end. */ @@ -185,7 +158,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (unlikely(!per_mm->active)) + if (unlikely(!per_mm->mn.users)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, @@ -194,212 +167,250 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, up_read(&per_mm->umem_rwsem); } -static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, -}; - -static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - up_write(&per_mm->umem_rwsem); -} - -static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - complete_all(&umem_odp->notifier_completion); - - up_write(&per_mm->umem_rwsem); -} - -static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, - struct mm_struct *mm) +static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; - int ret; per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); if (!per_mm) return ERR_PTR(-ENOMEM); - per_mm->context = ctx; - per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - per_mm->active = true; + WARN_ON(mm != current->mm); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); + return &per_mm->mn; +} - WARN_ON(mm != current->mm); - - per_mm->mn.ops = &ib_umem_notifiers; - ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); - if (ret) { - dev_err(&ctx->device->dev, - "Failed to register mmu_notifier %d\n", ret); - goto out_pid; - } +static void ib_umem_free_notifier(struct mmu_notifier *mn) +{ + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - list_add(&per_mm->ucontext_list, &ctx->per_mm_list); - return per_mm; + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); -out_pid: put_pid(per_mm->tgid); kfree(per_mm); - return ERR_PTR(ret); } -static int get_per_mm(struct ib_umem_odp *umem_odp) +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, + .alloc_notifier = ib_umem_alloc_notifier, + .free_notifier = ib_umem_free_notifier, +}; + +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *ctx = umem_odp->umem.context; struct ib_ucontext_per_mm *per_mm; + struct mmu_notifier *mn; + int ret; - /* - * Generally speaking we expect only one or two per_mm in this list, - * so no reason to optimize this search today. - */ - mutex_lock(&ctx->per_mm_list_lock); - list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { - if (per_mm->mm == umem_odp->umem.owning_mm) - goto found; + umem_odp->umem.is_odp = 1; + if (!umem_odp->is_implicit_odp) { + size_t page_size = 1UL << umem_odp->page_shift; + size_t pages; + + umem_odp->interval_tree.start = + ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + umem_odp->umem.length, + &umem_odp->interval_tree.last)) + return -EOVERFLOW; + umem_odp->interval_tree.last = + ALIGN(umem_odp->interval_tree.last, page_size); + if (unlikely(umem_odp->interval_tree.last < page_size)) + return -EOVERFLOW; + + pages = (umem_odp->interval_tree.last - + umem_odp->interval_tree.start) >> + umem_odp->page_shift; + if (!pages) + return -EINVAL; + + /* + * Note that the representation of the intervals in the + * interval tree considers the ending point as contained in + * the interval. + */ + umem_odp->interval_tree.last--; + + umem_odp->page_list = kvcalloc( + pages, sizeof(*umem_odp->page_list), GFP_KERNEL); + if (!umem_odp->page_list) + return -ENOMEM; + + umem_odp->dma_list = kvcalloc( + pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); + if (!umem_odp->dma_list) { + ret = -ENOMEM; + goto out_page_list; + } } - per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); - if (IS_ERR(per_mm)) { - mutex_unlock(&ctx->per_mm_list_lock); - return PTR_ERR(per_mm); + mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); + if (IS_ERR(mn)) { + ret = PTR_ERR(mn); + goto out_dma_list; } + umem_odp->per_mm = per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); -found: - umem_odp->per_mm = per_mm; - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); + mutex_init(&umem_odp->umem_mutex); + init_completion(&umem_odp->notifier_completion); + + if (!umem_odp->is_implicit_odp) { + down_write(&per_mm->umem_rwsem); + interval_tree_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + up_write(&per_mm->umem_rwsem); + } + mmgrab(umem_odp->umem.owning_mm); return 0; -} -static void free_per_mm(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +out_dma_list: + kvfree(umem_odp->dma_list); +out_page_list: + kvfree(umem_odp->page_list); + return ret; } -static void put_per_mm(struct ib_umem_odp *umem_odp) +/** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem + * + * Implicit ODP umems do not have a VA range and do not have any page lists. + * They exist only to hold the per_mm reference to help the driver create + * children umems. + * + * @udata: udata from the syscall being used to create the umem + * @access: ib_reg_mr access flags + */ +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, + int access) { - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_ucontext *ctx = umem_odp->umem.context; - bool need_free; - - mutex_lock(&ctx->per_mm_list_lock); - umem_odp->per_mm = NULL; - per_mm->odp_mrs_count--; - need_free = per_mm->odp_mrs_count == 0; - if (need_free) - list_del(&per_mm->ucontext_list); - mutex_unlock(&ctx->per_mm_list_lock); - - if (!need_free) - return; + struct ib_ucontext *context = + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + struct ib_umem *umem; + struct ib_umem_odp *umem_odp; + int ret; - /* - * NOTE! mmu_notifier_unregister() can happen between a start/end - * callback, resulting in an start/end, and thus an unbalanced - * lock. This doesn't really matter to us since we are about to kfree - * the memory that holds the lock, however LOCKDEP doesn't like this. - */ - down_write(&per_mm->umem_rwsem); - per_mm->active = false; - up_write(&per_mm->umem_rwsem); + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); - WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); - mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); - put_pid(per_mm->tgid); - mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); + if (!context) + return ERR_PTR(-EIO); + if (WARN_ON_ONCE(!context->device->ops.invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + umem = &umem_odp->umem; + umem->ibdev = context->device; + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; + + ret = ib_init_umem_odp(umem_odp); + if (ret) { + kfree(umem_odp); + return ERR_PTR(ret); + } + return umem_odp; } +EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, - unsigned long addr, size_t size) +/** + * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit + * parent ODP umem + * + * @root: The parent umem enclosing the child. This must be allocated using + * ib_alloc_implicit_odp_umem() + * @addr: The starting userspace VA + * @size: The length of the userspace VA + */ +struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, + unsigned long addr, size_t size) { - struct ib_ucontext_per_mm *per_mm = root->per_mm; - struct ib_ucontext *ctx = per_mm->context; + /* + * Caller must ensure that root cannot be freed during the call to + * ib_alloc_odp_umem. + */ struct ib_umem_odp *odp_data; struct ib_umem *umem; - int pages = size >> PAGE_SHIFT; int ret; + if (WARN_ON(!root->is_implicit_odp)) + return ERR_PTR(-EINVAL); + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = ctx; + umem->ibdev = root->umem.ibdev; umem->length = size; umem->address = addr; - odp_data->page_shift = PAGE_SHIFT; umem->writable = root->umem.writable; - umem->is_odp = 1; - odp_data->per_mm = per_mm; - umem->owning_mm = per_mm->mm; - mmgrab(umem->owning_mm); - - mutex_init(&odp_data->umem_mutex); - init_completion(&odp_data->notifier_completion); - - odp_data->page_list = - vzalloc(array_size(pages, sizeof(*odp_data->page_list))); - if (!odp_data->page_list) { - ret = -ENOMEM; - goto out_odp_data; - } + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; - odp_data->dma_list = - vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); - if (!odp_data->dma_list) { - ret = -ENOMEM; - goto out_page_list; + ret = ib_init_umem_odp(odp_data); + if (ret) { + kfree(odp_data); + return ERR_PTR(ret); } - - /* - * Caller must ensure that the umem_odp that the per_mm came from - * cannot be freed during the call to ib_alloc_odp_umem. - */ - mutex_lock(&ctx->per_mm_list_lock); - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - add_umem_to_per_mm(odp_data); - return odp_data; - -out_page_list: - vfree(odp_data->page_list); -out_odp_data: - mmdrop(umem->owning_mm); - kfree(odp_data); - return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_odp_umem); +EXPORT_SYMBOL(ib_umem_odp_alloc_child); -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +/** + * ib_umem_odp_get - Create a umem_odp for a userspace va + * + * @udata: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * + * The driver should use when the access flags indicate ODP memory. It avoids + * pinning, instead, stores the mm for future page fault handling in + * conjunction with MMU notifiers. + */ +struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, + size_t size, int access) { - struct ib_umem *umem = &umem_odp->umem; - /* - * NOTE: This must called in a process context where umem->owning_mm - * == current->mm - */ - struct mm_struct *mm = umem->owning_mm; - int ret_val; + struct ib_umem_odp *umem_odp; + struct ib_ucontext *context; + struct mm_struct *mm; + int ret; + + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); + + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || + WARN_ON_ONCE(!context->device->ops.invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + + umem_odp->umem.ibdev = context->device; + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = mm = current->mm; umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { @@ -410,63 +421,63 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) vma = find_vma(mm, ib_umem_start(umem_odp)); if (!vma || !is_vm_hugetlb_page(vma)) { up_read(&mm->mmap_sem); - return -EINVAL; + ret = -EINVAL; + goto err_free; } h = hstate_vma(vma); umem_odp->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); } - mutex_init(&umem_odp->umem_mutex); - - init_completion(&umem_odp->notifier_completion); - - if (ib_umem_odp_num_pages(umem_odp)) { - umem_odp->page_list = - vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->page_list) - return -ENOMEM; - - umem_odp->dma_list = - vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->dma_list) { - ret_val = -ENOMEM; - goto out_page_list; - } - } - - ret_val = get_per_mm(umem_odp); - if (ret_val) - goto out_dma_list; - add_umem_to_per_mm(umem_odp); - - return 0; + ret = ib_init_umem_odp(umem_odp); + if (ret) + goto err_free; + return umem_odp; -out_dma_list: - vfree(umem_odp->dma_list); -out_page_list: - vfree(umem_odp->page_list); - return ret_val; +err_free: + kfree(umem_odp); + return ERR_PTR(ret); } +EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + if (!umem_odp->is_implicit_odp) { + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + kvfree(umem_odp->dma_list); + kvfree(umem_odp->page_list); + } - remove_umem_from_per_mm(umem_odp); - put_per_mm(umem_odp); - vfree(umem_odp->dma_list); - vfree(umem_odp->page_list); + down_write(&per_mm->umem_rwsem); + if (!umem_odp->is_implicit_odp) { + interval_tree_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + } + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in a missing end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + * Thus we call the mmu_notifier_put under the rwsem and test the + * internal users count to reliably see if we are past this point. + */ + mmu_notifier_put(&per_mm->mn); + up_write(&per_mm->umem_rwsem); + + mmdrop(umem_odp->umem.owning_mm); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /* * Map for DMA and insert a single page into the on-demand paging page tables. @@ -493,8 +504,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_ucontext *context = umem_odp->umem.context; - struct ib_device *dev = context->device; + struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t dma_addr; int remove_existing_mapping = 0; int ret = 0; @@ -534,7 +544,7 @@ out: if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); - context->invalidate_range( + dev->ops.invalidate_range( umem_odp, ib_umem_start(umem_odp) + (page_index << umem_odp->page_shift), @@ -707,7 +717,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, { int idx; u64 addr; - struct ib_device *dev = umem_odp->umem.context->device; + struct ib_device *dev = umem_odp->umem.ibdev; virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); @@ -761,35 +771,21 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, void *cookie) { int ret_val = 0; - struct umem_odp_node *node, *next; + struct interval_tree_node *node, *next; struct ib_umem_odp *umem; if (unlikely(start == last)) return ret_val; - for (node = rbt_ib_umem_iter_first(root, start, last - 1); + for (node = interval_tree_iter_first(root, start, last - 1); node; node = next) { /* TODO move the blockable decision up to the callback */ if (!blockable) return -EAGAIN; - next = rbt_ib_umem_iter_next(node, start, last - 1); + next = interval_tree_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; } -EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); - -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, - u64 addr, u64 length) -{ - struct umem_odp_node *node; - - node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); - if (node) - return container_of(node, struct ib_umem_odp, interval_tree); - return NULL; - -} -EXPORT_SYMBOL(rbt_ib_umem_lookup); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ddd0e5bc6b3..7c10dfe417a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; - mutex_init(&ucontext->per_mm_list_lock); - INIT_LIST_HEAD(&ucontext->per_mm_list); - ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; @@ -275,8 +272,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); if (ret) goto err_file; - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; rdma_restrack_uadd(&ucontext->res); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 11c13c1381cf..e369ac0d6f51 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1487,6 +1487,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + mmu_notifier_synchronize(); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4e9f1507ffd9..bface798ee59 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1867,10 +1867,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, if (err) goto out_sys_pages; - if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) - context->ibucontext.invalidate_range = - &mlx5_ib_invalidate_range; - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { err = mlx5_ib_devx_create(dev, true); if (err < 0) @@ -1999,11 +1995,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; - /* All umem's must be destroyed before destroying the ucontext. */ - mutex_lock(&ibcontext->per_mm_list_lock); - WARN_ON(!list_empty(&ibcontext->per_mm_list)); - mutex_unlock(&ibcontext->per_mm_list_lock); - bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index a40e0abf2338..b5aece786b36 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -56,19 +56,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, struct scatterlist *sg; int entry; - if (umem->is_odp) { - struct ib_umem_odp *odp = to_ib_umem_odp(umem); - unsigned int page_shift = odp->page_shift; - - *ncont = ib_umem_odp_num_pages(odp); - *count = *ncont << (page_shift - PAGE_SHIFT); - *shift = page_shift; - if (order) - *order = ilog2(roundup_pow_of_two(*ncont)); - - return; - } - addr = addr >> PAGE_SHIFT; tmp = (unsigned long)addr; m = find_first_bit(&tmp, BITS_PER_LONG); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3401f5f6792e..1eff031ef048 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -784,19 +784,37 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, int *ncont, int *order) { struct ib_umem *u; - int err; *umem = NULL; - u = ib_umem_get(udata, start, length, access_flags, 0); - err = PTR_ERR_OR_ZERO(u); - if (err) { - mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); - return err; + if (access_flags & IB_ACCESS_ON_DEMAND) { + struct ib_umem_odp *odp; + + odp = ib_umem_odp_get(udata, start, length, access_flags); + if (IS_ERR(odp)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", + PTR_ERR(odp)); + return PTR_ERR(odp); + } + + u = &odp->umem; + + *page_shift = odp->page_shift; + *ncont = ib_umem_odp_num_pages(odp); + *npages = *ncont << (*page_shift - PAGE_SHIFT); + if (order) + *order = ilog2(roundup_pow_of_two(*ncont)); + } else { + u = ib_umem_get(udata, start, length, access_flags, 0); + if (IS_ERR(u)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); + return PTR_ERR(u); + } + + mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, + page_shift, ncont, order); } - mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, - page_shift, ncont, order); if (!*npages) { mlx5_ib_warn(dev, "avoid zero region\n"); ib_umem_release(u); @@ -1599,7 +1617,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem_odp->page_list) + if (!umem_odp->is_implicit_odp) mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); @@ -1610,7 +1628,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) * so that there will not be any invalidations in * flight, looking at the *mr struct. */ - ib_umem_release(umem); + ib_umem_odp_release(umem_odp); atomic_sub(npages, &dev->mdev->priv.reg_pages); /* Avoid double-freeing the umem. */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 0a59912a4cef..dd26e7acb37e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem.address == va) { + if (odp && ib_umem_start(odp) == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work) mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(&odp->umem); + ib_umem_odp_release(odp); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -386,7 +386,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, } static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, bool ksm, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -404,7 +404,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, mr->dev = dev; mr->access_flags = access_flags; mr->mmkey.iova = 0; - mr->umem = umem; + mr->umem = &umem_odp->umem; if (ksm) { err = mlx5_ib_update_xlt(mr, 0, @@ -464,18 +464,17 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr, addr, - MLX5_IMR_MTT_SIZE); + odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, + mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&odp_mr->umem_mutex); - ib_umem_release(&odp->umem); + ib_umem_odp_release(odp); return ERR_CAST(mtt); } @@ -497,7 +496,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem.address != addr) + if (odp && ib_umem_start(odp) != addr) odp = NULL; goto next_mr; } @@ -521,19 +520,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { struct mlx5_ib_mr *imr; - struct ib_umem *umem; + struct ib_umem_odp *umem_odp; - umem = ib_umem_get(udata, 0, 0, access_flags, 0); - if (IS_ERR(umem)) - return ERR_CAST(umem); + umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); - imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); if (IS_ERR(imr)) { - ib_umem_release(umem); + ib_umem_odp_release(umem_odp); return ERR_CAST(imr); } - imr->umem = umem; + imr->umem = &umem_odp->umem; init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); @@ -541,34 +540,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, - void *cookie) +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - - if (mr->parent != imr) - return 0; - - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); + struct rb_node *node; - if (umem_odp->dying) - return 0; + down_read(&per_mm->umem_rwsem); + for (node = rb_first_cached(&per_mm->umem_tree); node; + node = rb_next(node)) { + struct ib_umem_odp *umem_odp = + rb_entry(node, struct ib_umem_odp, interval_tree.rb); + struct mlx5_ib_mr *mr = umem_odp->private; - WRITE_ONCE(umem_odp->dying, 1); - atomic_inc(&imr->num_leaf_free); - schedule_work(&umem_odp->work); + if (mr->parent != imr) + continue; - return 0; -} + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); + if (umem_odp->dying) + continue; - down_read(&per_mm->umem_rwsem); - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, - mr_leaf_free, true, imr); + WRITE_ONCE(umem_odp->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem_odp->work); + } up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); @@ -589,7 +585,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, struct ib_umem_odp *odp; size_t size; - if (!odp_mr->page_list) { + if (odp_mr->is_implicit_odp) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -607,7 +603,7 @@ next_mr: start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; - if (prefetch && !downgrade && !mr->umem->writable) { + if (prefetch && !downgrade && !odp->umem.writable) { /* prefetch with write-access must * be supported by the MR */ @@ -615,7 +611,7 @@ next_mr: goto out; } - if (mr->umem->writable && !downgrade) + if (odp->umem.writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; current_seq = READ_ONCE(odp->notifiers_seq); @@ -625,8 +621,8 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, - access_mask, current_seq); + ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, + current_seq); if (ret < 0) goto out; @@ -634,8 +630,7 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), - current_seq)) { + if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already @@ -668,7 +663,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem.address != io_virt)) { + if (unlikely(!next || ib_umem_start(next) != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", io_virt, next); return -EAGAIN; @@ -1618,6 +1613,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) static const struct ib_device_ops mlx5_ib_dev_odp_ops = { .advise_mr = mlx5_ib_advise_mr, + .invalidate_range = mlx5_ib_invalidate_range, }; int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) |