summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-30 10:33:14 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-30 10:33:14 -0800
commitaa32f1169148beb90d71494e2f2a1999ba7b5366 (patch)
treeec8c434bff07bf0beb2df08629089824927f62f9 /drivers
parentd5bb349dbbe27537e90a03b9597deeb07723a86d (diff)
parent93f4e735b6d98ee4b7a1252d81e815a983e359f2 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe: "This is another round of bug fixing and cleanup. This time the focus is on the driver pattern to use mmu notifiers to monitor a VA range. This code is lifted out of many drivers and hmm_mirror directly into the mmu_notifier core and written using the best ideas from all the driver implementations. This removes many bugs from the drivers and has a very pleasing diffstat. More drivers can still be converted, but that is for another cycle. - A shared branch with RDMA reworking the RDMA ODP implementation - New mmu_interval_notifier API. This is focused on the use case of monitoring a VA and simplifies the process for drivers - A common seq-count locking scheme built into the mmu_interval_notifier API usable by drivers that call get_user_pages() or hmm_range_fault() with the VA range - Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen GntDev drivers to the new API. This deletes a lot of wonky driver code. - Two improvements for hmm_range_fault(), from testing done by Ralph" * tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap mm/hmm: make full use of walk_page_range() xen/gntdev: use mmu_interval_notifier_insert mm/hmm: remove hmm_mirror and related drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror drm/amdgpu: Call find_vma under mmap_sem nouveau: use mmu_interval_notifier instead of hmm_mirror nouveau: use mmu_notifier directly for invalidate_range_start drm/radeon: use mmu_interval_notifier_insert RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv RDMA/odp: Use mmu_interval_notifier_insert() mm/hmm: define the pre-processor related parts of hmm.h even if disabled mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror mm/mmu_notifier: add an interval tree notifier mm/mmu_notifier: define the header pre-processor parts even if disabled mm/hmm: allow snapshot of the special zero page
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c444
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h53
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_object.h13
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c145
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_svm.c230
-rw-r--r--drivers/gpu/drm/radeon/radeon.h9
-rw-r--r--drivers/gpu/drm/radeon/radeon_mn.c218
-rw-r--r--drivers/infiniband/core/device.c1
-rw-r--r--drivers/infiniband/core/umem_odp.c303
-rw-r--r--drivers/infiniband/hw/hfi1/file_ops.c2
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h2
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.c146
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h3
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h7
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c3
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c50
-rw-r--r--drivers/xen/gntdev-common.h8
-rw-r--r--drivers/xen/gntdev.c179
22 files changed, 537 insertions, 1305 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bcc5d40a8d5f..0c229a92a24b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -967,6 +967,8 @@ struct amdgpu_device {
struct mutex lock_reset;
struct amdgpu_doorbell_index doorbell_index;
+ struct mutex notifier_lock;
+
int asic_reset_res;
struct work_struct xgmi_reset_work;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ae6f5446262c..7d35b5b66229 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -505,8 +505,7 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
*
* Returns 0 for success, negative errno for errors.
*/
-static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
- uint64_t user_addr)
+static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
{
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
@@ -1199,7 +1198,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
if (user_addr) {
- ret = init_user_pages(*mem, current->mm, user_addr);
+ ret = init_user_pages(*mem, user_addr);
if (ret)
goto allocate_init_user_pages_failed;
}
@@ -1744,6 +1743,10 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
return ret;
}
+ /*
+ * FIXME: Cannot ignore the return code, must hold
+ * notifier_lock
+ */
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
/* Mark the BO as valid unless it was invalidated
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a169ff16277f..5ca905b4a0fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -538,8 +538,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->tv.num_shared = 2;
amdgpu_bo_list_get_list(p->bo_list, &p->validated);
- if (p->bo_list->first_userptr != p->bo_list->num_entries)
- p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX);
INIT_LIST_HEAD(&duplicates);
amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
@@ -1219,11 +1217,11 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
if (r)
goto error_unlock;
- /* No memory allocation is allowed while holding the mn lock.
- * p->mn is hold until amdgpu_cs_submit is finished and fence is added
- * to BOs.
+ /* No memory allocation is allowed while holding the notifier lock.
+ * The lock is held until amdgpu_cs_submit is finished and fence is
+ * added to BOs.
*/
- amdgpu_mn_lock(p->mn);
+ mutex_lock(&p->adev->notifier_lock);
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
* -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
@@ -1266,13 +1264,13 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
- amdgpu_mn_unlock(p->mn);
+ mutex_unlock(&p->adev->notifier_lock);
return 0;
error_abort:
drm_sched_job_cleanup(&job->base);
- amdgpu_mn_unlock(p->mn);
+ mutex_unlock(&p->adev->notifier_lock);
error_unlock:
amdgpu_job_free(job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4f76beafb2fa..c17505fba988 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2794,6 +2794,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
+ mutex_init(&adev->notifier_lock);
mutex_init(&adev->virt.dpm_mutex);
mutex_init(&adev->psp.mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 392300f77b13..828b5167ff12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -51,439 +51,107 @@
#include "amdgpu_amdkfd.h"
/**
- * struct amdgpu_mn_node
+ * amdgpu_mn_invalidate_gfx - callback to notify about mm change
*
- * @it: interval node defining start-last of the affected address range
- * @bos: list of all BOs in the affected address range
- *
- * Manages all BOs which are affected of a certain range of address space.
- */
-struct amdgpu_mn_node {
- struct interval_tree_node it;
- struct list_head bos;
-};
-
-/**
- * amdgpu_mn_destroy - destroy the HMM mirror
- *
- * @work: previously sheduled work item
- *
- * Lazy destroys the notifier from a work item
- */
-static void amdgpu_mn_destroy(struct work_struct *work)
-{
- struct amdgpu_mn *amn = container_of(work, struct amdgpu_mn, work);
- struct amdgpu_device *adev = amn->adev;
- struct amdgpu_mn_node *node, *next_node;
- struct amdgpu_bo *bo, *next_bo;
-
- mutex_lock(&adev->mn_lock);
- down_write(&amn->lock);
- hash_del(&amn->node);
- rbtree_postorder_for_each_entry_safe(node, next_node,
- &amn->objects.rb_root, it.rb) {
- list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
- bo->mn = NULL;
- list_del_init(&bo->mn_list);
- }
- kfree(node);
- }
- up_write(&amn->lock);
- mutex_unlock(&adev->mn_lock);
-
- hmm_mirror_unregister(&amn->mirror);
- kfree(amn);
-}
-
-/**
- * amdgpu_hmm_mirror_release - callback to notify about mm destruction
- *
- * @mirror: the HMM mirror (mm) this callback is about
- *
- * Shedule a work item to lazy destroy HMM mirror.
- */
-static void amdgpu_hmm_mirror_release(struct hmm_mirror *mirror)
-{
- struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
-
- INIT_WORK(&amn->work, amdgpu_mn_destroy);
- schedule_work(&amn->work);
-}
-
-/**
- * amdgpu_mn_lock - take the write side lock for this notifier
- *
- * @mn: our notifier
- */
-void amdgpu_mn_lock(struct amdgpu_mn *mn)
-{
- if (mn)
- down_write(&mn->lock);
-}
-
-/**
- * amdgpu_mn_unlock - drop the write side lock for this notifier
- *
- * @mn: our notifier
- */
-void amdgpu_mn_unlock(struct amdgpu_mn *mn)
-{
- if (mn)
- up_write(&mn->lock);
-}
-
-/**
- * amdgpu_mn_read_lock - take the read side lock for this notifier
- *
- * @amn: our notifier
- * @blockable: is the notifier blockable
- */
-static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
-{
- if (blockable)
- down_read(&amn->lock);
- else if (!down_read_trylock(&amn->lock))
- return -EAGAIN;
-
- return 0;
-}
-
-/**
- * amdgpu_mn_read_unlock - drop the read side lock for this notifier
- *
- * @amn: our notifier
- */
-static void amdgpu_mn_read_unlock(struct amdgpu_mn *amn)
-{
- up_read(&amn->lock);
-}
-
-/**
- * amdgpu_mn_invalidate_node - unmap all BOs of a node
- *
- * @node: the node with the BOs to unmap
- * @start: start of address range affected
- * @end: end of address range affected
+ * @mni: the range (mm) is about to update
+ * @range: details on the invalidation
+ * @cur_seq: Value to pass to mmu_interval_set_seq()
*
* Block for operations on BOs to finish and mark pages as accessed and
* potentially dirty.
*/
-static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
- unsigned long start,
- unsigned long end)
+static bool amdgpu_mn_invalidate_gfx(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
- struct amdgpu_bo *bo;
+ struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
+ struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
long r;
- list_for_each_entry(bo, &node->bos, mn_list) {
-
- if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
- continue;
-
- r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
- true, false, MAX_SCHEDULE_TIMEOUT);
- if (r <= 0)
- DRM_ERROR("(%ld) failed to wait for user bo\n", r);
- }
-}
-
-/**
- * amdgpu_mn_sync_pagetables_gfx - callback to notify about mm change
- *
- * @mirror: the hmm_mirror (mm) is about to update
- * @update: the update start, end address
- *
- * Block for operations on BOs to finish and mark pages as accessed and
- * potentially dirty.
- */
-static int
-amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
- const struct mmu_notifier_range *update)
-{
- struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
- unsigned long start = update->start;
- unsigned long end = update->end;
- bool blockable = mmu_notifier_range_blockable(update);
- struct interval_tree_node *it;
+ if (!mmu_notifier_range_blockable(range))
+ return false;
- /* notification is exclusive, but interval is inclusive */
- end -= 1;
+ mutex_lock(&adev->notifier_lock);
- /* TODO we should be able to split locking for interval tree and
- * amdgpu_mn_invalidate_node
- */
- if (amdgpu_mn_read_lock(amn, blockable))
- return -EAGAIN;
+ mmu_interval_set_seq(mni, cur_seq);
- it = interval_tree_iter_first(&amn->objects, start, end);
- while (it) {
- struct amdgpu_mn_node *node;
-
- if (!blockable) {
- amdgpu_mn_read_unlock(amn);
- return -EAGAIN;
- }
-
- node = container_of(it, struct amdgpu_mn_node, it);
- it = interval_tree_iter_next(it, start, end);
-
- amdgpu_mn_invalidate_node(node, start, end);
- }
-
- amdgpu_mn_read_unlock(amn);
-
- return 0;
-}
-
-/**
- * amdgpu_mn_sync_pagetables_hsa - callback to notify about mm change
- *
- * @mirror: the hmm_mirror (mm) is about to update
- * @update: the update start, end address
- *
- * We temporarily evict all BOs between start and end. This
- * necessitates evicting all user-mode queues of the process. The BOs
- * are restorted in amdgpu_mn_invalidate_range_end_hsa.
- */
-static int
-amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
- const struct mmu_notifier_range *update)
-{
- struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
- unsigned long start = update->start;
- unsigned long end = update->end;
- bool blockable = mmu_notifier_range_blockable(update);
- struct interval_tree_node *it;
-
- /* notification is exclusive, but interval is inclusive */
- end -= 1;
-
- if (amdgpu_mn_read_lock(amn, blockable))
- return -EAGAIN;
-
- it = interval_tree_iter_first(&amn->objects, start, end);
- while (it) {
- struct amdgpu_mn_node *node;
- struct amdgpu_bo *bo;
-
- if (!blockable) {
- amdgpu_mn_read_unlock(amn);
- return -EAGAIN;
- }
-
- node = container_of(it, struct amdgpu_mn_node, it);
- it = interval_tree_iter_next(it, start, end);
-
- list_for_each_entry(bo, &node->bos, mn_list) {
- struct kgd_mem *mem = bo->kfd_bo;
-
- if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
- start, end))
- amdgpu_amdkfd_evict_userptr(mem, amn->mm);
- }
- }
-
- amdgpu_mn_read_unlock(amn);
-
- return 0;
+ r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
+ MAX_SCHEDULE_TIMEOUT);
+ mutex_unlock(&adev->notifier_lock);
+ if (r <= 0)
+ DRM_ERROR("(%ld) failed to wait for user bo\n", r);
+ return true;
}
-/* Low bits of any reasonable mm pointer will be unused due to struct
- * alignment. Use these bits to make a unique key from the mm pointer
- * and notifier type.
- */
-#define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type))
-
-static struct hmm_mirror_ops amdgpu_hmm_mirror_ops[] = {
- [AMDGPU_MN_TYPE_GFX] = {
- .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_gfx,
- .release = amdgpu_hmm_mirror_release
- },
- [AMDGPU_MN_TYPE_HSA] = {
- .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_hsa,
- .release = amdgpu_hmm_mirror_release
- },
+static const struct mmu_interval_notifier_ops amdgpu_mn_gfx_ops = {
+ .invalidate = amdgpu_mn_invalidate_gfx,
};
/**
- * amdgpu_mn_get - create HMM mirror context
+ * amdgpu_mn_invalidate_hsa - callback to notify about mm change
*
- * @adev: amdgpu device pointer
- * @type: type of MMU notifier context
+ * @mni: the range (mm) is about to update
+ * @range: details on the invalidation
+ * @cur_seq: Value to pass to mmu_interval_set_seq()
*
- * Creates a HMM mirror context for current->mm.
+ * We temporarily evict the BO attached to this range. This necessitates
+ * evicting all user-mode queues of the process.
*/
-struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
- enum amdgpu_mn_type type)
+static bool amdgpu_mn_invalidate_hsa(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
- struct mm_struct *mm = current->mm;
- struct amdgpu_mn *amn;
- unsigned long key = AMDGPU_MN_KEY(mm, type);
- int r;
-
- mutex_lock(&adev->mn_lock);
- if (down_write_killable(&mm->mmap_sem)) {
- mutex_unlock(&adev->mn_lock);
- return ERR_PTR(-EINTR);
- }
-
- hash_for_each_possible(adev->mn_hash, amn, node, key)
- if (AMDGPU_MN_KEY(amn->mm, amn->type) == key)
- goto release_locks;
-
- amn = kzalloc(sizeof(*amn), GFP_KERNEL);
- if (!amn) {
- amn = ERR_PTR(-ENOMEM);
- goto release_locks;
- }
-
- amn->adev = adev;
- amn->mm = mm;
- init_rwsem(&amn->lock);
- amn->type = type;
- amn->objects = RB_ROOT_CACHED;
-
- amn->mirror.ops = &amdgpu_hmm_mirror_ops[type];
- r = hmm_mirror_register(&amn->mirror, mm);
- if (r)
- goto free_amn;
+ struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
+ struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- hash_add(adev->mn_hash, &amn->node, AMDGPU_MN_KEY(mm, type));
+ if (!mmu_notifier_range_blockable(range))
+ return false;
-release_locks:
- up_write(&mm->mmap_sem);
- mutex_unlock(&adev->mn_lock);
+ mutex_lock(&adev->notifier_lock);
- return amn;
+ mmu_interval_set_seq(mni, cur_seq);
-free_amn:
- up_write(&mm->mmap_sem);
- mutex_unlock(&adev->mn_lock);
- kfree(amn);
+ amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm);
+ mutex_unlock(&adev->notifier_lock);
- return ERR_PTR(r);
+ return true;
}
+static const struct mmu_interval_notifier_ops amdgpu_mn_hsa_ops = {
+ .invalidate = amdgpu_mn_invalidate_hsa,
+};
+
/**
* amdgpu_mn_register - register a BO for notifier updates
*
* @bo: amdgpu buffer object
* @addr: userptr addr we should monitor
*
- * Registers an HMM mirror for the given BO at the specified address.
+ * Registers a mmu_notifier for the given BO at the specified address.
* Returns 0 on success, -ERRNO if anything goes wrong.
*/
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
{
- unsigned long end = addr + amdgpu_bo_size(bo) - 1;
- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- enum amdgpu_mn_type type =
- bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX;
- struct amdgpu_mn *amn;
- struct amdgpu_mn_node *node = NULL, *new_node;
- struct list_head bos;
- struct interval_tree_node *it;
-
- amn = amdgpu_mn_get(adev, type);
- if (IS_ERR(amn))
- return PTR_ERR(amn);
-
- new_node = kmalloc(sizeof(*new_node), GFP_KERNEL);
- if (!new_node)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&bos);
-
- down_write(&amn->lock);
-
- while ((it = interval_tree_iter_first(&amn->objects, addr, end))) {
- kfree(node);
- node = container_of(it, struct amdgpu_mn_node, it);
- interval_tree_remove(&node->it, &amn->objects);
- addr = min(it->start, addr);
- end = max(it->last, end);
- list_splice(&node->bos, &bos);
- }
-
- if (!node)
- node = new_node;
- else
- kfree(new_node);
-
- bo->mn = amn;
-
- node->it.start = addr;
- node->it.last = end;
- INIT_LIST_HEAD(&node->bos);
- list_splice(&bos, &node->bos);
- list_add(&bo->mn_list, &node->bos);
-
- interval_tree_insert(&node->it, &amn->objects);
-
- up_write(&amn->lock);
-
- return 0;
+ if (bo->kfd_bo)
+ return mmu_interval_notifier_insert(&bo->notifier, current->mm,
+ addr, amdgpu_bo_size(bo),
+ &amdgpu_mn_hsa_ops);
+ return mmu_interval_notifier_insert(&bo->notifier, current->mm, addr,
+ amdgpu_bo_size(bo),
+ &amdgpu_mn_gfx_ops);
}
/**
- * amdgpu_mn_unregister - unregister a BO for HMM mirror updates
+ * amdgpu_mn_unregister - unregister a BO for notifier updates
*
* @bo: amdgpu buffer object
*
- * Remove any registration of HMM mirror updates from the buffer object.
+ * Remove any registration of mmu notifier updates from the buffer object.
*/
void amdgpu_mn_unregister(struct amdgpu_bo *bo)
{
- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- struct amdgpu_mn *amn;
- struct list_head *head;
-
- mutex_lock(&adev->mn_lock);
-
- amn = bo->mn;
- if (amn == NULL) {
- mutex_unlock(&adev->mn_lock);
+ if (!bo->notifier.mm)
return;
- }
-
- down_write(&amn->lock);
-
- /* save the next list entry for later */
- head = bo->mn_list.next;
-
- bo->mn = NULL;
- list_del_init(&bo->mn_list);
-
- if (list_empty(head)) {
- struct amdgpu_mn_node *node;
-
- node = container_of(head, struct amdgpu_mn_node, bos);
- interval_tree_remove(&node->it, &amn->objects);
- kfree(node);
- }
-
- up_write(&amn->lock);
- mutex_unlock(&adev->mn_lock);
-}
-
-/* flags used by HMM internal, not related to CPU/GPU PTE flags */
-static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
- (1 << 0), /* HMM_PFN_VALID */
- (1 << 1), /* HMM_PFN_WRITE */
- 0 /* HMM_PFN_DEVICE_PRIVATE */
-};
-
-static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
- 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
- 0, /* HMM_PFN_NONE */
- 0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
-};
-
-void amdgpu_hmm_init_range(struct hmm_range *range)
-{
- if (range) {
- range->flags = hmm_range_flags;
- range->values = hmm_range_values;
- range->pfn_shift = PAGE_SHIFT;
- }
+ mmu_interval_notifier_remove(&bo->notifier);
+ bo->notifier.mm = NULL;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
index b8ed68943625..a292238f75eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
@@ -30,63 +30,10 @@
#include <linux/workqueue.h>
#include <linux/interval_tree.h>
-enum amdgpu_mn_type {
- AMDGPU_MN_TYPE_GFX,
- AMDGPU_MN_TYPE_HSA,
-};
-
-/**
- * struct amdgpu_mn
- *
- * @adev: amdgpu device pointer
- * @mm: process address space
- * @type: type of MMU notifier
- * @work: destruction work item
- * @node: hash table node to find structure by adev and mn
- * @lock: rw semaphore protecting the notifier nodes
- * @objects: interval tree containing amdgpu_mn_nodes
- * @mirror: HMM mirror function support
- *
- * Data for each amdgpu device and process address space.
- */
-struct amdgpu_mn {
- /* constant after initialisation */
- struct amdgpu_device *adev;
- struct mm_struct *mm;
- enum amdgpu_mn_type type;
-
- /* only used on destruction */
- struct work_struct work;
-
- /* protected by adev->mn_lock */
- struct hlist_node node;
-
- /* objects protected by lock */
- struct rw_semaphore lock;
- struct rb_root_cached objects;
-
-#ifdef CONFIG_HMM_MIRROR
- /* HMM mirror */
- struct hmm_mirror mirror;
-#endif
-};
-
#if defined(CONFIG_HMM_MIRROR)
-void amdgpu_mn_lock(struct amdgpu_mn *mn);
-void amdgpu_mn_unlock(struct amdgpu_mn *mn);
-struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
- enum amdgpu_mn_type type);
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
void amdgpu_mn_unregister(struct amdgpu_bo *bo);
-void amdgpu_hmm_init_range(struct hmm_range *range);
#else
-static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {}
-static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {}
-static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
- enum amdgpu_mn_type type)
-{
- return NULL;
-}
static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
{
DRM_WARN_ONCE("HMM_MIRROR kernel config option is not enabled, "
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 7e99f6c58c48..36dec51d1ef1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -30,6 +30,9 @@
#include <drm/amdgpu_drm.h>
#include "amdgpu.h"
+#ifdef CONFIG_MMU_NOTIFIER
+#include <linux/mmu_notifier.h>
+#endif
#define AMDGPU_BO_INVALID_OFFSET LONG_MAX
#define AMDGPU_BO_MAX_PLACEMENTS 3
@@ -101,10 +104,12 @@ struct amdgpu_bo {
struct ttm_bo_kmap_obj dma_buf_vmap;
struct amdgpu_mn *mn;
- union {
- struct list_head mn_list;
- struct list_head shadow_list;
- };
+
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_interval_notifier notifier;
+#endif
+
+ struct list_head shadow_list;
struct kgd_mem *kfd_bo;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 61d9b7774d42..2616e2eafdeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -35,6 +35,7 @@
#include <linux/hmm.h>
#include <linux/pagemap.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/swap.h>
@@ -769,6 +770,20 @@ struct amdgpu_ttm_tt {
#endif
};
+#ifdef CONFIG_DRM_AMDGPU_USERPTR
+/* flags used by HMM internal, not related to CPU/GPU PTE flags */
+static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
+ (1 << 0), /* HMM_PFN_VALID */
+ (1 << 1), /* HMM_PFN_WRITE */
+ 0 /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
+ 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
+ 0, /* HMM_PFN_NONE */
+ 0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
+};
+
/**
* amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
* memory and start HMM tracking CPU page table update
@@ -776,85 +791,89 @@ struct amdgpu_ttm_tt {
* Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
* once afterwards to stop HMM tracking
*/
-#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-
-#define MAX_RETRY_HMM_RANGE_FAULT 16
-
int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
{
- struct hmm_mirror *mirror = bo->mn ? &bo->mn->mirror : NULL;
struct ttm_tt *ttm = bo->tbo.ttm;
struct amdgpu_ttm_tt *gtt = (void *)ttm;
- struct mm_struct *mm = gtt->usertask->mm;
unsigned long start = gtt->userptr;
struct vm_area_struct *vma;
struct hmm_range *range;
+ unsigned long timeout;
+ struct mm_struct *mm;
unsigned long i;
- uint64_t *pfns;
int r = 0;
- if (!mm) /* Happens during process shutdown */
- return -ESRCH;
-
- if (unlikely(!mirror)) {
- DRM_DEBUG_DRIVER("Failed to get hmm_mirror\n");
- r = -EFAULT;
- goto out;
+ mm = bo->notifier.mm;
+ if (unlikely(!mm)) {
+ DRM_DEBUG_DRIVER("BO is not registered?\n");
+ return -EFAULT;
}
- vma = find_vma(mm, start);
- if (unlikely(!vma || start < vma->vm_start)) {
- r = -EFAULT;
- goto out;
- }
- if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
- vma->vm_file)) {
- r = -EPERM;
- goto out;
- }
+ /* Another get_user_pages is running at the same time?? */
+ if (WARN_ON(gtt->range))
+ return -EFAULT;
+
+ if (!mmget_not_zero(mm)) /* Happens during process shutdown */
+ return -ESRCH;
range = kzalloc(sizeof(*range), GFP_KERNEL);
if (unlikely(!range)) {
r = -ENOMEM;
goto out;
}
+ range->notifier = &bo->notifier;
+ range->flags = hmm_range_flags;
+ range->values = hmm_range_values;
+ range->pfn_shift = PAGE_SHIFT;
+ range->start = bo->notifier.interval_tree.start;
+ range->end = bo->notifier.interval_tree.last + 1;
+ range->default_flags = hmm_range_flags[HMM_PFN_VALID];
+ if (!amdgpu_ttm_tt_is_readonly(ttm))
+ range->default_flags |= range->flags[HMM_PFN_WRITE];
- pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
- if (unlikely(!pfns)) {
+ range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
+ GFP_KERNEL);
+ if (unlikely(!range->pfns)) {
r = -ENOMEM;
goto out_free_ranges;
}
- amdgpu_hmm_init_range(range);
- range->default_flags = range->flags[HMM_PFN_VALID];
- range->default_flags |= amdgpu_ttm_tt_is_readonly(ttm) ?
- 0 : range->flags[HMM_PFN_WRITE];
- range->pfn_flags_mask = 0;
- range->pfns = pfns;
- range->start = start;
- range->end = start + ttm->num_pages * PAGE_SIZE;
-
- hmm_range_register(range, mirror);
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (unlikely(!vma || start < vma->vm_start)) {
+ r = -EFAULT;
+ goto out_unlock;
+ }
+ if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
+ vma->vm_file)) {
+ r = -EPERM;
+ goto out_unlock;
+ }
+ up_read(&mm->mmap_sem);
+ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
- /*
- * Just wait for range to be valid, safe to ignore return value as we
- * will use the return value of hmm_range_fault() below under the
- * mmap_sem to ascertain the validity of the range.
- */
- hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
+retry:
+ range->notifier_seq = mmu_interval_read_begin(&bo->notifier);
down_read(&mm->mmap_sem);
r = hmm_range_fault(range, 0);
up_read(&mm->mmap_sem);
-
- if (unlikely(r < 0))
+ if (unlikely(r <= 0)) {
+ /*
+ * FIXME: This timeout should encompass the retry from
+ * mmu_interval_read_retry() as well.
+ */
+ if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
+ goto retry;
goto out_free_pfns;
+ }
for (i = 0; i < ttm->num_pages; i++) {
- pages[i] = hmm_device_entry_to_page(range, pfns[i]);
+ /* FIXME: The pages cannot be touched outside the notifier_lock */
+ pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
if (unlikely(!pages[i])) {
pr_err("Page fault failed for pfn[%lu] = 0x%llx\n",
- i, pfns[i]);
+ i, range->pfns[i]);
r = -ENOMEM;
goto out_free_pfns;
@@ -862,15 +881,18 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
}
gtt->range = range;
+ mmput(mm);
return 0;
+out_unlock:
+ up_read(&mm->mmap_sem);
out_free_pfns:
- hmm_range_unregister(range);
- kvfree(pfns);
+ kvfree(range->pfns);
out_free_ranges:
kfree(range);
out:
+ mmput(mm);
return r;
}
@@ -895,15 +917,18 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
"No user pages to check\n");
if (gtt->range) {
- r = hmm_range_valid(gtt->range);
- hmm_range_unregister(gtt->range);
-
+ /*
+ * FIXME: Must always hold notifier_lock for this, and must
+ * not ignore the return code.
+ */
+ r = mmu_interval_read_retry(gtt->range->notifier,
+ gtt->range->notifier_seq);
kvfree(gtt->range->pfns);
kfree(gtt->range);
gtt->range = NULL;
}
- return r;
+ return !r;
}
#endif
@@ -984,10 +1009,18 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
sg_free_table(ttm->sg);
#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
- if (gtt->range &&
- ttm->pages[0] == hmm_device_entry_to_page(gtt->range,
- gtt->range->pfns[0]))
- WARN_ONCE(1, "Missing get_user_page_done\n");
+ if (gtt->range) {
+ unsigned long i;
+
+ for (i = 0; i < ttm->num_pages; i++) {
+ if (ttm->pages[i] !=
+ hmm_device_entry_to_page(gtt->range,
+ gtt->range->pfns[i]))
+ break;
+ }
+
+ WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
+ }
#endif
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 668d4bd0c118..df9bf1fd1bc0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -88,6 +88,7 @@ nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
}
struct nouveau_svmm {
+ struct mmu_notifier notifier;
struct nouveau_vmm *vmm;
struct {
unsigned long start;
@@ -95,9 +96,6 @@ struct nouveau_svmm {
} unmanaged;
struct mutex mutex;
-
- struct mm_struct *mm;
- struct hmm_mirror mirror;
};
#define SVMM_DBG(s,f,a...) \
@@ -251,10 +249,11 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
}
static int
-nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
- const struct mmu_notifier_range *update)
+nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
+ const struct mmu_notifier_range *update)
{
- struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
+ struct nouveau_svmm *svmm =
+ container_of(mn, struct nouveau_svmm, notifier);
unsigned long start = update->start;
unsigned long limit = update->end;
@@ -264,6 +263,9 @@ nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
mutex_lock(&svmm->mutex);
+ if (unlikely(!svmm->vmm))
+ goto out;
+
if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
if (start < svmm->unmanaged.start) {
nouveau_svmm_invalidate(svmm, start,
@@ -273,19 +275,20 @@ nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
}
nouveau_svmm_invalidate(svmm, start, limit);
+
+out:
mutex_unlock(&svmm->mutex);
return 0;
}
-static void
-nouveau_svmm_release(struct hmm_mirror *mirror)
+static void nouveau_svmm_free_notifier(struct mmu_notifier *mn)
{
+ kfree(container_of(mn, struct nouveau_svmm, notifier));
}
-static const struct hmm_mirror_ops
-nouveau_svmm = {
- .sync_cpu_device_pagetables = nouveau_svmm_sync_cpu_device_pagetables,
- .release = nouveau_svmm_release,
+static const struct mmu_notifier_ops nouveau_mn_ops = {
+ .invalidate_range_start = nouveau_svmm_invalidate_range_start,
+ .free_notifier = nouveau_svmm_free_notifier,
};
void
@@ -293,8 +296,10 @@ nouveau_svmm_fini(struct nouveau_svmm **psvmm)
{
struct nouveau_svmm *svmm = *psvmm;
if (svmm) {
- hmm_mirror_unregister(&svmm->mirror);
- kfree(*psvmm);
+ mutex_lock(&svmm->mutex);
+ svmm->vmm = NULL;
+ mutex_unlock(&svmm->mutex);
+ mmu_notifier_put(&svmm->notifier);
*psvmm = NULL;
}
}
@@ -320,7 +325,7 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
mutex_lock(&cli->mutex);
if (cli->svm.cli) {
ret = -EBUSY;
- goto done;
+ goto out_free;
}
/* Allocate a new GPU VMM that can support SVM (managed by the
@@ -335,24 +340,26 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
.fault_replay = true,
}, sizeof(struct gp100_vmm_v0), &cli->svm.vmm);
if (ret)
- goto done;
-
- /* Enable HMM mirroring of CPU address-space to VMM. */
- svmm->mm = get_task_mm(current);
- down_write(&svmm->mm->mmap_sem);
- svmm->mirror.ops = &nouveau_svmm;
- ret = hmm_mirror_register(&svmm->mirror, svmm->mm);
- if (ret == 0) {
- cli->svm.svmm = svmm;
- cli->svm.cli = cli;
- }
- up_write(&svmm->mm->mmap_sem);
- mmput(svmm->mm);
+ goto out_free;
-done:
+ down_write(&current->mm->mmap_sem);
+ svmm->notifier.ops = &nouveau_mn_ops;
+ ret = __mmu_notifier_register(&svmm->notifier, current->mm);
if (ret)
- nouveau_svmm_fini(&svmm);
+ goto out_mm_unlock;
+ /* Note, ownership of svmm transfers to mmu_notifier */
+
+ cli->svm.svmm = svmm;
+ cli->svm.cli = cli;
+ up_write(&current->mm->mmap_sem);
mutex_unlock(&cli->mutex);
+ return 0;
+
+out_mm_unlock:
+ up_write(&current->mm->mmap_sem);
+out_free:
+ mutex_unlock(&cli->mutex);
+ kfree(svmm);
return ret;
}
@@ -475,43 +482,90 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm,
fault->inst, fault->addr, fault->access);
}
-static inline bool
-nouveau_range_done(struct hmm_range *range)
+struct svm_notifier {
+ struct mmu_interval_notifier notifier;
+ struct nouveau_svmm *svmm;
+};
+
+static bool nouveau_svm_range_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
- bool ret = hmm_range_valid(range);
+ struct svm_notifier *sn =
+ container_of(mni, struct svm_notifier, notifier);
- hmm_range_unregister(range);
- return ret;
+ /*
+ * serializes the update to mni->invalidate_seq done by caller and
+ * prevents invalidation of the PTE from progressing while HW is being
+ * programmed. This is very hacky and only works because the normal
+ * notifier that does invalidation is always called after the range
+ * notifier.
+ */
+ if (mmu_notifier_range_blockable(range))
+ mutex_lock(&sn->svmm->mutex);
+ else if (!mutex_trylock(&sn->svmm->mutex))
+ return false;
+ mmu_interval_set_seq(mni, cur_seq);
+ mutex_unlock(&sn->svmm->mutex);
+ return true;
}
-static int
-nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range)
+static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
+ .invalidate = nouveau_svm_range_invalidate,
+};
+
+static int nouveau_range_fault(struct nouveau_svmm *svmm,
+ struct nouveau_drm *drm, void *data, u32 size,
+ u64 *pfns, struct svm_notifier *notifier)
{
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ /* Have HMM fault pages within the fault window to the GPU. */
+ struct hmm_range range = {
+ .notifier = &notifier->notifier,
+ .start = notifier->notifier.interval_tree.start,
+ .end = notifier->notifier.interval_tree.last + 1,
+ .pfns = pfns,
+ .flags = nouveau_svm_pfn_flags,
+ .values = nouveau_svm_pfn_values,
+ .pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
+ };
+ struct mm_struct *mm = notifier->notifier.mm;
long ret;
- range->default_flags = 0;
- range->pfn_flags_mask = -1UL;
+ while (true) {
+ if (time_after(jiffies, timeout))
+ return -EBUSY;
+
+ range.notifier_seq = mmu_interval_read_begin(range.notifier);
+ range.default_flags = 0;
+ range.pfn_flags_mask = -1UL;
+ down_read(&mm->mmap_sem);
+ ret = hmm_range_fault(&range, 0);
+ up_read(&mm->mmap_sem);
+ if (ret <= 0) {
+ if (ret == 0 || ret == -EBUSY)
+ continue;
+ return ret;
+ }
- ret = hmm_range_register(range, &svmm->mirror);
- if (ret) {
- up_read(&svmm->mm->mmap_sem);
- return (int)ret;
+ mutex_lock(&svmm->mutex);
+ if (mmu_interval_read_retry(range.notifier,
+ range.notifier_seq)) {
+ mutex_unlock(&svmm->mutex);
+ continue;
+ }
+ break;
}
- if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
- up_read(&svmm->mm->mmap_sem);
- return -EBUSY;
- }
+ nouveau_dmem_convert_pfn(drm, &range);
- ret = hmm_range_fault(range, 0);
- if (ret <= 0) {
- if (ret == 0)
- ret = -EBUSY;
- up_read(&svmm->mm->mmap_sem);
- hmm_range_unregister(range);
- return ret;
- }
- return 0;
+ svmm->vmm->vmm.object.client->super = true;
+ ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
+ svmm->vmm->vmm.object.client->super = false;
+ mutex_unlock(&svmm->mutex);
+
+ return ret;
}
static int
@@ -531,7 +585,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
} i;
u64 phys[16];
} args;
- struct hmm_range range;
struct vm_area_struct *vma;
u64 inst, start, limit;
int fi, fn, pi, fill;
@@ -587,6 +640,9 @@ nouveau_svm_fault(struct nvif_notify *notify)
args.i.p.version = 0;
for (fi = 0; fn = fi + 1, fi < buffer->fault_nr; fi = fn) {
+ struct svm_notifier notifier;
+ struct mm_struct *mm;
+
/* Cancel any faults from non-SVM channels. */
if (!(svmm = buffer->fault[fi]->svmm)) {
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
@@ -606,24 +662,32 @@ nouveau_svm_fault(struct nvif_notify *notify)
start = max_t(u64, start, svmm->unmanaged.limit);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
+ mm = svmm->notifier.mm;
+ if (!mmget_not_zero(mm)) {
+ nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
+ continue;
+ }
+
/* Intersect fault window with the CPU VMA, cancelling
* the fault if the address is invalid.
*/
- down_read(&svmm->mm->mmap_sem);
- vma = find_vma_intersection(svmm->mm, start, limit);
+ down_read(&mm->mmap_sem);
+ vma = find_vma_intersection(mm, start, limit);
if (!vma) {
SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit);
- up_read(&svmm->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
continue;
}
start = max_t(u64, start, vma->vm_start);
limit = min_t(u64, limit, vma->vm_end);
+ up_read(&mm->mmap_sem);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
if (buffer->fault[fi]->addr != start) {
SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr);
- up_read(&svmm->mm->mmap_sem);
+ mmput(mm);
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
continue;
}
@@ -679,33 +743,19 @@ nouveau_svm_fault(struct nvif_notify *notify)
args.i.p.addr,
args.i.p.addr + args.i.p.size, fn - fi);
- /* Have HMM fault pages within the fault window to the GPU. */
- range.start = args.i.p.addr;
- range.end = args.i.p.addr + args.i.p.size;
- range.pfns = args.phys;
- range.flags = nouveau_svm_pfn_flags;
- range.values = nouveau_svm_pfn_values;
- range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
-again:
- ret = nouveau_range_fault(svmm, &range);
- if (ret == 0) {
- mutex_lock(&svmm->mutex);
- if (!nouveau_range_done(&range)) {
- mutex_unlock(&svmm->mutex);
- goto again;
- }
-
- nouveau_dmem_convert_pfn(svm->drm, &range);
-
- svmm->vmm->vmm.object.client->super = true;
- ret = nvif_object_ioctl(&svmm->vmm->vmm.object,
- &args, sizeof(args.i) +
- pi * sizeof(args.phys[0]),
- NULL);
- svmm->vmm->vmm.object.client->super = false;
- mutex_unlock(&svmm->mutex);
- up_read(&svmm->mm->mmap_sem);
+ notifier.svmm = svmm;
+ ret = mmu_interval_notifier_insert(&notifier.notifier,
+ svmm->notifier.mm,
+ args.i.p.addr, args.i.p.size,
+ &nouveau_svm_mni_ops);
+ if (!ret) {
+ ret = nouveau_range_fault(
+ svmm, svm->drm, &args,
+ sizeof(args.i) + pi * sizeof(args.phys[0]),
+ args.phys, &notifier);
+ mmu_interval_notifier_remove(&notifier.notifier);
}
+ mmput(mm);
/* Cancel any faults in the window whose pages didn't manage
* to keep their valid bit, or stay writeable when required.
@@ -714,10 +764,10 @@ again:
*/
while (fi < fn) {
struct nouveau_svm_fault *fault = buffer->fault[fi++];
- pi = (fault->addr - range.start) >> PAGE_SHIFT;
+ pi = (fault->addr - args.i.p.addr) >> PAGE_SHIFT;
if (ret ||
- !(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_V) ||
- (!(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_W) &&
+ !(args.phys[pi] & NVIF_VMM_PFNMAP_V0_V) ||
+ (!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_W) &&
fault->access != 0 && fault->access != 3)) {
nouveau_svm_fault_cancel_fault(svm, fault);
continue;
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index d59b004f6695..30e32adc1fc6 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -68,6 +68,10 @@
#include <linux/hashtable.h>
#include <linux/dma-fence.h>
+#ifdef CONFIG_MMU_NOTIFIER
+#include <linux/mmu_notifier.h>
+#endif
+
#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>
#include <drm/ttm/ttm_placement.h>
@@ -509,8 +513,9 @@ struct radeon_bo {
struct ttm_bo_kmap_obj dma_buf_vmap;
pid_t pid;
- struct radeon_mn *mn;
- struct list_head mn_list;
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_interval_notifier notifier;
+#endif
};
#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index dbab9a3a969b..f93829f08a4d 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -36,131 +36,51 @@
#include "radeon.h"
-struct radeon_mn {
- struct mmu_notifier mn;
-
- /* objects protected by lock */
- struct mutex lock;
- struct rb_root_cached objects;
-};
-
-struct radeon_mn_node {
- struct interval_tree_node it;
- struct list_head bos;
-};
-
/**
- * radeon_mn_invalidate_range_start - callback to notify about mm change
+ * radeon_mn_invalidate - callback to notify about mm change
*
* @mn: our notifier
- * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: the VMA under invalidation
*
* We block for all BOs between start and end to be idle and
* unmap them by move them into system domain again.
*/
-static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+static bool radeon_mn_invalidate(struct mmu_interval_notifier *mn,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
- struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
+ struct radeon_bo *bo = container_of(mn, struct radeon_bo, notifier);
struct ttm_operation_ctx ctx = { false, false };
- struct interval_tree_node *it;
- unsigned long end;
- int ret = 0;
-
- /* notification is exclusive, but interval is inclusive */
- end = range->end - 1;
-
- /* TODO we should be able to split locking for interval tree and
- * the tear down.
- */
- if (mmu_notifier_range_blockable(range))
- mutex_lock(&rmn->lock);
- else if (!mutex_trylock(&rmn->lock))
- return -EAGAIN;
-
- it = interval_tree_iter_first(&rmn->objects, range->start, end);
- while (it) {
- struct radeon_mn_node *node;
- struct radeon_bo *bo;
- long r;
-
- if (!mmu_notifier_range_blockable(range)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
-
- node = container_of(it, struct radeon_mn_node, it);
- it = interval_tree_iter_next(it, range->start, end);
+ long r;
- list_for_each_entry(bo, &node->bos, mn_list) {
+ if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
+ return true;
- if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
- continue;
+ if (!mmu_notifier_range_blockable(range))
+ return false;
- r = radeon_bo_reserve(bo, true);
- if (r) {
- DRM_ERROR("(%ld) failed to reserve user bo\n", r);
- continue;
- }
-
- r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
- true, false, MAX_SCHEDULE_TIMEOUT);
- if (r <= 0)
- DRM_ERROR("(%ld) failed to wait for user bo\n", r);
-
- radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
- r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
- if (r)
- DRM_ERROR("(%ld) failed to validate user bo\n", r);
-
- radeon_bo_unreserve(bo);
- }
+ r = radeon_bo_reserve(bo, true);
+ if (r) {
+ DRM_ERROR("(%ld) failed to reserve user bo\n", r);
+ return true;
}
-
-out_unlock:
- mutex_unlock(&rmn->lock);
-
- return ret;
-}
-
-static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
-{
- struct mmu_notifier_range range = {
- .mm = mm,
- .start = 0,
- .end = ULONG_MAX,
- .flags = 0,
- .event = MMU_NOTIFY_UNMAP,
- };
-
- radeon_mn_invalidate_range_start(mn, &range);
-}
-
-static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
-{
- struct radeon_mn *rmn;
- rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
- if (!rmn)
- return ERR_PTR(-ENOMEM);
+ r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
+ MAX_SCHEDULE_TIMEOUT);
+ if (r <= 0)
+ DRM_ERROR("(%ld) failed to wait for user bo\n", r);
- mutex_init(&rmn->lock);
- rmn->objects = RB_ROOT_CACHED;
- return &rmn->mn;
-}
+ radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
+ r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ if (r)
+ DRM_ERROR("(%ld) failed to validate user bo\n", r);
-static void radeon_mn_free_notifier(struct mmu_notifier *mn)
-{
- kfree(container_of(mn, struct radeon_mn, mn));
+ radeon_bo_unreserve(bo);
+ return true;
}
-static const struct mmu_notifier_ops radeon_mn_ops = {
- .release = radeon_mn_release,
- .invalidate_range_start = radeon_mn_invalidate_range_start,
- .alloc_notifier = radeon_mn_alloc_notifier,
- .free_notifier = radeon_mn_free_notifier,
+static const struct mmu_interval_notifier_ops radeon_mn_ops = {
+ .invalidate = radeon_mn_invalidate,
};
/**
@@ -174,51 +94,20 @@ static const struct mmu_notifier_ops radeon_mn_ops = {
*/
int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
{
- unsigned long end = addr + radeon_bo_size(bo) - 1;
- struct mmu_notifier *mn;
- struct radeon_mn *rmn;
- struct radeon_mn_node *node = NULL;
- struct list_head bos;
- struct interval_tree_node *it;
-
- mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
- if (IS_ERR(mn))
- return PTR_ERR(mn);
- rmn = container_of(mn, struct radeon_mn, mn);
-
- INIT_LIST_HEAD(&bos);
-
- mutex_lock(&rmn->lock);
-
- while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) {
- kfree(node);
- node = container_of(it, struct radeon_mn_node, it);
- interval_tree_remove(&node->it, &rmn->objects);
- addr = min(it->start, addr);
- end = max(it->last, end);
- list_splice(&node->bos, &bos);
- }
-
- if (!node) {
- node = kmalloc(sizeof(struct radeon_mn_node), GFP_KERNEL);
- if (!node) {
- mutex_unlock(&rmn->lock);
- return -ENOMEM;
- }
- }
-
- bo->mn = rmn;
-
- node->it.start = addr;
- node->it.last = end;
- INIT_LIST_HEAD(&node->bos);
- list_splice(&bos, &node->bos);
- list_add(&bo->mn_list, &node->bos);
-
- interval_tree_insert(&node->it, &rmn->objects);
-
- mutex_unlock(&rmn->lock);
-
+ int ret;
+
+ ret = mmu_interval_notifier_insert(&bo->notifier, current->mm, addr,
+ radeon_bo_size(bo), &radeon_mn_ops);
+ if (ret)
+ return ret;
+
+ /*
+ * FIXME: radeon appears to allow get_user_pages to run during
+ * invalidate_range_start/end, which is not a safe way to read the
+ * PTEs. It should use the mmu_interval_read_begin() scheme around the
+ * get_user_pages to ensure that the PTEs are read properly
+ */
+ mmu_interval_read_begin(&bo->notifier);
return 0;
}
@@ -231,27 +120,8 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
*/
void radeon_mn_unregister(struct radeon_bo *bo)
{
- struct radeon_mn *rmn = bo->mn;
- struct list_head *head;
-
- if (!rmn)
+ if (!bo->notifier.mm)
return;
-
- mutex_lock(&rmn->lock);
- /* save the next list entry for later */
- head = bo->mn_list.next;
-
- list_del(&bo->mn_list);
-
- if (list_empty(head)) {
- struct radeon_mn_node *node;
- node = container_of(head, struct radeon_mn_node, bos);
- interval_tree_remove(&node->it, &rmn->objects);
- kfree(node);
- }
-
- mutex_unlock(&rmn->lock);
-
- mmu_notifier_put(&rmn->mn);
- bo->mn = NULL;
+ mmu_interval_notifier_remove(&bo->notifier);
+ bo->notifier.mm = NULL;
}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9ebb09a75049..84dd74fe13b8 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2634,7 +2634,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_vf_guid);
SET_DEVICE_OP(dev_ops, get_vf_stats);
SET_DEVICE_OP(dev_ops, init_port);
- SET_DEVICE_OP(dev_ops, invalidate_range);
SET_DEVICE_OP(dev_ops, iw_accept);
SET_DEVICE_OP(dev_ops, iw_add_ref);
SET_DEVICE_OP(dev_ops, iw_connect);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index d7d5fadf0899..e42d44e501fd 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -48,197 +48,33 @@
#include "uverbs.h"
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
{
- mutex_lock(&umem_odp->umem_mutex);
- if (umem_odp->notifiers_count++ == 0)
- /*
- * Initialize the completion object for waiting on
- * notifiers. Since notifier_count is zero, no one should be
- * waiting right now.
- */
- reinit_completion(&umem_odp->notifier_completion);
- mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
- mutex_lock(&umem_odp->umem_mutex);
- /*
- * This sequence increase will notify the QP page fault that the page
- * that is going to be mapped in the spte could have been freed.
- */
- ++umem_odp->notifiers_seq;
- if (--umem_odp->notifiers_count == 0)
- complete_all(&umem_odp->notifier_completion);
- mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
-{
- struct ib_ucontext_per_mm *per_mm =
- container_of(mn, struct ib_ucontext_per_mm, mn);
- struct rb_node *node;
-
- down_read(&per_mm->umem_rwsem);
- if (!per_mm->mn.users)
- goto out;
-
- for (node = rb_first_cached(&per_mm->umem_tree); node;
- node = rb_next(node)) {
- struct ib_umem_odp *umem_odp =
- rb_entry(node, struct ib_umem_odp, interval_tree.rb);
-
- /*
- * Increase the number of notifiers running, to prevent any
- * further fault handling on this MR.
- */
- ib_umem_notifier_start_account(umem_odp);
- complete_all(&umem_odp->notifier_completion);
- umem_odp->umem.ibdev->ops.invalidate_range(
- umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- }
-
-out:
- up_read(&per_mm->umem_rwsem);
-}
-
-static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
- u64 start, u64 end, void *cookie)
-{
- ib_umem_notifier_start_account(item);
- item->umem.ibdev->ops.invalidate_range(item, start, end);
- return 0;
-}
-
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
-{
- struct ib_ucontext_per_mm *per_mm =
- container_of(mn, struct ib_ucontext_per_mm, mn);
- int rc;
-
- if (mmu_notifier_range_blockable(range))
- down_read(&per_mm->umem_rwsem);
- else if (!down_read_trylock(&per_mm->umem_rwsem))
- return -EAGAIN;
-
- if (!per_mm->mn.users) {
- up_read(&per_mm->umem_rwsem);
- /*
- * At this point users is permanently zero and visible to this
- * CPU without a lock, that fact is relied on to skip the unlock
- * in range_end.
- */
- return 0;
- }
-
- rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
- range->end,
- invalidate_range_start_trampoline,
- mmu_notifier_range_blockable(range),
- NULL);
- if (rc)
- up_read(&per_mm->umem_rwsem);
- return rc;
-}
-
-static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
- u64 end, void *cookie)
-{
- ib_umem_notifier_end_account(item);
- return 0;
-}
-
-static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
-{
- struct ib_ucontext_per_mm *per_mm =
- container_of(mn, struct ib_ucontext_per_mm, mn);
-
- if (unlikely(!per_mm->mn.users))
- return;
-
- rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
- range->end,
- invalidate_range_end_trampoline, true, NULL);
- up_read(&per_mm->umem_rwsem);
-}
-
-static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
-{
- struct ib_ucontext_per_mm *per_mm;
-
- per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
- if (!per_mm)
- return ERR_PTR(-ENOMEM);
-
- per_mm->umem_tree = RB_ROOT_CACHED;
- init_rwsem(&per_mm->umem_rwsem);
-
- WARN_ON(mm != current->mm);
- rcu_read_lock();
- per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- rcu_read_unlock();
- return &per_mm->mn;
-}
-
-static void ib_umem_free_notifier(struct mmu_notifier *mn)
-{
- struct ib_ucontext_per_mm *per_mm =
- container_of(mn, struct ib_ucontext_per_mm, mn);
-
- WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
-
- put_pid(per_mm->tgid);
- kfree(per_mm);
-}
-
-static const struct mmu_notifier_ops ib_umem_notifiers = {
- .release = ib_umem_notifier_release,
- .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
- .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
- .alloc_notifier = ib_umem_alloc_notifier,
- .free_notifier = ib_umem_free_notifier,
-};
-
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
-{
- struct ib_ucontext_per_mm *per_mm;
- struct mmu_notifier *mn;
int ret;
umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+
if (!umem_odp->is_implicit_odp) {
size_t page_size = 1UL << umem_odp->page_shift;
+ unsigned long start;
+ unsigned long end;
size_t pages;
- umem_odp->interval_tree.start =
- ALIGN_DOWN(umem_odp->umem.address, page_size);
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
if (check_add_overflow(umem_odp->umem.address,
(unsigned long)umem_odp->umem.length,
- &umem_odp->interval_tree.last))
+ &end))
return -EOVERFLOW;
- umem_odp->interval_tree.last =
- ALIGN(umem_odp->interval_tree.last, page_size);
- if (unlikely(umem_odp->interval_tree.last < page_size))
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
return -EOVERFLOW;
- pages = (umem_odp->interval_tree.last -
- umem_odp->interval_tree.start) >>
- umem_odp->page_shift;
+ pages = (end - start) >> umem_odp->page_shift;
if (!pages)
return -EINVAL;
- /*
- * Note that the representation of the intervals in the
- * interval tree considers the ending point as contained in
- * the interval.
- */
- umem_odp->interval_tree.last--;
-
umem_odp->page_list = kvcalloc(
pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
if (!umem_odp->page_list)
@@ -250,26 +86,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
ret = -ENOMEM;
goto out_page_list;
}
- }
- mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
- if (IS_ERR(mn)) {
- ret = PTR_ERR(mn);
- goto out_dma_list;
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm,
+ start, end - start, ops);
+ if (ret)
+ goto out_dma_list;
}
- umem_odp->per_mm = per_mm =
- container_of(mn, struct ib_ucontext_per_mm, mn);
-
- mutex_init(&umem_odp->umem_mutex);
- init_completion(&umem_odp->notifier_completion);
-
- if (!umem_odp->is_implicit_odp) {
- down_write(&per_mm->umem_rwsem);
- interval_tree_insert(&umem_odp->interval_tree,
- &per_mm->umem_tree);
- up_write(&per_mm->umem_rwsem);
- }
- mmgrab(umem_odp->umem.owning_mm);
return 0;
@@ -305,8 +128,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
if (!context)
return ERR_PTR(-EIO);
- if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
- return ERR_PTR(-EINVAL);
umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
if (!umem_odp)
@@ -318,8 +139,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
- ret = ib_init_umem_odp(umem_odp);
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ ret = ib_init_umem_odp(umem_odp, NULL);
if (ret) {
+ put_pid(umem_odp->tgid);
kfree(umem_odp);
return ERR_PTR(ret);
}
@@ -336,8 +159,10 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
* @addr: The starting userspace VA
* @size: The length of the userspace VA
*/
-struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
- unsigned long addr, size_t size)
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
+ size_t size,
+ const struct mmu_interval_notifier_ops *ops)
{
/*
* Caller must ensure that root cannot be freed during the call to
@@ -360,9 +185,12 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
umem->writable = root->umem.writable;
umem->owning_mm = root->umem.owning_mm;
odp_data->page_shift = PAGE_SHIFT;
+ odp_data->notifier.ops = ops;
- ret = ib_init_umem_odp(odp_data);
+ odp_data->tgid = get_pid(root->tgid);
+ ret = ib_init_umem_odp(odp_data, ops);
if (ret) {
+ put_pid(odp_data->tgid);
kfree(odp_data);
return ERR_PTR(ret);
}
@@ -383,7 +211,8 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
* conjunction with MMU notifiers.
*/
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
- size_t size, int access)
+ size_t size, int access,
+ const struct mmu_interval_notifier_ops *ops)
{
struct ib_umem_odp *umem_odp;
struct ib_ucontext *context;
@@ -398,8 +227,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
if (!context)
return ERR_PTR(-EIO);
- if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
- WARN_ON_ONCE(!context->device->ops.invalidate_range))
+ if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
return ERR_PTR(-EINVAL);
umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
@@ -411,6 +239,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);
umem_odp->umem.owning_mm = mm = current->mm;
+ umem_odp->notifier.ops = ops;
umem_odp->page_shift = PAGE_SHIFT;
if (access & IB_ACCESS_HUGETLB) {
@@ -429,11 +258,14 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
up_read(&mm->mmap_sem);
}
- ret = ib_init_umem_odp(umem_odp);
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ ret = ib_init_umem_odp(umem_odp, ops);
if (ret)
- goto err_free;
+ goto err_put_pid;
return umem_odp;
+err_put_pid:
+ put_pid(umem_odp->tgid);
err_free:
kfree(umem_odp);
return ERR_PTR(ret);
@@ -442,8 +274,6 @@ EXPORT_SYMBOL(ib_umem_odp_get);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-
/*
* Ensure that no more pages are mapped in the umem.
*
@@ -455,28 +285,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
kvfree(umem_odp->dma_list);
kvfree(umem_odp->page_list);
+ put_pid(umem_odp->tgid);
}
-
- down_write(&per_mm->umem_rwsem);
- if (!umem_odp->is_implicit_odp) {
- interval_tree_remove(&umem_odp->interval_tree,
- &per_mm->umem_tree);
- complete_all(&umem_odp->notifier_completion);
- }
- /*
- * NOTE! mmu_notifier_unregister() can happen between a start/end
- * callback, resulting in a missing end, and thus an unbalanced
- * lock. This doesn't really matter to us since we are about to kfree
- * the memory that holds the lock, however LOCKDEP doesn't like this.
- * Thus we call the mmu_notifier_put under the rwsem and test the
- * internal users count to reliably see if we are past this point.
- */
- mmu_notifier_put(&per_mm->mn);
- up_write(&per_mm->umem_rwsem);
-
- mmdrop(umem_odp->umem.owning_mm);
kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
@@ -501,7 +314,7 @@ EXPORT_SYMBOL(ib_umem_odp_release);
*/
static int ib_umem_odp_map_dma_single_page(
struct ib_umem_odp *umem_odp,
- int page_index,
+ unsigned int page_index,
struct page *page,
u64 access_mask,
unsigned long current_seq)
@@ -510,12 +323,7 @@ static int ib_umem_odp_map_dma_single_page(
dma_addr_t dma_addr;
int ret = 0;
- /*
- * Note: we avoid writing if seq is different from the initial seq, to
- * handle case of a racing notifier. This check also allows us to bail
- * early if we have a notifier running in parallel with us.
- */
- if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
+ if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
ret = -EAGAIN;
goto out;
}
@@ -618,7 +426,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
* existing beyond the lifetime of the originating process.. Presumably
* mmget_not_zero will fail in this case.
*/
- owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
+ owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
if (!owning_process || !mmget_not_zero(owning_mm)) {
ret = -EINVAL;
goto out_put_task;
@@ -762,32 +570,3 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
-
-/* @last is not a part of the interval. See comment for function
- * node_last.
- */
-int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
- u64 start, u64 last,
- umem_call_back cb,
- bool blockable,
- void *cookie)
-{
- int ret_val = 0;
- struct interval_tree_node *node, *next;
- struct ib_umem_odp *umem;
-
- if (unlikely(start == last))
- return ret_val;
-
- for (node = interval_tree_iter_first(root, start, last - 1);
- node; node = next) {
- /* TODO move the blockable decision up to the callback */
- if (!blockable)
- return -EAGAIN;
- next = interval_tree_iter_next(node, start, last - 1);
- umem = container_of(node, struct ib_umem_odp, interval_tree);
- ret_val = cb(umem, start, last, cookie) || ret_val;
- }
-
- return ret_val;
-}
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index f9a7e9d29c8b..7c5e3fb22413 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1138,7 +1138,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
/* adjust flag if this fd is not able to cache */
- if (!fd->handler)
+ if (!fd->use_mn)
cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
cinfo.num_active = hfi1_count_active_units();
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index fa45350a9a1d..fc10d65fc3e1 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1444,7 +1444,7 @@ struct hfi1_filedata {
/* for cpu affinity; -1 if none */
int rec_cpu_num;
u32 tid_n_pinned;
- struct mmu_rb_handler *handler;
+ bool use_mn;
struct tid_rb_node **entry_to_rb;
spinlock_t tid_lock; /* protect tid_[limit,used] counters */
u32 tid_limit;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 3592a9ec155e..f05742ac0949 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -59,11 +59,11 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
struct tid_user_buf *tbuf,
u32 rcventry, struct tid_group *grp,
u16 pageidx, unsigned int npages);
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
struct tid_rb_node *tnode);
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
struct tid_group *grp,
unsigned int start, u16 count,
@@ -73,10 +73,8 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
-static struct mmu_rb_ops tid_rb_ops = {
- .insert = tid_rb_insert,
- .remove = tid_rb_remove,
- .invalidate = tid_rb_invalidate
+static const struct mmu_interval_notifier_ops tid_mn_ops = {
+ .invalidate = tid_rb_invalidate,
};
/*
@@ -87,7 +85,6 @@ static struct mmu_rb_ops tid_rb_ops = {
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
struct hfi1_ctxtdata *uctxt)
{
- struct hfi1_devdata *dd = uctxt->dd;
int ret = 0;
spin_lock_init(&fd->tid_lock);
@@ -109,20 +106,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
fd->entry_to_rb = NULL;
return -ENOMEM;
}
-
- /*
- * Register MMU notifier callbacks. If the registration
- * fails, continue without TID caching for this context.
- */
- ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
- dd->pport->hfi1_wq,
- &fd->handler);
- if (ret) {
- dd_dev_info(dd,
- "Failed MMU notifier registration %d\n",
- ret);
- ret = 0;
- }
+ fd->use_mn = true;
}
/*
@@ -139,7 +123,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
* init.
*/
spin_lock(&fd->tid_lock);
- if (uctxt->subctxt_cnt && fd->handler) {
+ if (uctxt->subctxt_cnt && fd->use_mn) {
u16 remainder;
fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -158,18 +142,10 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
- /*
- * The notifier would have been removed when the process'es mm
- * was freed.
- */
- if (fd->handler) {
- hfi1_mmu_rb_unregister(fd->handler);
- } else {
- if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
- unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
- if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
- unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
- }
+ if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+ unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
+ if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+ unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
kfree(fd->invalid_tids);
fd->invalid_tids = NULL;
@@ -201,7 +177,7 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd,
if (mapped) {
pci_unmap_single(dd->pcidev, node->dma_addr,
- node->mmu.len, PCI_DMA_FROMDEVICE);
+ node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
pages = &node->pages[idx];
} else {
pages = &tidbuf->pages[idx];
@@ -777,8 +753,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
return -EFAULT;
}
- node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
- node->mmu.len = npages * PAGE_SIZE;
+ node->fdata = fd;
node->phys = page_to_phys(pages[0]);
node->npages = npages;
node->rcventry = rcventry;
@@ -787,23 +762,35 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
node->freed = false;
memcpy(node->pages, pages, sizeof(struct page *) * npages);
- if (!fd->handler)
- ret = tid_rb_insert(fd, &node->mmu);
- else
- ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
-
- if (ret) {
- hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
- node->rcventry, node->mmu.addr, node->phys, ret);
- pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
- PCI_DMA_FROMDEVICE);
- kfree(node);
- return -EFAULT;
+ if (fd->use_mn) {
+ ret = mmu_interval_notifier_insert(
+ &node->notifier, fd->mm,
+ tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
+ &tid_mn_ops);
+ if (ret)
+ goto out_unmap;
+ /*
+ * FIXME: This is in the wrong order, the notifier should be
+ * established before the pages are pinned by pin_rcv_pages.
+ */
+ mmu_interval_read_begin(&node->notifier);
}
+ fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
+
hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
- node->mmu.addr, node->phys, phys);
+ node->notifier.interval_tree.start, node->phys,
+ phys);
return 0;
+
+out_unmap:
+ hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+ node->rcventry, node->notifier.interval_tree.start,
+ node->phys, ret);
+ pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ kfree(node);
+ return -EFAULT;
}
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
@@ -833,10 +820,9 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
if (grp)
*grp = node->grp;
- if (!fd->handler)
- cacheless_tid_rb_remove(fd, node);
- else
- hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(&node->notifier);
+ cacheless_tid_rb_remove(fd, node);
return 0;
}
@@ -847,7 +833,8 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
struct hfi1_devdata *dd = uctxt->dd;
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
- node->npages, node->mmu.addr, node->phys,
+ node->npages,
+ node->notifier.interval_tree.start, node->phys,
node->dma_addr);
/*
@@ -894,30 +881,29 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
if (!node || node->rcventry != rcventry)
continue;
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(
+ &node->notifier);
cacheless_tid_rb_remove(fd, node);
}
}
}
}
-/*
- * Always return 0 from this function. A non-zero return indicates that the
- * remove operation will be called and that memory should be unpinned.
- * However, the driver cannot unpin out from under PSM. Instead, retain the
- * memory (by returning 0) and inform PSM that the memory is going away. PSM
- * will call back later when it has removed the memory from its list.
- */
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
- struct hfi1_filedata *fdata = arg;
- struct hfi1_ctxtdata *uctxt = fdata->uctxt;
struct tid_rb_node *node =
- container_of(mnode, struct tid_rb_node, mmu);
+ container_of(mni, struct tid_rb_node, notifier);
+ struct hfi1_filedata *fdata = node->fdata;
+ struct hfi1_ctxtdata *uctxt = fdata->uctxt;
if (node->freed)
- return 0;
+ return true;
- trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+ trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
+ node->notifier.interval_tree.start,
node->rcventry, node->npages, node->dma_addr);
node->freed = true;
@@ -946,18 +932,7 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
fdata->invalid_tid_idx++;
}
spin_unlock(&fdata->invalid_lock);
- return 0;
-}
-
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
-{
- struct hfi1_filedata *fdata = arg;
- struct tid_rb_node *tnode =
- container_of(node, struct tid_rb_node, mmu);
- u32 base = fdata->uctxt->expected_base;
-
- fdata->entry_to_rb[tnode->rcventry - base] = tnode;
- return 0;
+ return true;
}
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
@@ -968,12 +943,3 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
fdata->entry_to_rb[tnode->rcventry - base] = NULL;
clear_tid_node(fdata, tnode);
}
-
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
-{
- struct hfi1_filedata *fdata = arg;
- struct tid_rb_node *tnode =
- container_of(node, struct tid_rb_node, mmu);
-
- cacheless_tid_rb_remove(fdata, tnode);
-}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 43b105de1d54..6257eee083a1 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -65,7 +65,8 @@ struct tid_user_buf {
};
struct tid_rb_node {
- struct mmu_rb_node mmu;
+ struct mmu_interval_notifier notifier;
+ struct hfi1_filedata *fdata;
unsigned long phys;
struct tid_group *grp;
u32 rcventry;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 03af54b53bf7..5986953ec2fa 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1258,8 +1258,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
- unsigned long end);
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
size_t nentries, struct mlx5_ib_mr *mr, int flags);
@@ -1289,11 +1287,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
{
return -EOPNOTSUPP;
}
-static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
- unsigned long start,
- unsigned long end){};
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
+
/* Needed for rep profile */
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 60d39b9ec41c..ea8bfc3e2d8d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -749,7 +749,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_umem_odp *odp;
- odp = ib_umem_odp_get(udata, start, length, access_flags);
+ odp = ib_umem_odp_get(udata, start, length, access_flags,
+ &mlx5_mn_ops);
if (IS_ERR(odp)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
PTR_ERR(odp));
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 45ee40c2f36e..f924250f80c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -241,18 +241,27 @@ out_unlock:
xa_unlock(&imr->implicit_children);
}
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
- unsigned long end)
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
+ struct ib_umem_odp *umem_odp =
+ container_of(mni, struct ib_umem_odp, notifier);
struct mlx5_ib_mr *mr;
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
sizeof(struct mlx5_mtt)) - 1;
u64 idx = 0, blk_start_idx = 0;
u64 invalidations = 0;
+ unsigned long start;
+ unsigned long end;
int in_block = 0;
u64 addr;
+ if (!mmu_notifier_range_blockable(range))
+ return false;
+
mutex_lock(&umem_odp->umem_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
/*
* If npages is zero then umem_odp->private may not be setup yet. This
* does not complete until after the first page is mapped for DMA.
@@ -261,8 +270,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
goto out;
mr = umem_odp->private;
- start = max_t(u64, ib_umem_start(umem_odp), start);
- end = min_t(u64, ib_umem_end(umem_odp), end);
+ start = max_t(u64, ib_umem_start(umem_odp), range->start);
+ end = min_t(u64, ib_umem_end(umem_odp), range->end);
/*
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -319,8 +328,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
destroy_unused_implicit_child_mr(mr);
out:
mutex_unlock(&umem_odp->umem_mutex);
+ return true;
}
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+ .invalidate = mlx5_ib_invalidate_range,
+};
+
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
struct ib_odp_caps *caps = &dev->odp_caps;
@@ -419,7 +433,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
idx * MLX5_IMR_MTT_SIZE,
- MLX5_IMR_MTT_SIZE);
+ MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
if (IS_ERR(odp))
return ERR_CAST(odp);
@@ -606,8 +620,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
u64 user_va, size_t bcnt, u32 *bytes_mapped,
u32 flags)
{
- int current_seq, page_shift, ret, np;
+ int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+ unsigned long current_seq;
u64 access_mask;
u64 start_idx, page_mask;
@@ -619,12 +634,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
- current_seq = READ_ONCE(odp->notifiers_seq);
- /*
- * Ensure the sequence number is valid for some time before we call
- * gup.
- */
- smp_rmb();
+ current_seq = mmu_interval_read_begin(&odp->notifier);
np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
current_seq);
@@ -632,7 +642,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
return np;
mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
+ if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
@@ -662,19 +672,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
return np << (page_shift - PAGE_SHIFT);
out:
- if (ret == -EAGAIN) {
- unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
- if (!wait_for_completion_timeout(&odp->notifier_completion,
- timeout)) {
- mlx5_ib_warn(
- mr->dev,
- "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
- current_seq, odp->notifiers_seq,
- odp->notifiers_count);
- }
- }
-
return ret;
}
@@ -1622,7 +1619,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
.advise_mr = mlx5_ib_advise_mr,
- .invalidate_range = mlx5_ib_invalidate_range,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
index 2f8b949c3eeb..91e44c04f787 100644
--- a/drivers/xen/gntdev-common.h
+++ b/drivers/xen/gntdev-common.h
@@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv;
struct gntdev_priv {
/* Maps with visible offsets in the file descriptor. */
struct list_head maps;
- /*
- * Maps that are not visible; will be freed on munmap.
- * Only populated if populate_freeable_maps == 1
- */
- struct list_head freeable_maps;
/* lock protects maps and freeable_maps. */
struct mutex lock;
- struct mm_struct *mm;
- struct mmu_notifier mn;
#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
/* Device for which DMA memory is allocated. */
@@ -49,6 +42,7 @@ struct gntdev_unmap_notify {
};
struct gntdev_grant_map {
+ struct mmu_interval_notifier notifier;
struct list_head next;
struct vm_area_struct *vma;
int index;
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 81401f386c9c..a04ddf2a68af 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -63,7 +63,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
static atomic_t pages_mapped = ATOMIC_INIT(0);
static int use_ptemod;
-#define populate_freeable_maps use_ptemod
static int unmap_grant_pages(struct gntdev_grant_map *map,
int offset, int pages);
@@ -249,12 +248,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
evtchn_put(map->notify.event);
}
- if (populate_freeable_maps && priv) {
- mutex_lock(&priv->lock);
- list_del(&map->next);
- mutex_unlock(&priv->lock);
- }
-
if (map->pages && !use_ptemod)
unmap_grant_pages(map, 0, map->count);
gntdev_free_map(map);
@@ -444,16 +437,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
pr_debug("gntdev_vma_close %p\n", vma);
if (use_ptemod) {
- /* It is possible that an mmu notifier could be running
- * concurrently, so take priv->lock to ensure that the vma won't
- * vanishing during the unmap_grant_pages call, since we will
- * spin here until that completes. Such a concurrent call will
- * not do any unmapping, since that has been done prior to
- * closing the vma, but it may still iterate the unmap_ops list.
- */
- mutex_lock(&priv->lock);
+ WARN_ON(map->vma != vma);
+ mmu_interval_notifier_remove(&map->notifier);
map->vma = NULL;
- mutex_unlock(&priv->lock);
}
vma->vm_private_data = NULL;
gntdev_put_map(priv, map);
@@ -475,109 +461,44 @@ static const struct vm_operations_struct gntdev_vmops = {
/* ------------------------------------------------------------------ */
-static bool in_range(struct gntdev_grant_map *map,
- unsigned long start, unsigned long end)
-{
- if (!map->vma)
- return false;
- if (map->vma->vm_start >= end)
- return false;
- if (map->vma->vm_end <= start)
- return false;
-
- return true;
-}
-
-static int unmap_if_in_range(struct gntdev_grant_map *map,
- unsigned long start, unsigned long end,
- bool blockable)
+static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
+ struct gntdev_grant_map *map =
+ container_of(mn, struct gntdev_grant_map, notifier);
unsigned long mstart, mend;
int err;
- if (!in_range(map, start, end))
- return 0;
+ if (!mmu_notifier_range_blockable(range))
+ return false;
- if (!blockable)
- return -EAGAIN;
+ /*
+ * If the VMA is split or otherwise changed the notifier is not
+ * updated, but we don't want to process VA's outside the modified
+ * VMA. FIXME: It would be much more understandable to just prevent
+ * modifying the VMA in the first place.
+ */
+ if (map->vma->vm_start >= range->end ||
+ map->vma->vm_end <= range->start)
+ return true;
- mstart = max(start, map->vma->vm_start);
- mend = min(end, map->vma->vm_end);
+ mstart = max(range->start, map->vma->vm_start);
+ mend = min(range->end, map->vma->vm_end);
pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
map->index, map->count,
map->vma->vm_start, map->vma->vm_end,
- start, end, mstart, mend);
+ range->start, range->end, mstart, mend);
err = unmap_grant_pages(map,
(mstart - map->vma->vm_start) >> PAGE_SHIFT,
(mend - mstart) >> PAGE_SHIFT);
WARN_ON(err);
- return 0;
-}
-
-static int mn_invl_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
-{
- struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
- struct gntdev_grant_map *map;
- int ret = 0;
-
- if (mmu_notifier_range_blockable(range))
- mutex_lock(&priv->lock);
- else if (!mutex_trylock(&priv->lock))
- return -EAGAIN;
-
- list_for_each_entry(map, &priv->maps, next) {
- ret = unmap_if_in_range(map, range->start, range->end,
- mmu_notifier_range_blockable(range));
- if (ret)
- goto out_unlock;
- }
- list_for_each_entry(map, &priv->freeable_maps, next) {
- ret = unmap_if_in_range(map, range->start, range->end,
- mmu_notifier_range_blockable(range));
- if (ret)
- goto out_unlock;
- }
-
-out_unlock:
- mutex_unlock(&priv->lock);
-
- return ret;
-}
-
-static void mn_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
-{
- struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
- struct gntdev_grant_map *map;
- int err;
-
- mutex_lock(&priv->lock);
- list_for_each_entry(map, &priv->maps, next) {
- if (!map->vma)
- continue;
- pr_debug("map %d+%d (%lx %lx)\n",
- map->index, map->count,
- map->vma->vm_start, map->vma->vm_end);
- err = unmap_grant_pages(map, /* offset */ 0, map->count);
- WARN_ON(err);
- }
- list_for_each_entry(map, &priv->freeable_maps, next) {
- if (!map->vma)
- continue;
- pr_debug("map %d+%d (%lx %lx)\n",
- map->index, map->count,
- map->vma->vm_start, map->vma->vm_end);
- err = unmap_grant_pages(map, /* offset */ 0, map->count);
- WARN_ON(err);
- }
- mutex_unlock(&priv->lock);
+ return true;
}
-static const struct mmu_notifier_ops gntdev_mmu_ops = {
- .release = mn_release,
- .invalidate_range_start = mn_invl_range_start,
+static const struct mmu_interval_notifier_ops gntdev_mmu_ops = {
+ .invalidate = gntdev_invalidate,
};
/* ------------------------------------------------------------------ */
@@ -592,7 +513,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
return -ENOMEM;
INIT_LIST_HEAD(&priv->maps);
- INIT_LIST_HEAD(&priv->freeable_maps);
mutex_init(&priv->lock);
#ifdef CONFIG_XEN_GNTDEV_DMABUF
@@ -604,17 +524,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
}
#endif
- if (use_ptemod) {
- priv->mm = get_task_mm(current);
- if (!priv->mm) {
- kfree(priv);
- return -ENOMEM;
- }
- priv->mn.ops = &gntdev_mmu_ops;
- ret = mmu_notifier_register(&priv->mn, priv->mm);
- mmput(priv->mm);
- }
-
if (ret) {
kfree(priv);
return ret;
@@ -644,16 +553,12 @@ static int gntdev_release(struct inode *inode, struct file *flip)
list_del(&map->next);
gntdev_put_map(NULL /* already removed */, map);
}
- WARN_ON(!list_empty(&priv->freeable_maps));
mutex_unlock(&priv->lock);
#ifdef CONFIG_XEN_GNTDEV_DMABUF
gntdev_dmabuf_fini(priv->dmabuf_priv);
#endif
- if (use_ptemod)
- mmu_notifier_unregister(&priv->mn, priv->mm);
-
kfree(priv);
return 0;
}
@@ -714,8 +619,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
if (map) {
list_del(&map->next);
- if (populate_freeable_maps)
- list_add_tail(&map->next, &priv->freeable_maps);
err = 0;
}
mutex_unlock(&priv->lock);
@@ -1087,11 +990,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
goto unlock_out;
if (use_ptemod && map->vma)
goto unlock_out;
- if (use_ptemod && priv->mm != vma->vm_mm) {
- pr_warn("Huh? Other mm?\n");
- goto unlock_out;
- }
-
refcount_inc(&map->users);
vma->vm_ops = &gntdev_vmops;
@@ -1102,10 +1000,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
vma->vm_flags |= VM_DONTCOPY;
vma->vm_private_data = map;
-
- if (use_ptemod)
- map->vma = vma;
-
if (map->flags) {
if ((vma->vm_flags & VM_WRITE) &&
(map->flags & GNTMAP_readonly))
@@ -1116,8 +1010,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
map->flags |= GNTMAP_readonly;
}
+ if (use_ptemod) {
+ map->vma = vma;
+ err = mmu_interval_notifier_insert_locked(
+ &map->notifier, vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
+ if (err)
+ goto out_unlock_put;
+ }
mutex_unlock(&priv->lock);
+ /*
+ * gntdev takes the address of the PTE in find_grant_ptes() and passes
+ * it to the hypervisor in gntdev_map_grant_pages(). The purpose of
+ * the notifier is to prevent the hypervisor pointer to the PTE from
+ * going stale.
+ *
+ * Since this vma's mappings can't be touched without the mmap_sem,
+ * and we are holding it now, there is no need for the notifier_range
+ * locking pattern.
+ */
+ mmu_interval_read_begin(&map->notifier);
+
if (use_ptemod) {
map->pages_vm_start = vma->vm_start;
err = apply_to_page_range(vma->vm_mm, vma->vm_start,
@@ -1166,8 +1080,11 @@ out_unlock_put:
mutex_unlock(&priv->lock);
out_put_map:
if (use_ptemod) {
- map->vma = NULL;
unmap_grant_pages(map, 0, map->count);
+ if (map->vma) {
+ mmu_interval_notifier_remove(&map->notifier);
+ map->vma = NULL;
+ }
}
gntdev_put_map(priv, map);
return err;