diff options
author | Haggai Eran <haggaie@mellanox.com> | 2014-12-11 17:04:18 +0200 |
---|---|---|
committer | Roland Dreier <roland@purestorage.com> | 2014-12-15 18:13:36 -0800 |
commit | 882214e2b12860bff1ccff15a3ec2bbb29d58c02 (patch) | |
tree | a3609ca71cec22f0c80b4f1b3d5bebf8024051bb /include | |
parent | 8ada2c1c0c1d75a60723cd2ca7d49c594a146af6 (diff) |
IB/core: Implement support for MMU notifiers regarding on demand paging regions
* Add an interval tree implementation for ODP umems. Create an
interval tree for each ucontext (including a count of the number of
ODP MRs in this context, semaphore, etc.), and register ODP umems in
the interval tree.
* Add MMU notifiers handling functions, using the interval tree to
notify only the relevant umems and underlying MRs.
* Register to receive MMU notifier events from the MM subsystem upon
ODP MR registration (and unregister accordingly).
* Add a completion object to synchronize the destruction of ODP umems.
* Add mechanism to abort page faults when there's a concurrent invalidation.
The way we synchronize between concurrent invalidations and page
faults is by keeping a counter of currently running invalidations, and
a sequence number that is incremented whenever an invalidation is
caught. The page fault code checks the counter and also verifies that
the sequence number hasn't progressed before it updates the umem's
page tables. This is similar to what the kvm module does.
In order to prevent the case where we register a umem in the middle of
an ongoing notifier, we also keep a per ucontext counter of the total
number of active mmu notifiers. We only enable new umems when all the
running notifiers complete.
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yuval Dagan <yuvalda@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
Diffstat (limited to 'include')
-rw-r--r-- | include/rdma/ib_umem_odp.h | 65 | ||||
-rw-r--r-- | include/rdma/ib_verbs.h | 19 |
2 files changed, 83 insertions, 1 deletions
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index b5a2df1923b7..3da0b167041b 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -34,6 +34,13 @@ #define IB_UMEM_ODP_H #include <rdma/ib_umem.h> +#include <rdma/ib_verbs.h> +#include <linux/interval_tree.h> + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; struct ib_umem_odp { /* @@ -51,10 +58,27 @@ struct ib_umem_odp { dma_addr_t *dma_list; /* * The umem_mutex protects the page_list and dma_list fields of an ODP - * umem, allowing only a single thread to map/unmap pages. + * umem, allowing only a single thread to map/unmap pages. The mutex + * also protects access to the mmu notifier counters. */ struct mutex umem_mutex; void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; }; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING @@ -82,6 +106,45 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bound); +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, + umem_call_back cb, void *cookie); + +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, + u64 start, u64 last); +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, + u64 start, u64 last); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. */ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int ib_umem_odp_get(struct ib_ucontext *context, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3af5dcad1b69..0d74f1de99aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -51,6 +51,7 @@ #include <uapi/linux/if_ether.h> #include <linux/atomic.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> extern struct workqueue_struct *ib_wq; @@ -1139,6 +1140,8 @@ struct ib_fmr_attr { u8 page_shift; }; +struct ib_umem; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; @@ -1153,6 +1156,22 @@ struct ib_ucontext { int closing; struct pid *tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + int odp_mrs_count; +#endif }; struct ib_uobject { |