summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-21 10:07:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-21 10:07:42 -0700
commit84da111de0b4be15bd500deff773f5116f39f7be (patch)
tree76b5796f8258397bf7a3926b742a89166a8501ef /include
parent227c3e9eb5cf3552c2cc83225df6d14adb05f8e8 (diff)
parent62974fc389b364d8af70e044836362222bd3ae53 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe: "This is more cleanup and consolidation of the hmm APIs and the very strongly related mmu_notifier interfaces. Many places across the tree using these interfaces are touched in the process. Beyond that a cleanup to the page walker API and a few memremap related changes round out the series: - General improvement of hmm_range_fault() and related APIs, more documentation, bug fixes from testing, API simplification & consolidation, and unused API removal - Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE, and make them internal kconfig selects - Hoist a lot of code related to mmu notifier attachment out of drivers by using a refcount get/put attachment idiom and remove the convoluted mmu_notifier_unregister_no_release() and related APIs. - General API improvement for the migrate_vma API and revision of its only user in nouveau - Annotate mmu_notifiers with lockdep and sleeping region debugging Two series unrelated to HMM or mmu_notifiers came along due to dependencies: - Allow pagemap's memremap_pages family of APIs to work without providing a struct device - Make walk_page_range() and related use a constant structure for function pointers" * tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits) libnvdimm: Enable unit test infrastructure compile checks mm, notifier: Catch sleeping/blocking for !blockable kernel.h: Add non_block_start/end() drm/radeon: guard against calling an unpaired radeon_mn_unregister() csky: add missing brackets in a macro for tlb.h pagewalk: use lockdep_assert_held for locking validation pagewalk: separate function pointers from iterator data mm: split out a new pagewalk.h header from mm.h mm/mmu_notifiers: annotate with might_sleep() mm/mmu_notifiers: prime lockdep mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports mm/hmm: hmm_range_fault() infinite loop mm/hmm: hmm_range_fault() NULL pointer bug mm/hmm: fix hmm_range_fault()'s handling of swapped out pages mm/mmu_notifiers: remove unregister_no_release RDMA/odp: remove ib_ucontext from ib_umem RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm' RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr RDMA/mlx5: Use ib_umem_start instead of umem.address ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/hmm.h125
-rw-r--r--include/linux/ioport.h2
-rw-r--r--include/linux/kernel.h23
-rw-r--r--include/linux/memremap.h3
-rw-r--r--include/linux/migrate.h120
-rw-r--r--include/linux/mm.h46
-rw-r--r--include/linux/mm_types.h6
-rw-r--r--include/linux/mmu_notifier.h59
-rw-r--r--include/linux/pagewalk.h66
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/rdma/ib_umem.h2
-rw-r--r--include/rdma/ib_umem_odp.h58
-rw-r--r--include/rdma/ib_verbs.h7
13 files changed, 223 insertions, 298 deletions
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7ef56dc18050..3fec513b9c00 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -84,15 +84,12 @@
* @notifiers: count of active mmu notifiers
*/
struct hmm {
- struct mm_struct *mm;
- struct kref kref;
+ struct mmu_notifier mmu_notifier;
spinlock_t ranges_lock;
struct list_head ranges;
struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
struct rw_semaphore mirrors_sem;
wait_queue_head_t wq;
- struct rcu_head rcu;
long notifiers;
};
@@ -158,13 +155,11 @@ enum hmm_pfn_value_e {
* @values: pfn value for some special case (none, special, error, ...)
* @default_flags: default flags for the range (write, read, ... see hmm doc)
* @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
- * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
struct hmm *hmm;
- struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
unsigned long end;
@@ -173,32 +168,11 @@ struct hmm_range {
const uint64_t *values;
uint64_t default_flags;
uint64_t pfn_flags_mask;
- uint8_t page_shift;
uint8_t pfn_shift;
bool valid;
};
/*
- * hmm_range_page_shift() - return the page shift for the range
- * @range: range being queried
- * Return: page shift (page size = 1 << page shift) for the range
- */
-static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
-{
- return range->page_shift;
-}
-
-/*
- * hmm_range_page_size() - return the page size for the range
- * @range: range being queried
- * Return: page size for the range in bytes
- */
-static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
-{
- return 1UL << hmm_range_page_shift(range);
-}
-
-/*
* hmm_range_wait_until_valid() - wait for range to be valid
* @range: range affected by invalidation to wait on
* @timeout: time out for wait in ms (ie abort wait after that period of time)
@@ -291,40 +265,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
}
/*
- * Old API:
- * hmm_pfn_to_page()
- * hmm_pfn_to_pfn()
- * hmm_pfn_from_page()
- * hmm_pfn_from_pfn()
- *
- * This are the OLD API please use new API, it is here to avoid cross-tree
- * merge painfullness ie we convert things to new API in stages.
- */
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
- uint64_t pfn)
-{
- return hmm_device_entry_to_page(range, pfn);
-}
-
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
- uint64_t pfn)
-{
- return hmm_device_entry_to_pfn(range, pfn);
-}
-
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
- struct page *page)
-{
- return hmm_device_entry_from_page(range, page);
-}
-
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
-{
- return hmm_device_entry_from_pfn(range, pfn);
-}
-
-/*
* Mirroring: how to synchronize device page table with CPU page table.
*
* A device driver that is participating in HMM mirroring must always
@@ -375,29 +315,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
struct hmm_mirror;
/*
- * enum hmm_update_event - type of update
- * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
- */
-enum hmm_update_event {
- HMM_UPDATE_INVALIDATE,
-};
-
-/*
- * struct hmm_update - HMM update information for callback
- *
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
- * @event: event triggering the update (what is happening)
- * @blockable: can the callback block/sleep ?
- */
-struct hmm_update {
- unsigned long start;
- unsigned long end;
- enum hmm_update_event event;
- bool blockable;
-};
-
-/*
* struct hmm_mirror_ops - HMM mirror device operations callback
*
* @update: callback to update range on a device
@@ -417,9 +334,9 @@ struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
- * @update: update information (see struct hmm_update)
- * Return: -EAGAIN if update.blockable false and callback need to
- * block, 0 otherwise.
+ * @update: update information (see struct mmu_notifier_range)
+ * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
+ * and callback needs to block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
@@ -430,8 +347,9 @@ struct hmm_mirror_ops {
* page tables are completely updated (TLBs flushed, etc); this is a
* synchronous call.
*/
- int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- const struct hmm_update *update);
+ int (*sync_cpu_device_pagetables)(
+ struct hmm_mirror *mirror,
+ const struct mmu_notifier_range *update);
};
/*
@@ -457,20 +375,24 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
-int hmm_range_register(struct hmm_range *range,
- struct hmm_mirror *mirror,
- unsigned long start,
- unsigned long end,
- unsigned page_shift);
+int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
void hmm_range_unregister(struct hmm_range *range);
-long hmm_range_snapshot(struct hmm_range *range);
-long hmm_range_fault(struct hmm_range *range, bool block);
+
+/*
+ * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
+ */
+#define HMM_FAULT_ALLOW_RETRY (1 << 0)
+
+/* Don't fault in missing PTEs, just snapshot the current state. */
+#define HMM_FAULT_SNAPSHOT (1 << 1)
+
+long hmm_range_fault(struct hmm_range *range, unsigned int flags);
+
long hmm_range_dma_map(struct hmm_range *range,
struct device *device,
dma_addr_t *daddrs,
- bool block);
+ unsigned int flags);
long hmm_range_dma_unmap(struct hmm_range *range,
- struct vm_area_struct *vma,
struct device *device,
dma_addr_t *daddrs,
bool dirty);
@@ -484,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
*/
#define HMM_RANGE_DEFAULT_TIMEOUT 1000
-/* Below are for HMM internal use only! Not to be used by device driver! */
-static inline void hmm_mm_init(struct mm_struct *mm)
-{
- mm->hmm = NULL;
-}
-#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-static inline void hmm_mm_init(struct mm_struct *mm) {}
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
#endif /* LINUX_HMM_H */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5b6a7121c9f0..7bddddfc76d6 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -297,6 +297,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size);
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name);
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_IOPORT_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4fa360a13c1e..d83d403dac2e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -217,7 +217,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
* might_sleep - annotation for functions that can sleep
*
* this macro will print a stack trace if it is executed in an atomic
- * context (spinlock, irq-handler, ...).
+ * context (spinlock, irq-handler, ...). Additional sections where blocking is
+ * not allowed can be annotated with non_block_start() and non_block_end()
+ * pairs.
*
* This is a useful debugging help to be able to catch problems early and not
* be bitten later when the calling function happens to sleep when it is not
@@ -233,6 +235,23 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
# define cant_sleep() \
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
+/**
+ * non_block_start - annotate the start of section where sleeping is prohibited
+ *
+ * This is on behalf of the oom reaper, specifically when it is calling the mmu
+ * notifiers. The problem is that if the notifier were to block on, for example,
+ * mutex_lock() and if the process which holds that mutex were to perform a
+ * sleeping memory allocation, the oom reaper is now blocked on completion of
+ * that memory allocation. Other blocking calls like wait_event() pose similar
+ * issues.
+ */
+# define non_block_start() (current->non_block_count++)
+/**
+ * non_block_end - annotate the end of section where sleeping is prohibited
+ *
+ * Closes a section opened by non_block_start().
+ */
+# define non_block_end() WARN_ON(current->non_block_count-- == 0)
#else
static inline void ___might_sleep(const char *file, int line,
int preempt_offset) { }
@@ -241,6 +260,8 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
+# define non_block_start() do { } while (0)
+# define non_block_end() do { } while (0)
#endif
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f8a5b2a19945..fb2a0bd826b9 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -109,7 +109,6 @@ struct dev_pagemap {
struct percpu_ref *ref;
struct percpu_ref internal_ref;
struct completion done;
- struct device *dev;
enum memory_type type;
unsigned int flags;
u64 pci_p2pdma_bus_offset;
@@ -124,6 +123,8 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+void *memremap_pages(struct dev_pagemap *pgmap, int nid);
+void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7f04754c7f2b..72120061b7d4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -166,8 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
#define MIGRATE_PFN_MIGRATE (1UL << 1)
#define MIGRATE_PFN_LOCKED (1UL << 2)
#define MIGRATE_PFN_WRITE (1UL << 3)
-#define MIGRATE_PFN_DEVICE (1UL << 4)
-#define MIGRATE_PFN_ERROR (1UL << 5)
#define MIGRATE_PFN_SHIFT 6
static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
@@ -182,107 +180,27 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}
-/*
- * struct migrate_vma_ops - migrate operation callback
- *
- * @alloc_and_copy: alloc destination memory and copy source memory to it
- * @finalize_and_map: allow caller to map the successfully migrated pages
- *
- *
- * The alloc_and_copy() callback happens once all source pages have been locked,
- * unmapped and checked (checked whether pinned or not). All pages that can be
- * migrated will have an entry in the src array set with the pfn value of the
- * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other
- * flags might be set but should be ignored by the callback).
- *
- * The alloc_and_copy() callback can then allocate destination memory and copy
- * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and
- * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the
- * callback must update each corresponding entry in the dst array with the pfn
- * value of the destination page and with the MIGRATE_PFN_VALID and
- * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
- * locked, via lock_page()).
- *
- * At this point the alloc_and_copy() callback is done and returns.
- *
- * Note that the callback does not have to migrate all the pages that are
- * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
- * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
- * set in the src array entry). If the device driver cannot migrate a device
- * page back to system memory, then it must set the corresponding dst array
- * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
- * access any of the virtual addresses originally backed by this page. Because
- * a SIGBUS is such a severe result for the userspace process, the device
- * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
- * unrecoverable state.
- *
- * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we
- * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing device driver to allocate device memory for those unback virtual
- * address. For this the device driver simply have to allocate device memory
- * and properly set the destination entry like for regular migration. Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entry inside the finalize_and_map()
- * callback just like for regular migration.
- *
- * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
- * OR BAD THINGS WILL HAPPEN !
- *
- *
- * The finalize_and_map() callback happens after struct page migration from
- * source to destination (destination struct pages are the struct pages for the
- * memory allocated by the alloc_and_copy() callback). Migration can fail, and
- * thus the finalize_and_map() allows the driver to inspect which pages were
- * successfully migrated, and which were not. Successfully migrated pages will
- * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
- *
- * It is safe to update device page table from within the finalize_and_map()
- * callback because both destination and source page are still locked, and the
- * mmap_sem is held in read mode (hence no one can unmap the range being
- * migrated).
- *
- * Once callback is done cleaning up things and updating its page table (if it
- * chose to do so, this is not an obligation) then it returns. At this point,
- * the HMM core will finish up the final steps, and the migration is complete.
- *
- * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
- * ENTRIES OR BAD THINGS WILL HAPPEN !
- */
-struct migrate_vma_ops {
- void (*alloc_and_copy)(struct vm_area_struct *vma,
- const unsigned long *src,
- unsigned long *dst,
- unsigned long start,
- unsigned long end,
- void *private);
- void (*finalize_and_map)(struct vm_area_struct *vma,
- const unsigned long *src,
- const unsigned long *dst,
- unsigned long start,
- unsigned long end,
- void *private);
+struct migrate_vma {
+ struct vm_area_struct *vma;
+ /*
+ * Both src and dst array must be big enough for
+ * (end - start) >> PAGE_SHIFT entries.
+ *
+ * The src array must not be modified by the caller after
+ * migrate_vma_setup(), and must not change the dst array after
+ * migrate_vma_pages() returns.
+ */
+ unsigned long *dst;
+ unsigned long *src;
+ unsigned long cpages;
+ unsigned long npages;
+ unsigned long start;
+ unsigned long end;
};
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
-int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private);
-#else
-static inline int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private)
-{
- return -EINVAL;
-}
-#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
+int migrate_vma_setup(struct migrate_vma *args);
+void migrate_vma_pages(struct migrate_vma *migrate);
+void migrate_vma_finalize(struct migrate_vma *migrate);
#endif /* CONFIG_MIGRATION */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..7cf955feb823 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long start, unsigned long end);
-/**
- * mm_walk - callbacks for walk_page_range
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
- * this handler should only handle pud_trans_huge() puds.
- * the pmd_entry or pte_entry callbacks will be used for
- * regular PUDs.
- * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
- * this handler is required to be able to handle
- * pmd_trans_huge() pmds. They may simply choose to
- * split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
- * @pte_hole: if set, called for each hole at all levels
- * @hugetlb_entry: if set, called for each hugetlb entry
- * @test_walk: caller specific callback function to determine whether
- * we walk over the current vma or not. Returning 0
- * value means "do page table walk over the current vma,"
- * and a negative one means "abort current page table walk
- * right now." 1 means "skip the current vma."
- * @mm: mm_struct representing the target process of page table walk
- * @vma: vma currently walked (NULL if walking outside vmas)
- * @private: private data for callbacks' usage
- *
- * (see the comment on walk_page_range() for more details)
- */
-struct mm_walk {
- int (*pud_entry)(pud_t *pud, unsigned long addr,
- unsigned long next, struct mm_walk *walk);
- int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
- unsigned long next, struct mm_walk *walk);
- int (*pte_entry)(pte_t *pte, unsigned long addr,
- unsigned long next, struct mm_walk *walk);
- int (*pte_hole)(unsigned long addr, unsigned long next,
- struct mm_walk *walk);
- int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long next,
- struct mm_walk *walk);
- int (*test_walk)(unsigned long addr, unsigned long next,
- struct mm_walk *walk);
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- void *private;
-};
-
struct mmu_notifier_range;
-int walk_page_range(unsigned long addr, unsigned long end,
- struct mm_walk *walk);
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6a7a1083b6fb..0b739f360cec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -25,7 +25,6 @@
struct address_space;
struct mem_cgroup;
-struct hmm;
/*
* Each physical page in the system has a struct page associated with
@@ -511,11 +510,6 @@ struct mm_struct {
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
-
-#ifdef CONFIG_HMM_MIRROR
- /* HMM needs to track a few things per mm */
- struct hmm *hmm;
-#endif
} __randomize_layout;
/*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b6c004bd9f6a..1bd8e6a09a3c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -42,6 +42,10 @@ enum mmu_notifier_event {
#ifdef CONFIG_MMU_NOTIFIER
+#ifdef CONFIG_LOCKDEP
+extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
+#endif
+
/*
* The mmu notifier_mm structure is allocated and installed in
* mm->mmu_notifier_mm inside the mm_take_all_locks() protected
@@ -211,6 +215,19 @@ struct mmu_notifier_ops {
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
+
+ /*
+ * These callbacks are used with the get/put interface to manage the
+ * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
+ * notifier for use with the mm.
+ *
+ * free_notifier() is only called after the mmu_notifier has been
+ * fully put, calls to any ops callback are prevented and no ops
+ * callbacks are currently running. It is called from a SRCU callback
+ * and cannot sleep.
+ */
+ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
+ void (*free_notifier)(struct mmu_notifier *mn);
};
/*
@@ -227,6 +244,9 @@ struct mmu_notifier_ops {
struct mmu_notifier {
struct hlist_node hlist;
const struct mmu_notifier_ops *ops;
+ struct mm_struct *mm;
+ struct rcu_head rcu;
+ unsigned int users;
};
static inline int mm_has_notifiers(struct mm_struct *mm)
@@ -234,14 +254,27 @@ static inline int mm_has_notifiers(struct mm_struct *mm)
return unlikely(mm->mmu_notifier_mm);
}
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
+ struct mm_struct *mm);
+static inline struct mmu_notifier *
+mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
+{
+ struct mmu_notifier *ret;
+
+ down_write(&mm->mmap_sem);
+ ret = mmu_notifier_get_locked(ops, mm);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+void mmu_notifier_put(struct mmu_notifier *mn);
+void mmu_notifier_synchronize(void);
+
extern int mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
struct mm_struct *mm);
-extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
- struct mm_struct *mm);
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
@@ -310,25 +343,36 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
+ might_sleep();
+
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
if (mm_has_notifiers(range->mm)) {
range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
+ int ret = 0;
+
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
if (mm_has_notifiers(range->mm)) {
range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
- return __mmu_notifier_invalidate_range_start(range);
+ ret = __mmu_notifier_invalidate_range_start(range);
}
- return 0;
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ return ret;
}
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
+ if (mmu_notifier_range_blockable(range))
+ might_sleep();
+
if (mm_has_notifiers(range->mm))
__mmu_notifier_invalidate_range_end(range, false);
}
@@ -482,9 +526,6 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
set_pte_at(___mm, ___address, __ptep, ___pte); \
})
-extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
- void (*func)(struct rcu_head *rcu));
-
#else /* CONFIG_MMU_NOTIFIER */
struct mmu_notifier_range {
@@ -581,6 +622,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at
+static inline void mmu_notifier_synchronize(void)
+{
+}
+
#endif /* CONFIG_MMU_NOTIFIER */
#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
new file mode 100644
index 000000000000..bddd9759bab9
--- /dev/null
+++ b/include/linux/pagewalk.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PAGEWALK_H
+#define _LINUX_PAGEWALK_H
+
+#include <linux/mm.h>
+
+struct mm_walk;
+
+/**
+ * mm_walk_ops - callbacks for walk_page_range
+ * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
+ * this handler should only handle pud_trans_huge() puds.
+ * the pmd_entry or pte_entry callbacks will be used for
+ * regular PUDs.
+ * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
+ * this handler is required to be able to handle
+ * pmd_trans_huge() pmds. They may simply choose to
+ * split_huge_page() instead of handling it explicitly.
+ * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
+ * @pte_hole: if set, called for each hole at all levels
+ * @hugetlb_entry: if set, called for each hugetlb entry
+ * @test_walk: caller specific callback function to determine whether
+ * we walk over the current vma or not. Returning 0 means
+ * "do page table walk over the current vma", returning
+ * a negative value means "abort current page table walk
+ * right now" and returning 1 means "skip the current vma"
+ */
+struct mm_walk_ops {
+ int (*pud_entry)(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pte_entry)(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pte_hole)(unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
+ int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
+ int (*test_walk)(unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
+};
+
+/**
+ * mm_walk - walk_page_range data
+ * @ops: operation to call during the walk
+ * @mm: mm_struct representing the target process of page table walk
+ * @vma: vma currently walked (NULL if walking outside vmas)
+ * @private: private data for callbacks' usage
+ *
+ * (see the comment on walk_page_range() for more details)
+ */
+struct mm_walk {
+ const struct mm_walk_ops *ops;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ void *private;
+};
+
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private);
+
+#endif /* _LINUX_PAGEWALK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b75b28287005..70db597d6fd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -958,6 +958,10 @@ struct task_struct {
struct mutex_waiter *blocked_on;
#endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ int non_block_count;
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 1052d0d62be7..a91b2af64ec4 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -42,7 +42,7 @@ struct ib_ucontext;
struct ib_umem_odp;
struct ib_umem {
- struct ib_ucontext *context;
+ struct ib_device *ibdev;
struct mm_struct *owning_mm;
size_t length;
unsigned long address;
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 479db5c98ff6..253df1a1fa54 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -37,11 +37,6 @@
#include <rdma/ib_verbs.h>
#include <linux/interval_tree.h>
-struct umem_odp_node {
- u64 __subtree_last;
- struct rb_node rb;
-};
-
struct ib_umem_odp {
struct ib_umem umem;
struct ib_ucontext_per_mm *per_mm;
@@ -72,7 +67,15 @@ struct ib_umem_odp {
int npages;
/* Tree tracking */
- struct umem_odp_node interval_tree;
+ struct interval_tree_node interval_tree;
+
+ /*
+ * An implicit odp umem cannot be DMA mapped, has 0 length, and serves
+ * only as an anchor for the driver to hold onto the per_mm. FIXME:
+ * This should be removed and drivers should work with the per_mm
+ * directly.
+ */
+ bool is_implicit_odp;
struct completion notifier_completion;
int dying;
@@ -88,14 +91,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
/* Returns the first page of an ODP umem. */
static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
{
- return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift);
+ return umem_odp->interval_tree.start;
}
/* Returns the address of the page after the last one of an ODP umem. */
static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
{
- return ALIGN(umem_odp->umem.address + umem_odp->umem.length,
- 1UL << umem_odp->page_shift);
+ return umem_odp->interval_tree.last + 1;
}
static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
@@ -120,25 +122,20 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_ucontext_per_mm {
- struct ib_ucontext *context;
- struct mm_struct *mm;
+ struct mmu_notifier mn;
struct pid *tgid;
- bool active;
struct rb_root_cached umem_tree;
/* Protects umem_tree */
struct rw_semaphore umem_rwsem;
-
- struct mmu_notifier mn;
- unsigned int odp_mrs_count;
-
- struct list_head ucontext_list;
- struct rcu_head rcu;
};
-int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access);
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem,
- unsigned long addr, size_t size);
+struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
+ size_t size, int access);
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+ int access);
+struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
+ unsigned long addr, size_t size);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
@@ -163,8 +160,17 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
* Find first region intersecting with address range.
* Return NULL if not found
*/
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
- u64 addr, u64 length);
+static inline struct ib_umem_odp *
+rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_first(root, addr, addr + length - 1);
+ if (!node)
+ return NULL;
+ return container_of(node, struct ib_umem_odp, interval_tree);
+
+}
static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
unsigned long mmu_seq)
@@ -185,9 +191,11 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
-static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
+static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
+ unsigned long addr,
+ size_t size, int access)
{
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4f225175cb91..f659f4a02aa9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1417,11 +1417,6 @@ struct ib_ucontext {
bool cleanup_retryable;
- void (*invalidate_range)(struct ib_umem_odp *umem_odp,
- unsigned long start, unsigned long end);
- struct mutex per_mm_list_lock;
- struct list_head per_mm_list;
-
struct ib_rdmacg_object cg_obj;
/*
* Implementation details of the RDMA core, don't use in drivers:
@@ -2378,6 +2373,8 @@ struct ib_device_ops {
u64 iova);
int (*unmap_fmr)(struct list_head *fmr_list);
int (*dealloc_fmr)(struct ib_fmr *fmr);
+ void (*invalidate_range)(struct ib_umem_odp *umem_odp,
+ unsigned long start, unsigned long end);
int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,