diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 10:07:42 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 10:07:42 -0700 |
commit | 84da111de0b4be15bd500deff773f5116f39f7be (patch) | |
tree | 76b5796f8258397bf7a3926b742a89166a8501ef /include | |
parent | 227c3e9eb5cf3552c2cc83225df6d14adb05f8e8 (diff) | |
parent | 62974fc389b364d8af70e044836362222bd3ae53 (diff) |
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This is more cleanup and consolidation of the hmm APIs and the very
strongly related mmu_notifier interfaces. Many places across the tree
using these interfaces are touched in the process. Beyond that a
cleanup to the page walker API and a few memremap related changes
round out the series:
- General improvement of hmm_range_fault() and related APIs, more
documentation, bug fixes from testing, API simplification &
consolidation, and unused API removal
- Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE,
and make them internal kconfig selects
- Hoist a lot of code related to mmu notifier attachment out of
drivers by using a refcount get/put attachment idiom and remove the
convoluted mmu_notifier_unregister_no_release() and related APIs.
- General API improvement for the migrate_vma API and revision of its
only user in nouveau
- Annotate mmu_notifiers with lockdep and sleeping region debugging
Two series unrelated to HMM or mmu_notifiers came along due to
dependencies:
- Allow pagemap's memremap_pages family of APIs to work without
providing a struct device
- Make walk_page_range() and related use a constant structure for
function pointers"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
libnvdimm: Enable unit test infrastructure compile checks
mm, notifier: Catch sleeping/blocking for !blockable
kernel.h: Add non_block_start/end()
drm/radeon: guard against calling an unpaired radeon_mn_unregister()
csky: add missing brackets in a macro for tlb.h
pagewalk: use lockdep_assert_held for locking validation
pagewalk: separate function pointers from iterator data
mm: split out a new pagewalk.h header from mm.h
mm/mmu_notifiers: annotate with might_sleep()
mm/mmu_notifiers: prime lockdep
mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
mm/hmm: hmm_range_fault() infinite loop
mm/hmm: hmm_range_fault() NULL pointer bug
mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
mm/mmu_notifiers: remove unregister_no_release
RDMA/odp: remove ib_ucontext from ib_umem
RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
RDMA/mlx5: Use ib_umem_start instead of umem.address
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/hmm.h | 125 | ||||
-rw-r--r-- | include/linux/ioport.h | 2 | ||||
-rw-r--r-- | include/linux/kernel.h | 23 | ||||
-rw-r--r-- | include/linux/memremap.h | 3 | ||||
-rw-r--r-- | include/linux/migrate.h | 120 | ||||
-rw-r--r-- | include/linux/mm.h | 46 | ||||
-rw-r--r-- | include/linux/mm_types.h | 6 | ||||
-rw-r--r-- | include/linux/mmu_notifier.h | 59 | ||||
-rw-r--r-- | include/linux/pagewalk.h | 66 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/rdma/ib_umem.h | 2 | ||||
-rw-r--r-- | include/rdma/ib_umem_odp.h | 58 | ||||
-rw-r--r-- | include/rdma/ib_verbs.h | 7 |
13 files changed, 223 insertions, 298 deletions
diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7ef56dc18050..3fec513b9c00 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -84,15 +84,12 @@ * @notifiers: count of active mmu notifiers */ struct hmm { - struct mm_struct *mm; - struct kref kref; + struct mmu_notifier mmu_notifier; spinlock_t ranges_lock; struct list_head ranges; struct list_head mirrors; - struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; - struct rcu_head rcu; long notifiers; }; @@ -158,13 +155,11 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { struct hmm *hmm; - struct vm_area_struct *vma; struct list_head list; unsigned long start; unsigned long end; @@ -173,32 +168,11 @@ struct hmm_range { const uint64_t *values; uint64_t default_flags; uint64_t pfn_flags_mask; - uint8_t page_shift; uint8_t pfn_shift; bool valid; }; /* - * hmm_range_page_shift() - return the page shift for the range - * @range: range being queried - * Return: page shift (page size = 1 << page shift) for the range - */ -static inline unsigned hmm_range_page_shift(const struct hmm_range *range) -{ - return range->page_shift; -} - -/* - * hmm_range_page_size() - return the page size for the range - * @range: range being queried - * Return: page size for the range in bytes - */ -static inline unsigned long hmm_range_page_size(const struct hmm_range *range) -{ - return 1UL << hmm_range_page_shift(range); -} - -/* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on * @timeout: time out for wait in ms (ie abort wait after that period of time) @@ -291,40 +265,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, } /* - * Old API: - * hmm_pfn_to_page() - * hmm_pfn_to_pfn() - * hmm_pfn_from_page() - * hmm_pfn_from_pfn() - * - * This are the OLD API please use new API, it is here to avoid cross-tree - * merge painfullness ie we convert things to new API in stages. - */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_page(range, pfn); -} - -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_pfn(range, pfn); -} - -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, - struct page *page) -{ - return hmm_device_entry_from_page(range, page); -} - -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return hmm_device_entry_from_pfn(range, pfn); -} - -/* * Mirroring: how to synchronize device page table with CPU page table. * * A device driver that is participating in HMM mirroring must always @@ -375,29 +315,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, struct hmm_mirror; /* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - -/* * struct hmm_mirror_ops - HMM mirror device operations callback * * @update: callback to update range on a device @@ -417,9 +334,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update information (see struct hmm_update) - * Return: -EAGAIN if update.blockable false and callback need to - * block, 0 otherwise. + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false + * and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -430,8 +347,9 @@ struct hmm_mirror_ops { * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + int (*sync_cpu_device_pagetables)( + struct hmm_mirror *mirror, + const struct mmu_notifier_range *update); }; /* @@ -457,20 +375,24 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ -int hmm_range_register(struct hmm_range *range, - struct hmm_mirror *mirror, - unsigned long start, - unsigned long end, - unsigned page_shift); +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); void hmm_range_unregister(struct hmm_range *range); -long hmm_range_snapshot(struct hmm_range *range); -long hmm_range_fault(struct hmm_range *range, bool block); + +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. + */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + +long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_dma_map(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, - bool block); + unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, - struct vm_area_struct *vma, struct device *device, dma_addr_t *daddrs, bool dirty); @@ -484,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* Below are for HMM internal use only! Not to be used by device driver! */ -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5b6a7121c9f0..7bddddfc76d6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -297,6 +297,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); +struct resource *request_free_mem_region(struct resource *base, + unsigned long size, const char *name); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4fa360a13c1e..d83d403dac2e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -217,7 +217,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); * might_sleep - annotation for functions that can sleep * * this macro will print a stack trace if it is executed in an atomic - * context (spinlock, irq-handler, ...). + * context (spinlock, irq-handler, ...). Additional sections where blocking is + * not allowed can be annotated with non_block_start() and non_block_end() + * pairs. * * This is a useful debugging help to be able to catch problems early and not * be bitten later when the calling function happens to sleep when it is not @@ -233,6 +235,23 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); # define cant_sleep() \ do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) +/** + * non_block_start - annotate the start of section where sleeping is prohibited + * + * This is on behalf of the oom reaper, specifically when it is calling the mmu + * notifiers. The problem is that if the notifier were to block on, for example, + * mutex_lock() and if the process which holds that mutex were to perform a + * sleeping memory allocation, the oom reaper is now blocked on completion of + * that memory allocation. Other blocking calls like wait_event() pose similar + * issues. + */ +# define non_block_start() (current->non_block_count++) +/** + * non_block_end - annotate the end of section where sleeping is prohibited + * + * Closes a section opened by non_block_start(). + */ +# define non_block_end() WARN_ON(current->non_block_count-- == 0) #else static inline void ___might_sleep(const char *file, int line, int preempt_offset) { } @@ -241,6 +260,8 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define sched_annotate_sleep() do { } while (0) +# define non_block_start() do { } while (0) +# define non_block_end() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..fb2a0bd826b9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -109,7 +109,6 @@ struct dev_pagemap { struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; - struct device *dev; enum memory_type type; unsigned int flags; u64 pci_p2pdma_bus_offset; @@ -124,6 +123,8 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +void *memremap_pages(struct dev_pagemap *pgmap, int nid); +void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7f04754c7f2b..72120061b7d4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -166,8 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_DEVICE (1UL << 4) -#define MIGRATE_PFN_ERROR (1UL << 5) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -182,107 +180,27 @@ static inline unsigned long migrate_pfn(unsigned long pfn) return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; } -/* - * struct migrate_vma_ops - migrate operation callback - * - * @alloc_and_copy: alloc destination memory and copy source memory to it - * @finalize_and_map: allow caller to map the successfully migrated pages - * - * - * The alloc_and_copy() callback happens once all source pages have been locked, - * unmapped and checked (checked whether pinned or not). All pages that can be - * migrated will have an entry in the src array set with the pfn value of the - * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other - * flags might be set but should be ignored by the callback). - * - * The alloc_and_copy() callback can then allocate destination memory and copy - * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and - * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the - * callback must update each corresponding entry in the dst array with the pfn - * value of the destination page and with the MIGRATE_PFN_VALID and - * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages - * locked, via lock_page()). - * - * At this point the alloc_and_copy() callback is done and returns. - * - * Note that the callback does not have to migrate all the pages that are - * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration - * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also - * set in the src array entry). If the device driver cannot migrate a device - * page back to system memory, then it must set the corresponding dst array - * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to - * access any of the virtual addresses originally backed by this page. Because - * a SIGBUS is such a severe result for the userspace process, the device - * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an - * unrecoverable state. - * - * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing device driver to allocate device memory for those unback virtual - * address. For this the device driver simply have to allocate device memory - * and properly set the destination entry like for regular migration. Note that - * this can still fails and thus inside the device driver must check if the - * migration was successful for those entry inside the finalize_and_map() - * callback just like for regular migration. - * - * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES - * OR BAD THINGS WILL HAPPEN ! - * - * - * The finalize_and_map() callback happens after struct page migration from - * source to destination (destination struct pages are the struct pages for the - * memory allocated by the alloc_and_copy() callback). Migration can fail, and - * thus the finalize_and_map() allows the driver to inspect which pages were - * successfully migrated, and which were not. Successfully migrated pages will - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. - * - * It is safe to update device page table from within the finalize_and_map() - * callback because both destination and source page are still locked, and the - * mmap_sem is held in read mode (hence no one can unmap the range being - * migrated). - * - * Once callback is done cleaning up things and updating its page table (if it - * chose to do so, this is not an obligation) then it returns. At this point, - * the HMM core will finish up the final steps, and the migration is complete. - * - * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY - * ENTRIES OR BAD THINGS WILL HAPPEN ! - */ -struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, - const unsigned long *src, - unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); +struct migrate_vma { + struct vm_area_struct *vma; + /* + * Both src and dst array must be big enough for + * (end - start) >> PAGE_SHIFT entries. + * + * The src array must not be modified by the caller after + * migrate_vma_setup(), and must not change the dst array after + * migrate_vma_pages() returns. + */ + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; }; -#if defined(CONFIG_MIGRATE_VMA_HELPER) -int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); -#else -static inline int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private) -{ - return -EINVAL; -} -#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ +int migrate_vma_setup(struct migrate_vma *args); +void migrate_vma_pages(struct migrate_vma *migrate); +void migrate_vma_finalize(struct migrate_vma *migrate); #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 0334ca97c584..7cf955feb823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); -/** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." - * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) - */ -struct mm_walk { - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_entry)(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, - unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*test_walk)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - struct mm_struct *mm; - struct vm_area_struct *vma; - void *private; -}; - struct mmu_notifier_range; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a7a1083b6fb..0b739f360cec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -25,7 +25,6 @@ struct address_space; struct mem_cgroup; -struct hmm; /* * Each physical page in the system has a struct page associated with @@ -511,11 +510,6 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; - -#ifdef CONFIG_HMM_MIRROR - /* HMM needs to track a few things per mm */ - struct hmm *hmm; -#endif } __randomize_layout; /* diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..1bd8e6a09a3c 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -42,6 +42,10 @@ enum mmu_notifier_event { #ifdef CONFIG_MMU_NOTIFIER +#ifdef CONFIG_LOCKDEP +extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; +#endif + /* * The mmu notifier_mm structure is allocated and installed in * mm->mmu_notifier_mm inside the mm_take_all_locks() protected @@ -211,6 +215,19 @@ struct mmu_notifier_ops { */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); + + /* + * These callbacks are used with the get/put interface to manage the + * lifetime of the mmu_notifier memory. alloc_notifier() returns a new + * notifier for use with the mm. + * + * free_notifier() is only called after the mmu_notifier has been + * fully put, calls to any ops callback are prevented and no ops + * callbacks are currently running. It is called from a SRCU callback + * and cannot sleep. + */ + struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); + void (*free_notifier)(struct mmu_notifier *mn); }; /* @@ -227,6 +244,9 @@ struct mmu_notifier_ops { struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; + struct mm_struct *mm; + struct rcu_head rcu; + unsigned int users; }; static inline int mm_has_notifiers(struct mm_struct *mm) @@ -234,14 +254,27 @@ static inline int mm_has_notifiers(struct mm_struct *mm) return unlikely(mm->mmu_notifier_mm); } +struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, + struct mm_struct *mm); +static inline struct mmu_notifier * +mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) +{ + struct mmu_notifier *ret; + + down_write(&mm->mmap_sem); + ret = mmu_notifier_get_locked(ops, mm); + up_write(&mm->mmap_sem); + return ret; +} +void mmu_notifier_put(struct mmu_notifier *mn); +void mmu_notifier_synchronize(void); + extern int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm); -extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, - struct mm_struct *mm); extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, @@ -310,25 +343,36 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { + might_sleep(); + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } + lock_map_release(&__mmu_notifier_invalidate_range_start_map); } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { + int ret = 0; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; - return __mmu_notifier_invalidate_range_start(range); + ret = __mmu_notifier_invalidate_range_start(range); } - return 0; + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { + if (mmu_notifier_range_blockable(range)) + might_sleep(); + if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range, false); } @@ -482,9 +526,6 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, set_pte_at(___mm, ___address, __ptep, ___pte); \ }) -extern void mmu_notifier_call_srcu(struct rcu_head *rcu, - void (*func)(struct rcu_head *rcu)); - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { @@ -581,6 +622,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at +static inline void mmu_notifier_synchronize(void) +{ +} + #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h new file mode 100644 index 000000000000..bddd9759bab9 --- /dev/null +++ b/include/linux/pagewalk.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGEWALK_H +#define _LINUX_PAGEWALK_H + +#include <linux/mm.h> + +struct mm_walk; + +/** + * mm_walk_ops - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 means + * "do page table walk over the current vma", returning + * a negative value means "abort current page table walk + * right now" and returning 1 means "skip the current vma" + */ +struct mm_walk_ops { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_entry)(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_hole)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); +}; + +/** + * mm_walk - walk_page_range data + * @ops: operation to call during the walk + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + const struct mm_walk_ops *ops; + struct mm_struct *mm; + struct vm_area_struct *vma; + void *private; +}; + +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private); + +#endif /* _LINUX_PAGEWALK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b75b28287005..70db597d6fd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -958,6 +958,10 @@ struct task_struct { struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + int non_block_count; +#endif + #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 1052d0d62be7..a91b2af64ec4 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,7 +42,7 @@ struct ib_ucontext; struct ib_umem_odp; struct ib_umem { - struct ib_ucontext *context; + struct ib_device *ibdev; struct mm_struct *owning_mm; size_t length; unsigned long address; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 479db5c98ff6..253df1a1fa54 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -37,11 +37,6 @@ #include <rdma/ib_verbs.h> #include <linux/interval_tree.h> -struct umem_odp_node { - u64 __subtree_last; - struct rb_node rb; -}; - struct ib_umem_odp { struct ib_umem umem; struct ib_ucontext_per_mm *per_mm; @@ -72,7 +67,15 @@ struct ib_umem_odp { int npages; /* Tree tracking */ - struct umem_odp_node interval_tree; + struct interval_tree_node interval_tree; + + /* + * An implicit odp umem cannot be DMA mapped, has 0 length, and serves + * only as an anchor for the driver to hold onto the per_mm. FIXME: + * This should be removed and drivers should work with the per_mm + * directly. + */ + bool is_implicit_odp; struct completion notifier_completion; int dying; @@ -88,14 +91,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) /* Returns the first page of an ODP umem. */ static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) { - return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); + return umem_odp->interval_tree.start; } /* Returns the address of the page after the last one of an ODP umem. */ static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) { - return ALIGN(umem_odp->umem.address + umem_odp->umem.length, - 1UL << umem_odp->page_shift); + return umem_odp->interval_tree.last + 1; } static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) @@ -120,25 +122,20 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_ucontext_per_mm { - struct ib_ucontext *context; - struct mm_struct *mm; + struct mmu_notifier mn; struct pid *tgid; - bool active; struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - unsigned int odp_mrs_count; - - struct list_head ucontext_list; - struct rcu_head rcu; }; -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, - unsigned long addr, size_t size); +struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, + size_t size, int access); +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, + int access); +struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, + unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, @@ -163,8 +160,17 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, * Find first region intersecting with address range. * Return NULL if not found */ -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, - u64 addr, u64 length); +static inline struct ib_umem_odp * +rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_first(root, addr, addr + length - 1); + if (!node) + return NULL; + return container_of(node, struct ib_umem_odp, interval_tree); + +} static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) @@ -185,9 +191,11 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, + unsigned long addr, + size_t size, int access) { - return -EINVAL; + return ERR_PTR(-EINVAL); } static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4f225175cb91..f659f4a02aa9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1417,11 +1417,6 @@ struct ib_ucontext { bool cleanup_retryable; - void (*invalidate_range)(struct ib_umem_odp *umem_odp, - unsigned long start, unsigned long end); - struct mutex per_mm_list_lock; - struct list_head per_mm_list; - struct ib_rdmacg_object cg_obj; /* * Implementation details of the RDMA core, don't use in drivers: @@ -2378,6 +2373,8 @@ struct ib_device_ops { u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); + void (*invalidate_range)(struct ib_umem_odp *umem_odp, + unsigned long start, unsigned long end); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, |