diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-14 19:42:11 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-14 19:42:11 -0700 |
commit | fec88ab0af9706b2201e5daf377c5031c62d11f7 (patch) | |
tree | 7206e8a3ff2dea87f912f4660d453a8c118248ac /include | |
parent | fa6e951a2a440babd7a7310d0f4713e618061767 (diff) | |
parent | cc5dfd59e375f4d0f2b64643723d16b38b2f2d78 (diff) |
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull HMM updates from Jason Gunthorpe:
"Improvements and bug fixes for the hmm interface in the kernel:
- Improve clarity, locking and APIs related to the 'hmm mirror'
feature merged last cycle. In linux-next we now see AMDGPU and
nouveau to be using this API.
- Remove old or transitional hmm APIs. These are hold overs from the
past with no users, or APIs that existed only to manage cross tree
conflicts. There are still a few more of these cleanups that didn't
make the merge window cut off.
- Improve some core mm APIs:
- export alloc_pages_vma() for driver use
- refactor into devm_request_free_mem_region() to manage
DEVICE_PRIVATE resource reservations
- refactor duplicative driver code into the core dev_pagemap
struct
- Remove hmm wrappers of improved core mm APIs, instead have drivers
use the simplified API directly
- Remove DEVICE_PUBLIC
- Simplify the kconfig flow for the hmm users and core code"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (42 commits)
mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
mm: remove the HMM config option
mm: sort out the DEVICE_PRIVATE Kconfig mess
mm: simplify ZONE_DEVICE page private data
mm: remove hmm_devmem_add
mm: remove hmm_vma_alloc_locked_page
nouveau: use devm_memremap_pages directly
nouveau: use alloc_page_vma directly
PCI/P2PDMA: use the dev_pagemap internal refcount
device-dax: use the dev_pagemap internal refcount
memremap: provide an optional internal refcount in struct dev_pagemap
memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag
memremap: remove the data field in struct dev_pagemap
memremap: add a migrate_to_ram method to struct dev_pagemap_ops
memremap: lift the devmap_enable manipulation into devm_memremap_pages
memremap: pass a struct dev_pagemap to ->kill and ->cleanup
memremap: move dev_pagemap callbacks into a separate structure
memremap: validate the pagemap type passed to devm_memremap_pages
mm: factor out a devm_request_free_mem_region helper
mm: export alloc_pages_vma
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/hmm.h | 302 | ||||
-rw-r--r-- | include/linux/ioport.h | 5 | ||||
-rw-r--r-- | include/linux/memremap.h | 75 | ||||
-rw-r--r-- | include/linux/mm.h | 28 | ||||
-rw-r--r-- | include/linux/mm_types.h | 4 | ||||
-rw-r--r-- | include/linux/swapops.h | 15 |
6 files changed, 92 insertions, 337 deletions
diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 044a36d7c3f8..b8a08b2a10ca 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -21,8 +21,8 @@ * * HMM address space mirroring API: * - * Use HMM address space mirroring if you want to mirror range of the CPU page - * table of a process into a device page table. Here, "mirror" means "keep + * Use HMM address space mirroring if you want to mirror a range of the CPU + * page tables of a process into a device page table. Here, "mirror" means "keep * synchronized". Prerequisites: the device must provide the ability to write- * protect its page tables (at PAGE_SIZE granularity), and must be able to * recover from the resulting potential page faults. @@ -62,7 +62,7 @@ #include <linux/kconfig.h> #include <asm/pgtable.h> -#if IS_ENABLED(CONFIG_HMM) +#ifdef CONFIG_HMM_MIRROR #include <linux/device.h> #include <linux/migrate.h> @@ -82,19 +82,18 @@ * @mirrors_sem: read/write semaphore protecting the mirrors list * @wq: wait queue for user waiting on a range invalidation * @notifiers: count of active mmu notifiers - * @dead: is the mm dead ? */ struct hmm { struct mm_struct *mm; struct kref kref; - struct mutex lock; + spinlock_t ranges_lock; struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; + struct rcu_head rcu; long notifiers; - bool dead; }; /* @@ -105,10 +104,11 @@ struct hmm { * HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) * - * The driver provide a flags array, if driver valid bit for an entry is bit - * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide + * The driver provides a flags array for mapping page protections to device + * PTE bits. If the driver valid bit for an entry is bit 3, + * i.e., (entry & (1 << 3)), then the driver must provide * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. - * Same logic apply to all flags. This is same idea as vm_page_prot in vma + * Same logic apply to all flags. This is the same idea as vm_page_prot in vma * except that this is per device driver rather than per architecture. */ enum hmm_pfn_flag_e { @@ -129,13 +129,13 @@ enum hmm_pfn_flag_e { * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * - * Driver provide entry value for none entry, error entry and special entry, - * driver can alias (ie use same value for error and special for instance). It - * should not alias none and error or special. + * Driver provides values for none entry, error entry, and special entry. + * Driver can alias (i.e., use same value) error and special, but + * it should not alias none with error or special. * * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry, * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ enum hmm_pfn_value_e { @@ -158,6 +158,7 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter + * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -180,7 +181,7 @@ struct hmm_range { /* * hmm_range_page_shift() - return the page shift for the range * @range: range being queried - * Returns: page shift (page size = 1 << page shift) for the range + * Return: page shift (page size = 1 << page shift) for the range */ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) { @@ -190,7 +191,7 @@ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) /* * hmm_range_page_size() - return the page size for the range * @range: range being queried - * Returns: page size for the range in bytes + * Return: page size for the range in bytes */ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) { @@ -201,28 +202,19 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on * @timeout: time out for wait in ms (ie abort wait after that period of time) - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. */ static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) { - /* Check if mm is dead ? */ - if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { - range->valid = false; - return false; - } - if (range->valid) - return true; - wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, - msecs_to_jiffies(timeout)); - /* Return current valid status just in case we get lucky */ - return range->valid; + return wait_event_timeout(range->hmm->wq, range->valid, + msecs_to_jiffies(timeout)) != 0; } /* * hmm_range_valid() - test if a range is valid or not * @range: range - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. */ static inline bool hmm_range_valid(struct hmm_range *range) { @@ -233,7 +225,7 @@ static inline bool hmm_range_valid(struct hmm_range *range) * hmm_device_entry_to_page() - return struct page pointed to by a device entry * @range: range use to decode device entry value * @entry: device entry value to get corresponding struct page from - * Returns: struct page pointer if entry is a valid, NULL otherwise + * Return: struct page pointer if entry is a valid, NULL otherwise * * If the device entry is valid (ie valid flag set) then return the struct page * matching the entry value. Otherwise return NULL. @@ -256,7 +248,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang * hmm_device_entry_to_pfn() - return pfn value store in a device entry * @range: range use to decode device entry value * @entry: device entry to extract pfn from - * Returns: pfn value if device entry is valid, -1UL otherwise + * Return: pfn value if device entry is valid, -1UL otherwise */ static inline unsigned long hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) @@ -276,7 +268,7 @@ hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value * @page: page for which to create the device entry - * Returns: valid device entry for the page + * Return: valid device entry for the page */ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, struct page *page) @@ -289,7 +281,7 @@ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value * @pfn: pfn value for which to create the device entry - * Returns: valid device entry for the pfn + * Return: valid device entry for the pfn */ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, unsigned long pfn) @@ -332,9 +324,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, return hmm_device_entry_from_pfn(range, pfn); } - - -#if IS_ENABLED(CONFIG_HMM_MIRROR) /* * Mirroring: how to synchronize device page table with CPU page table. * @@ -394,7 +383,7 @@ enum hmm_update_event { }; /* - * struct hmm_update - HMM update informations for callback + * struct hmm_update - HMM update information for callback * * @start: virtual start address of the range to update * @end: virtual end address of the range to update @@ -418,17 +407,18 @@ struct hmm_mirror_ops { * * @mirror: pointer to struct hmm_mirror * - * This is called when the mm_struct is being released. - * The callback should make sure no references to the mirror occur - * after the callback returns. + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. */ void (*release)(struct hmm_mirror *mirror); /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update informations (see struct hmm_update) - * Returns: -EAGAIN if update.blockable false and callback need to + * @update: update information (see struct hmm_update) + * Return: -EAGAIN if update.blockable false and callback need to * block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU @@ -465,35 +455,10 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * hmm_mirror_mm_is_alive() - test if mm is still alive - * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Returns: false if the mm is dead, true otherwise - * - * This is an optimization it will not accurately always return -EINVAL if the - * mm is dead ie there can be false negative (process is being kill but HMM is - * not yet inform of that). It is only intented to be use to optimize out case - * where driver is about to do something time consuming and it would be better - * to skip it if the mm is dead. - */ -static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) -{ - struct mm_struct *mm; - - if (!mirror || !mirror->hmm) - return false; - mm = READ_ONCE(mirror->hmm->mm); - if (mirror->hmm->dead || !mm) - return false; - - return true; -} - - -/* * Please see Documentation/vm/hmm.rst for how to use the range API. */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift); @@ -529,7 +494,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) } /* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -542,7 +508,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) @@ -561,7 +527,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) ret = hmm_range_fault(range, block); if (ret <= 0) { if (ret == -EBUSY || !ret) { - /* Same as above drop mmap_sem to match old API. */ + /* Same as above, drop mmap_sem to match old API. */ up_read(&range->vma->vm_mm->mmap_sem); ret = -EBUSY; } else if (ret == -EAGAIN) @@ -573,208 +539,12 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) } /* Below are for HMM internal use only! Not to be used by device driver! */ -void hmm_mm_destroy(struct mm_struct *mm); - static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct hmm_devmem; - -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr); - -/* - * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events - * - * @free: call when refcount on page reach 1 and thus is no longer use - * @fault: call when there is a page fault to unaddressable memory - * - * Both callback happens from page_free() and page_fault() callback of struct - * dev_pagemap respectively. See include/linux/memremap.h for more details on - * those. - * - * The hmm_devmem_ops callback are just here to provide a coherent and - * uniq API to device driver and device driver should not register their - * own page_free() or page_fault() but rely on the hmm_devmem_ops call- - * back. - */ -struct hmm_devmem_ops { - /* - * free() - free a device page - * @devmem: device memory structure (see struct hmm_devmem) - * @page: pointer to struct page being freed - * - * Call back occurs whenever a device page refcount reach 1 which - * means that no one is holding any reference on the page anymore - * (ZONE_DEVICE page have an elevated refcount of 1 as default so - * that they are not release to the general page allocator). - * - * Note that callback has exclusive ownership of the page (as no - * one is holding any reference). - */ - void (*free)(struct hmm_devmem *devmem, struct page *page); - /* - * fault() - CPU page fault or get user page (GUP) - * @devmem: device memory structure (see struct hmm_devmem) - * @vma: virtual memory area containing the virtual address - * @addr: virtual address that faulted or for which there is a GUP - * @page: pointer to struct page backing virtual address (unreliable) - * @flags: FAULT_FLAG_* (see include/linux/mm.h) - * @pmdp: page middle directory - * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR - * on error - * - * The callback occurs whenever there is a CPU page fault or GUP on a - * virtual address. This means that the device driver must migrate the - * page back to regular memory (CPU accessible). - * - * The device driver is free to migrate more than one page from the - * fault() callback as an optimization. However if device decide to - * migrate more than one page it must always priotirize the faulting - * address over the others. - * - * The struct page pointer is only given as an hint to allow quick - * lookup of internal device driver data. A concurrent migration - * might have already free that page and the virtual address might - * not longer be back by it. So it should not be modified by the - * callback. - * - * Note that mmap semaphore is held in read mode at least when this - * callback occurs, hence the vma is valid upon callback entry. - */ - vm_fault_t (*fault)(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); -}; - -/* - * struct hmm_devmem - track device memory - * - * @completion: completion object for device memory - * @pfn_first: first pfn for this resource (set by hmm_devmem_add()) - * @pfn_last: last pfn for this resource (set by hmm_devmem_add()) - * @resource: IO resource reserved for this chunk of memory - * @pagemap: device page map for that chunk - * @device: device to bind resource to - * @ops: memory operations callback - * @ref: per CPU refcount - * @page_fault: callback when CPU fault on an unaddressable device page - * - * This an helper structure for device drivers that do not wish to implement - * the gory details related to hotplugging new memoy and allocating struct - * pages. - * - * Device drivers can directly use ZONE_DEVICE memory on their own if they - * wish to do so. - * - * The page_fault() callback must migrate page back, from device memory to - * system memory, so that the CPU can access it. This might fail for various - * reasons (device issues, device have been unplugged, ...). When such error - * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and - * set the CPU page table entry to "poisoned". - * - * Note that because memory cgroup charges are transferred to the device memory, - * this should never fail due to memory restrictions. However, allocation - * of a regular system page might still fail because we are out of memory. If - * that happens, the page_fault() callback must return VM_FAULT_OOM. - * - * The page_fault() callback can also try to migrate back multiple pages in one - * chunk, as an optimization. It must, however, prioritize the faulting address - * over all the others. - */ -typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); - -struct hmm_devmem { - struct completion completion; - unsigned long pfn_first; - unsigned long pfn_last; - struct resource *resource; - struct device *device; - struct dev_pagemap pagemap; - const struct hmm_devmem_ops *ops; - struct percpu_ref ref; - dev_page_fault_t page_fault; -}; - -/* - * To add (hotplug) device memory, HMM assumes that there is no real resource - * that reserves a range in the physical address space (this is intended to be - * use by unaddressable device memory). It will reserve a physical range big - * enough and allocate struct page for it. - * - * The device driver can wrap the hmm_devmem struct inside a private device - * driver struct. - */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size); -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res); - -/* - * hmm_devmem_page_set_drvdata - set per-page driver data field - * - * @page: pointer to struct page - * @data: driver data value to set - * - * Because page can not be on lru we have an unsigned long that driver can use - * to store a per page field. This just a simple helper to do that. - */ -static inline void hmm_devmem_page_set_drvdata(struct page *page, - unsigned long data) -{ - page->hmm_data = data; -} - -/* - * hmm_devmem_page_get_drvdata - get per page driver data field - * - * @page: pointer to struct page - * Return: driver data value - */ -static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) -{ - return page->hmm_data; -} - - -/* - * struct hmm_device - fake device to hang device memory onto - * - * @device: device struct - * @minor: device minor number - */ -struct hmm_device { - struct device device; - unsigned int minor; -}; - -/* - * A device driver that wants to handle multiple devices memory through a - * single fake device can use hmm_device to do so. This is purely a helper and - * it is not strictly needed, in order to make use of any HMM functionality. - */ -struct hmm_device *hmm_device_new(void *drvdata); -void hmm_device_put(struct hmm_device *hmm_device); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#else /* IS_ENABLED(CONFIG_HMM) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} -static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM) */ - #endif /* LINUX_HMM_H */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5db386cfc2d4..5b6a7121c9f0 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -133,8 +133,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, - IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, - IORES_DESC_RESERVED = 8, + IORES_DESC_RESERVED = 7, }; /* @@ -296,6 +295,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) return (r1->start <= r2->end && r1->end >= r2->start); } +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1732dea030b2..f8a5b2a19945 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -37,13 +37,6 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * - * MEMORY_DEVICE_PUBLIC: - * Device memory that is cache coherent from device and CPU point of view. This - * is use on platform that have an advance system bus (like CAPI or CCIX). A - * driver can hotplug the device memory using ZONE_DEVICE and with that memory - * type. Any page of a process can be migrated to such memory. However no one - * should be allow to pin such memory so that it can always be evicted. - * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page @@ -52,54 +45,84 @@ struct vmem_altmap { * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * + * MEMORY_DEVICE_DEVDAX: + * Host memory that has similar access semantics as System RAM i.e. DMA + * coherent and supports page pinning. In contrast to + * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax + * character device. + * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { + /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, - MEMORY_DEVICE_PUBLIC, MEMORY_DEVICE_FS_DAX, + MEMORY_DEVICE_DEVDAX, MEMORY_DEVICE_PCI_P2PDMA, }; -/* - * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief - * explanation in include/linux/memory_hotplug.h. - * - * The page_free() callback is called once the page refcount reaches 1 - * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. - * This allows the device driver to implement its own memory management.) - */ -typedef void (*dev_page_free_t)(struct page *page, void *data); +struct dev_pagemap_ops { + /* + * Called once the page refcount reaches 1. (ZONE_DEVICE pages never + * reach 0 refcount unless there is a refcount bug. This allows the + * device driver to implement its own memory management.) + */ + void (*page_free)(struct page *page); + + /* + * Transition the refcount in struct dev_pagemap to the dead state. + */ + void (*kill)(struct dev_pagemap *pgmap); + + /* + * Wait for refcount in struct dev_pagemap to be idle and reap it. + */ + void (*cleanup)(struct dev_pagemap *pgmap); + + /* + * Used for private (un-addressable) device memory only. Must migrate + * the page back to a CPU accessible page. + */ + vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); +}; + +#define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping - * @kill: callback to transition @ref to the dead state - * @cleanup: callback to wait for @ref to be idle and reap it + * @internal_ref: internal reference if @ref is not provided by the caller + * @done: completion for @internal_ref * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @flags: PGMAP_* flags to specify defailed behavior + * @ops: method table */ struct dev_pagemap { - dev_page_free_t page_free; struct vmem_altmap altmap; - bool altmap_valid; struct resource res; struct percpu_ref *ref; - void (*kill)(struct percpu_ref *ref); - void (*cleanup)(struct percpu_ref *ref); + struct percpu_ref internal_ref; + struct completion done; struct device *dev; - void *data; enum memory_type type; + unsigned int flags; u64 pci_p2pdma_bus_offset; + const struct dev_pagemap_ops *ops; }; +static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) +{ + if (pgmap->flags & PGMAP_ALTMAP_VALID) + return &pgmap->altmap; + return NULL; +} + #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index f88f0eabcc5e..0389c34ac529 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -937,8 +937,6 @@ static inline bool is_zone_device_page(const struct page *page) #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void dev_pagemap_get_ops(void); -void dev_pagemap_put_ops(void); void __put_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool put_devmap_managed_page(struct page *page) @@ -949,7 +947,6 @@ static inline bool put_devmap_managed_page(struct page *page) return false; switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_PUBLIC: case MEMORY_DEVICE_FS_DAX: __put_devmap_managed_page(page); return true; @@ -965,12 +962,6 @@ static inline bool is_device_private_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PRIVATE; } -static inline bool is_device_public_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PUBLIC; -} - #ifdef CONFIG_PCI_P2PDMA static inline bool is_pci_p2pdma_page(const struct page *page) { @@ -985,14 +976,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page) #endif /* CONFIG_PCI_P2PDMA */ #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline void dev_pagemap_get_ops(void) -{ -} - -static inline void dev_pagemap_put_ops(void) -{ -} - static inline bool put_devmap_managed_page(struct page *page) { return false; @@ -1003,11 +986,6 @@ static inline bool is_device_private_page(const struct page *page) return false; } -static inline bool is_device_public_page(const struct page *page) -{ - return false; -} - static inline bool is_pci_p2pdma_page(const struct page *page) { return false; @@ -1436,10 +1414,8 @@ struct zap_details { pgoff_t last_index; /* Highest page->index to unmap */ }; -struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, bool with_public_device); -#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) - +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1d1093474c1a..3a37a89eb7a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -158,7 +158,7 @@ struct page { struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; - unsigned long hmm_data; + void *zone_device_data; unsigned long _zd_pad_1; /* uses mapping */ }; @@ -503,7 +503,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#if IS_ENABLED(CONFIG_HMM) +#ifdef CONFIG_HMM_MIRROR /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4d961668e5fc..15bdb6fe71e5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return pfn_to_page(swp_offset(entry)); } - -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp); #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_device_private_entry(struct page *page, bool write) { @@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return NULL; } - -static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - return VM_FAULT_SIGBUS; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION |