diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-26 20:00:28 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-26 20:00:28 -0700 |
commit | 478a1469a7d27fe6b2f85fc801ecdeb8afc836e6 (patch) | |
tree | 9b1eb10e1a0567413443281387b09d02b514b5ec /mm | |
parent | 315227f6da389f3a560f27f7777080857278e1b4 (diff) | |
parent | 4d9a2c8746671efbb0c27d3ae28c7474597a7aad (diff) |
Merge tag 'dax-locking-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull DAX locking updates from Ross Zwisler:
"Filesystem DAX locking for 4.7
- We use a bit in an exceptional radix tree entry as a lock bit and
  use it similarly to how the page lock is used for normal faults. This
  fixes races between hole instantiation and read faults of the same
  index.
- Filesystem DAX PMD faults are disabled, and will be re-enabled when
PMD locking is implemented"
* tag 'dax-locking-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
dax: Remove i_mmap_lock protection
dax: Use radix tree entry lock to protect cow faults
dax: New fault locking
dax: Allow DAX code to replace exceptional entries
dax: Define DAX lock bit for radix tree exceptional entry
dax: Make huge page handling depend on CONFIG_BROKEN
dax: Fix condition for filling of PMD holes
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 30 | ||||
-rw-r--r-- | mm/memory.c | 40 | ||||
-rw-r--r-- | mm/truncate.c | 62 |
3 files changed, 69 insertions, 63 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 9665b1d4f318..00ae878b2a38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping, return; /* - * Track node that only contains shadow entries. + * Track node that only contains shadow entries. DAX mappings contain + * no shadow entries and may contain other exceptional entries so skip + * those. * * Avoid acquiring the list_lru lock if already tracked. The * list_empty() test is safe as node->private_list is * protected by mapping->tree_lock. */ - if (!workingset_node_pages(node) && + if (!dax_mapping(mapping) && !workingset_node_pages(node) && list_empty(&node->private_list)) { node->private_data = mapping; list_lru_add(&workingset_shadow_nodes, &node->private_list); @@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!radix_tree_exceptional_entry(p)) return -EEXIST; - if (WARN_ON(dax_mapping(mapping))) - return -EINVAL; - - if (shadowp) - *shadowp = p; mapping->nrexceptional--; - if (node) - workingset_node_shadows_dec(node); + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); + } } radix_tree_replace_slot(slot, page); mapping->nrpages++; diff --git a/mm/memory.c b/mm/memory.c index a1b93d9e4449..15322b73636b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -63,6 +63,7 @@ #include <linux/dma-debug.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> +#include <linux/dax.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -2492,8 +2493,6 @@ void 
unmap_mapping_range(struct address_space *mapping, if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - - /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); @@ -2825,7 +2824,8 @@ oom: */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page) + struct page *cow_page, struct page **page, + void **entry) { struct vm_fault vmf; int ret; @@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - if (!vmf.page) - goto out; + if (ret & VM_FAULT_DAX_LOCKED) { + *entry = vmf.entry; + return ret; + } if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - out: *page = vmf.page; return ret; } @@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + void *fault_entry; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; @@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); + ret = 
__do_fault(vma, address, pgoff, flags, new_page, &fault_page, + &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - if (fault_page) + if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, + pgoff); } goto uncharge_out; } @@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. 
- */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } return ret; uncharge_out: @@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; diff --git a/mm/truncate.c b/mm/truncate.c index b00272810871..4064f8f53daa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping, if (shmem_mapping(mapping)) return; - spin_lock_irq(&mapping->tree_lock); - if (dax_mapping(mapping)) { - if (radix_tree_delete_item(&mapping->page_tree, index, entry)) - mapping->nrexceptional--; - } else { - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + dax_delete_mapping_entry(mapping, index); + return; } + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. 
These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } |