From e7c58097793ef15d58fadf190ee58738fbf447cd Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Jan 2019 15:23:32 -0800 Subject: hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race" This reverts c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 The reverted commit caused ABBA deadlocks when file migration raced with file eviction for specific hugetlbfs files. This was discovered with a modified version of the LTP move_pages12 test. The purpose of the reverted patch was to close a long existing race between hugetlbfs file truncation and page faults. After more analysis of the patch and impacted code, it was determined that i_mmap_rwsem can not be used for all required synchronization. Therefore, revert this patch while working an another approach to the underlying issue. Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Jan Stancek Cc: Michal Hocko Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: "Aneesh Kumar K . V" Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Davidlohr Bueso Cc: Prakash Sangappa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 61 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a2fcea5f8225..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. + * maps and global counts. Page faults can not race with truncation + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. - * - * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent - * races with page faults. - * + * pages are not modified. Page faults can race with hole punch. + * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; index = page->index; + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, index, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + /* - * A mapped page is impossible as callers should unmap - * all references before calling. And, i_mmap_rwsem - * prevents the creation of additional mappings. + * If page is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. */ - VM_BUG_ON(page_mapped(page)); + if (unlikely(page_mapped(page))) { + BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); + } lock_page(page); /* @@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } unlock_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, static void hugetlbfs_evict_inode(struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct resv_map *resv_map; - /* - * The vfs layer guarantees that there are no other users of this - * inode. Therefore, it would be safe to call remove_inode_hugepages - * without holding i_mmap_rwsem. We acquire and hold here to be - * consistent with other callers. Since there will be no contention - * on the semaphore, overhead is negligible. - */ - i_mmap_lock_write(mapping); remove_inode_hugepages(inode, 0, LLONG_MAX); - i_mmap_unlock_write(mapping); - resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); - remove_inode_hugepages(inode, offset, LLONG_MAX); i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } @@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); - remove_inode_hugepages(inode, hole_start, hole_end); i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); } @@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); -- cgit v1.2.3-58-ga151