| author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-06 20:49:49 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-06 20:49:49 -0700 |
| commit | d34fc1adf01ff87026da85fb972dc259dc347540 (patch) | |
| tree | 27356073d423187157b7cdb69da32b53102fb9e7 /fs | |
| parent | 1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33 (diff) | |
| parent | d2cd9ede6e193dd7d88b6d27399e96229a551b19 (diff) | |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- various misc bits
- DAX updates
- OCFS2
- most of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
mm,fork: introduce MADV_WIPEONFORK
x86,mpx: make mpx depend on x86-64 to free up VMA flag
mm: add /proc/pid/smaps_rollup
mm: hugetlb: clear target sub-page last when clearing huge page
mm: oom: let oom_reap_task and exit_mmap run concurrently
swap: choose swap device according to numa node
mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
mm, oom: do not rely on TIF_MEMDIE for memory reserves access
z3fold: use per-cpu unbuddied lists
mm, swap: don't use VMA based swap readahead if HDD is used as swap
mm, swap: add sysfs interface for VMA based swap readahead
mm, swap: VMA based swap readahead
mm, swap: fix swap readahead marking
mm, swap: add swap readahead hit statistics
mm/vmalloc.c: don't reinvent the wheel but use existing llist API
mm/vmstat.c: fix wrong comment
selftests/memfd: add memfd_create hugetlbfs selftest
mm/shmem: add hugetlbfs support to memfd_create()
mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
...
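The first entry in the series above, "mm,fork: introduce MADV_WIPEONFORK", adds a new madvise(2) flag: anonymous memory marked with it is seen as zero-filled by the child after fork(). A minimal userspace sketch of that behaviour, with MADV_WIPEONFORK defined by hand (value 18, as in the new uapi header) in case older toolchain headers lack it:

/*
 * Sketch only: demonstrates the MADV_WIPEONFORK semantics described in
 * the patch title above. The fallback define is for older headers.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18
#endif

int main(void)
{
	size_t len = 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	strcpy(buf, "secret");
	madvise(buf, len, MADV_WIPEONFORK);	/* child will see zeroes */

	if (fork() == 0) {
		/* prints an empty string: the page was wiped on fork */
		printf("child:  \"%s\"\n", buf);
		_exit(0);
	}
	wait(NULL);
	printf("parent: \"%s\"\n", buf);	/* parent still sees "secret" */
	return 0;
}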
Diffstat (limited to 'fs')
35 files changed, 365 insertions, 670 deletions
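Much of the fs/ churn in the diff below is mechanical: open-coded pagevec_lookup() loops are converted to the new pagevec_lookup_range() calling convention, which takes the start index by reference, returns only pages up to the given end, and advances the index past the pages it found, so callers no longer clamp the lookup count or recompute the next index themselves. A rough sketch of the converted loop shape (illustrative only; walk_range is a made-up helper):

/*
 * Rough shape of the converted loops in fs/buffer.c, fs/ext4 and
 * fs/hugetlbfs below. pagevec_lookup_range() advances 'index' itself.
 */
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static void walk_range(struct address_space *mapping, pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			/* ... per-page work; page->index is already <= end ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}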
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) -{ - struct v9fs_inode *v9inode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { .get_attr = v9fs_cache_inode_get_attr, .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, - .now_uncached = v9fs_cache_inode_now_uncached, }; void v9fs_cache_inode_get_cookie(struct inode *inode) diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); struct fscache_netfs afs_cache_netfs = { .name = "afs", @@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { .get_attr = afs_vnode_cache_get_attr, .get_aux = afs_vnode_cache_get_aux, .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, }; /* @@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, _leave(" = SUCCESS"); return FSCACHE_CHECKAUX_OKAY; } - -/* - * indication the cookie is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..50da0e102ca0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct pagevec pvec; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; - int i; + int i, count; struct buffer_head *bh; struct buffer_head *head; end = (block + len - 1) >> 
(PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { - for (i = 0; i < pagevec_count(&pvec); i++) { + while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { + count = pagevec_count(&pvec); + for (i = 0; i < count; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; if (!page_has_buffers(page)) continue; /* @@ -1670,7 +1667,9 @@ unlock_page: } pagevec_release(&pvec); cond_resched(); - index++; + /* End of range already reached? */ + if (index > end || !index) + break; } } EXPORT_SYMBOL(clean_bdev_aliases); @@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, pagevec_init(&pvec, 0); do { - unsigned want, nr_pages, i; + unsigned nr_pages, i; - want = min_t(unsigned, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); if (nr_pages == 0) break; @@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, lastoff < page_offset(page)) goto check_range; - /* Searching done if the page index is out of range. */ - if (page->index >= end) - goto not_found; - lock_page(page); if (likely(page->mapping == inode->i_mapping) && page_has_buffers(page)) { @@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unlock_page(page); lastoff = page_offset(page) + PAGE_SIZE; } - - /* Searching done if fewer pages returned than wanted. */ - if (nr_pages < want) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index < end); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 337f88673ed9..174d6e6569a8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OKAY; } -static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) -{ - struct ceph_inode_info* ci = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dout("ceph inode 0x%p now uncached", ci); - - while (1) { - nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .get_attr = ceph_fscache_inode_get_attr, .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, - .now_uncached = ceph_fscache_inode_now_uncached, }; void ceph_fscache_register_inode_cookie(struct inode *inode) diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct cifsInodeInfo *cifsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, 
nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); - - for (;;) { - nr_pages = pagevec_lookup(&pvec, - cifsi->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { .get_attr = cifs_fscache_inode_get_attr, .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, - .now_uncached = cifs_fscache_inode_now_uncached, }; @@ -42,6 +42,9 @@ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) +/* The 'colour' (ie low bits) within a PMD of a page offset. */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} + static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) - index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + index &= ~PG_PMD_COLOUR; key->mapping = mapping; key->entry_start = index; @@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. 
This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, void *entry, bool wake_all) +{ + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); +} + +/* * Check whether the given slot is locked. The function must be called with * mapping->tree_lock held */ @@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. 
A 2MiB @@ -276,18 +334,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -321,7 +382,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -371,52 +432,12 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); return entry; } -/* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. - */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) -{ - struct exceptional_entry_key key; - wait_queue_head_t *wq; - - wq = dax_entry_waitqueue(mapping, index, entry, &key); - - /* - * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. - * So at this point all tasks that could have seen our entry locked - * must be in the waitqueue and the following check will see them. - */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); -} - static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_mapping_entry(mapping, index, false); } -/* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. 
- */ -static int dax_load_hole(struct address_space *mapping, void **entry, - struct vm_fault *vmf) -{ - struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... */ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } - - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; - } -out: - trace_dax_load_hole(inode, vmf, ret); - return ret; -} - static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) @@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. - */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. 
- */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to @@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault +/* + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -int dax_pfn_mkwrite(struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { + ret = VM_FAULT_OOM; + goto out; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. 
- */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); +out: + trace_dax_load_hole(inode, vmf, ret); + return ret; } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); static bool dax_range_is_aligned(struct block_device *bdev, unsigned int offset, unsigned int length) @@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (map_len > end - pos) map_len = end - pos; + /* + * The userspace address for the memory copy has already been + * validated via access_ok() in either vfs_read() or + * vfs_write(), depending on which operation we are doing. + */ if (iov_iter_rw(iter) == WRITE) map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); @@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, void *ret = NULL, *kaddr; long length = 0; pgoff_t pgoff; - pfn_t pfn; + pfn_t pfn = {}; int id; if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) @@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1321,7 +1267,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. 
*/ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* @@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { * will always fail and fail back to regular faults. */ .page_mkwrite = ext2_dax_fault, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_fault, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 197653ea6041..57dcaea762c3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. 
- */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops @@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, pagevec_init(&pvec, 0); do { - int i, num; + int i; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &index, end); if (nr_pages == 0) break; @@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, goto out; } - if (page->index > end) - goto out; - lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -576,14 +542,10 @@ next: unlock_page(page); } - /* The no. of pages is less than our desired, we are done. */ - if (nr_pages < num) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* There are no pages upto endoff - that would be a hole in there. */ if (whence == SEEK_HOLE && lastoff < endoff) { found = 1; *offset = lastoff; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 714396760616..e963508ea35f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { @@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; 
do { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } while (++next); + } while (next); _leave(""); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..7c02b3f738e1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, const pgoff_t end = lend >> huge_page_shift(h); struct vm_area_struct pseudo_vma; struct pagevec pvec; - pgoff_t next; + pgoff_t next, index; int i, freed = 0; - long lookup_nr = PAGEVEC_SIZE; bool truncate_op = (lend == LLONG_MAX); memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); @@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, next = start; while (next < end) { /* - * Don't grab more pages than the number left in the range. - */ - if (end - next < lookup_nr) - lookup_nr = end - next; - - /* * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; u32 hash; - /* - * The page (index) could be beyond end. This is - * only possible in the punch hole case as end is - * max page offset in the truncate case. - */ - next = page->index; - if (next >= end) - break; - + index = page->index; hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, - mapping, next, 0); + mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); i_mmap_unlock_write(mapping); } @@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - ++next; huge_pagevec_release(&pvec); cond_resched(); } diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 777b055063f6..3025fe8584a0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, } /* - * Indication from FS-Cache that the cookie is no longer cached - * - This function is called when the backing store currently caching a cookie - * is removed - * - The netfs should use this to clean up any markers indicating cached pages - * - This is mandatory for any object that may have data - */ -static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct nfs_inode *nfsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); - - for (;;) { - /* grab a bunch of pages 
to unmark */ - nr_pages = pagevec_lookup(&pvec, - nfsi->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - -/* * Get an extra reference on a read context. * - This function can be absent if the completion function doesn't require a * context. @@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = { .get_attr = nfs_fscache_inode_get_attr, .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, - .now_uncached = nfs_fscache_inode_now_uncached, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, }; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index); if (!n) return; - index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e50a387959bf..40b5cc97f7b0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -221,7 +221,7 @@ out: /* * Set the access or default ACL of an inode. */ -int ocfs2_set_acl(handle_t *handle, +static int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 2783a75b3999..7be0bb756286 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -28,13 +28,6 @@ struct ocfs2_acl_entry { struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); -int ocfs2_set_acl(handle_t *handle, - struct inode *inode, - struct buffer_head *di_bh, - int type, - struct posix_acl *acl, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb15a96df0b6..a177eae3aa1a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, /* * How many free extents have we got before we need more meta data? */ -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et) +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) { int retval; struct ocfs2_extent_list *el = NULL; @@ -1933,14 +1932,12 @@ out: * the new changes. * * left_rec: the record on the left. - * left_child_el: is the child list pointed to by left_rec * right_rec: the record to the right of left_rec * right_child_el: is the child list pointed to by right_rec * * By definition, this only works on interior nodes. 
*/ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, - struct ocfs2_extent_list *left_child_el, struct ocfs2_extent_rec *right_rec, struct ocfs2_extent_list *right_child_el) { @@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, */ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); - ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], &root_el->l_recs[i + 1], right_el); } @@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, el = right_path->p_node[i].el; right_rec = &el->l_recs[0]; - ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, - right_el); + ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); ocfs2_journal_dirty(handle, left_path->p_node[i].bh); ocfs2_journal_dirty(handle, right_path->p_node[i].bh); @@ -2509,7 +2505,7 @@ out_ret_path: static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_tree *et, - int subtree_index, struct ocfs2_path *path) + struct ocfs2_path *path) { int i, idx, ret; struct ocfs2_extent_rec *rec; @@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, et); + free_extents = ocfs2_num_free_extents(et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, *ac = NULL; - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4a5152ec88a3..27b75cf32cfa 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, u64 refcount_loc, bool refcount_tree_locked); -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et); +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); /* * how many new metadata chunks would an allocation need at maximum? 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..56ac07cd35f6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); @@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - - o2hb_fill_node_map(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); - int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3ecb9f337b7d..febe6312ceff 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, &et); + num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 66e59d3163ea..6e41fc8fabbe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -713,13 +713,6 @@ leave: return status; } -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) -{ - return __ocfs2_extend_allocation(inode, logical_start, - clusters_to_add, mark_unwritten); -} - /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d5e5fa7f0743..36304434eacf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; - osb->dirty = 0; /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e52a2852d50d..7eb3b0a6347e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..9a50f222ac97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -320,7 +320,6 @@ struct ocfs2_super u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; - u8 *uuid; char *uuid_str; u32 uuid_hash; u8 *vol_label; @@ -388,9 +387,8 @@ struct ocfs2_super unsigned int osb_resv_level; unsigned int osb_dir_resv_level; - /* Next three fields are for local node slot recovery during + /* Next two fields are for local node slot recovery during * mount. 
 	 */
-	int dirty;
 	struct ocfs2_dinode *local_alloc_copy;
 	struct ocfs2_quota_recovery *quota_rec;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
 					  int *credits)
 {
 	int ret = 0, meta_add = 0;
-	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+	int num_free_extents = ocfs2_num_free_extents(et);
 
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
 
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
-	num_free_extents = ocfs2_num_free_extents(osb, et);
+	num_free_extents = ocfs2_num_free_extents(et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	if (dirty) {
 		/* Recovery will be completed after we've mounted the
 		 * rest of the volume. */
-		osb->dirty = 1;
 		osb->local_alloc_copy = local_alloc;
 		local_alloc = NULL;
 	}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
 	*credits += 1;
 
 	/* count in the xattr tree change. */
-	num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+	num_free_extents = ocfs2_num_free_extents(xt_et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98fd8f6df851..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap", S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap", S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..2cbfcd32e884 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
 /*
  * task_[no]mmu.c
  */
+struct mem_size_stats;
 struct proc_maps_private {
 	struct inode *inode;
 	struct task_struct *task;
 	struct mm_struct *mm;
+	struct mem_size_stats *rollup;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
 extern const struct file_operations proc_pid_numa_maps_operations;
 extern const struct file_operations proc_tid_numa_maps_operations;
 extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_tid_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
 	show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
 	show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
-	show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+	show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
 
 #ifdef CONFIG_HIGHMEM
 	show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "SUnreclaim: ",
 		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
 	seq_printf(m, "KernelStack: %8lu kB\n",
-		   global_page_state(NR_KERNEL_STACK_KB));
+		   global_zone_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables: ",
-		    global_page_state(NR_PAGETABLE));
+		    global_zone_page_state(NR_PAGETABLE));
 #ifdef CONFIG_QUICKLIST
 	show_val_kb(m, "Quicklists: ", quicklist_total_size());
 #endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "NFS_Unstable: ",
 		    global_node_page_state(NR_UNSTABLE_NFS));
 	show_val_kb(m, "Bounce: ",
-		    global_page_state(NR_BOUNCE));
+		    global_zone_page_state(NR_BOUNCE));
 	show_val_kb(m, "WritebackTmp: ",
 		    global_node_page_state(NR_WRITEBACK_TEMP));
 	show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_CMA
 	show_val_kb(m, "CmaTotal: ", totalcma_pages);
 	show_val_kb(m, "CmaFree: ",
-		    global_page_state(NR_FREE_CMA_PAGES));
+		    global_zone_page_state(NR_FREE_CMA_PAGES));
 #endif
 
 	hugetlb_report_meminfo(m);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..a290966f91ec 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
 
 	if (priv->mm)
 		mmdrop(priv->mm);
 
+	kfree(priv->rollup);
 	return seq_release_private(inode, file);
 }
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
 		vma->vm_end >= vma->vm_mm->start_stack;
 }
 
+static void show_vma_header_prefix(struct seq_file *m,
+				   unsigned long start, unsigned long end,
+				   vm_flags_t flags, unsigned long long pgoff,
+				   dev_t dev, unsigned long ino)
+{
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+		   start,
+		   end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? 's' : 'p',
+		   pgoff,
+		   MAJOR(dev), MINOR(dev), ino);
+}
+
 static void
 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 {
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 	start = vma->vm_start;
 	end = vma->vm_end;
-
-	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
-	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
-		   start,
-		   end,
-		   flags & VM_READ ? 'r' : '-',
-		   flags & VM_WRITE ? 'w' : '-',
-		   flags & VM_EXEC ? 'x' : '-',
-		   flags & VM_MAYSHARE ? 's' : 'p',
-		   pgoff,
-		   MAJOR(dev), MINOR(dev), ino);
+	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
 
 	/*
 	 * Print the dentry name for named mappings, and a
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats {
+	bool first;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -443,7 +452,9 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
+	unsigned long first_vma_start;
 	u64 pss;
+	u64 pss_locked;
 	u64 swap_pss;
 	bool check_shmem_swap;
 };
@@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
 		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_WIPEONFORK)]	= "wf",
 		[ilog2(VM_DONTDUMP)]	= "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
 		[ilog2(VM_SOFTDIRTY)]	= "sd",
@@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
 
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct mem_size_stats mss;
+	struct mem_size_stats mss_stack;
+	struct mem_size_stats *mss;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
 #ifdef CONFIG_HUGETLB_PAGE
 		.hugetlb_entry = smaps_hugetlb_range,
 #endif
 		.mm = vma->vm_mm,
-		.private = &mss,
 	};
+	int ret = 0;
+	bool rollup_mode;
+	bool last_vma;
+
+	if (priv->rollup) {
+		rollup_mode = true;
+		mss = priv->rollup;
+		if (mss->first) {
+			mss->first_vma_start = vma->vm_start;
+			mss->first = false;
+		}
+		last_vma = !m_next_vma(priv, vma);
+	} else {
+		rollup_mode = false;
+		memset(&mss_stack, 0, sizeof(mss_stack));
+		mss = &mss_stack;
+	}
 
-	memset(&mss, 0, sizeof mss);
+	smaps_walk.private = mss;
 
 #ifdef CONFIG_SHMEM
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
 					!(vma->vm_flags & VM_WRITE)) {
-			mss.swap = shmem_swapped;
+			mss->swap = shmem_swapped;
 		} else {
-			mss.check_shmem_swap = true;
+			mss->check_shmem_swap = true;
 			smaps_walk.pte_hole = smaps_pte_hole;
 		}
 	}
@@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 	/* mmap_sem is held in m_start */
 	walk_page_vma(vma, &smaps_walk);
+	if (vma->vm_flags & VM_LOCKED)
+		mss->pss_locked += mss->pss;
+
+	if (!rollup_mode) {
+		show_map_vma(m, vma, is_pid);
+	} else if (last_vma) {
+		show_vma_header_prefix(
+			m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
+		seq_pad(m, ' ');
+		seq_puts(m, "[rollup]\n");
+	} else {
+		ret = SEQ_SKIP;
+	}
 
-	show_map_vma(m, vma, is_pid);
-
-	seq_printf(m,
-		   "Size: %8lu kB\n"
-		   "Rss: %8lu kB\n"
-		   "Pss: %8lu kB\n"
-		   "Shared_Clean: %8lu kB\n"
-		   "Shared_Dirty: %8lu kB\n"
-		   "Private_Clean: %8lu kB\n"
-		   "Private_Dirty: %8lu kB\n"
-		   "Referenced: %8lu kB\n"
-		   "Anonymous: %8lu kB\n"
-		   "LazyFree: %8lu kB\n"
-		   "AnonHugePages: %8lu kB\n"
-		   "ShmemPmdMapped: %8lu kB\n"
-		   "Shared_Hugetlb: %8lu kB\n"
-		   "Private_Hugetlb: %7lu kB\n"
-		   "Swap: %8lu kB\n"
-		   "SwapPss: %8lu kB\n"
-		   "KernelPageSize: %8lu kB\n"
-		   "MMUPageSize: %8lu kB\n"
-		   "Locked: %8lu kB\n",
-		   (vma->vm_end - vma->vm_start) >> 10,
-		   mss.resident >> 10,
-		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
-		   mss.shared_clean >> 10,
-		   mss.shared_dirty >> 10,
-		   mss.private_clean >> 10,
-		   mss.private_dirty >> 10,
-		   mss.referenced >> 10,
-		   mss.anonymous >> 10,
-		   mss.lazyfree >> 10,
-		   mss.anonymous_thp >> 10,
-		   mss.shmem_thp >> 10,
-		   mss.shared_hugetlb >> 10,
-		   mss.private_hugetlb >> 10,
-		   mss.swap >> 10,
-		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
-		   vma_kernel_pagesize(vma) >> 10,
-		   vma_mmu_pagesize(vma) >> 10,
-		   (vma->vm_flags & VM_LOCKED) ?
-			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
-
-	arch_show_smap(m, vma);
-	show_smap_vma_flags(m, vma);
+	if (!rollup_mode)
+		seq_printf(m,
+			   "Size: %8lu kB\n"
+			   "KernelPageSize: %8lu kB\n"
+			   "MMUPageSize: %8lu kB\n",
+			   (vma->vm_end - vma->vm_start) >> 10,
+			   vma_kernel_pagesize(vma) >> 10,
+			   vma_mmu_pagesize(vma) >> 10);
+
+
+	if (!rollup_mode || last_vma)
+		seq_printf(m,
+			   "Rss: %8lu kB\n"
+			   "Pss: %8lu kB\n"
+			   "Shared_Clean: %8lu kB\n"
+			   "Shared_Dirty: %8lu kB\n"
+			   "Private_Clean: %8lu kB\n"
+			   "Private_Dirty: %8lu kB\n"
+			   "Referenced: %8lu kB\n"
+			   "Anonymous: %8lu kB\n"
+			   "LazyFree: %8lu kB\n"
+			   "AnonHugePages: %8lu kB\n"
+			   "ShmemPmdMapped: %8lu kB\n"
+			   "Shared_Hugetlb: %8lu kB\n"
+			   "Private_Hugetlb: %7lu kB\n"
+			   "Swap: %8lu kB\n"
+			   "SwapPss: %8lu kB\n"
+			   "Locked: %8lu kB\n",
+			   mss->resident >> 10,
+			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
+			   mss->shared_clean >> 10,
+			   mss->shared_dirty >> 10,
+			   mss->private_clean >> 10,
+			   mss->private_dirty >> 10,
+			   mss->referenced >> 10,
+			   mss->anonymous >> 10,
+			   mss->lazyfree >> 10,
+			   mss->anonymous_thp >> 10,
+			   mss->shmem_thp >> 10,
+			   mss->shared_hugetlb >> 10,
+			   mss->private_hugetlb >> 10,
+			   mss->swap >> 10,
+			   (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
+			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
+
+	if (!rollup_mode) {
+		arch_show_smap(m, vma);
+		show_smap_vma_flags(m, vma);
+	}
 	m_cache_vma(m, vma);
-	return 0;
+	return ret;
 }
 
 static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
 	return do_maps_open(inode, file, &proc_pid_smaps_op);
 }
 
+static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct proc_maps_private *priv;
+	int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
+
+	if (ret < 0)
+		return ret;
+	seq = file->private_data;
+	priv = seq->private;
+	priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
+	if (!priv->rollup) {
+		proc_map_release(inode, file);
+		return -ENOMEM;
+	}
+	priv->rollup->first = true;
+	return 0;
+}
+
 static int tid_smaps_open(struct inode *inode, struct file *file)
 {
 	return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = {
 	.release	= proc_map_release,
 };
 
+const struct file_operations proc_pid_smaps_rollup_operations = {
+	.open		= pid_smaps_rollup_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
 const struct file_operations proc_tid_smaps_operations = {
 	.open		= tid_smaps_open,
 	.read		= seq_read,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	if (!pages)
 		goto out_free;
 
-	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
 	if (nr != lpages)
 		goto out_free_pages; /* leave if some pages were missing */
 
diff --git a/fs/sync.c b/fs/sync.c
index 27d6b8bbcb6a..2e3fd7d94d2d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 		goto out_put;
 
 	mapping = f.file->f_mapping;
-	if (!mapping) {
-		ret = -EINVAL;
-		goto out_put;
-	}
-
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 		ret = file_fdatawait_range(f.file, offset, endbyte);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 886085b47c75..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
 
 static inline struct uffd_msg userfault_msg(unsigned long address,
 					    unsigned int flags,
-					    unsigned long reason)
+					    unsigned long reason,
+					    unsigned int features)
 {
 	struct uffd_msg msg;
 	msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 		 * write protect fault.
 		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (features & UFFD_FEATURE_THREAD_ID)
+		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
 }
 
@@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
 	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
 
+	if (ctx->features & UFFD_FEATURE_SIGBUS)
+		goto out;
+
 	/*
 	 * If it's already released don't get it. This avoids to loop
 	 * in __get_user_pages if userfaultfd_release waits on the
@@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 	uwq.wq.private = current;
-	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+			ctx->features);
 	uwq.ctx = ctx;
 	uwq.waken = false;
 
@@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	struct uffdio_register __user *user_uffdio_register;
 	unsigned long vm_flags, new_flags;
 	bool found;
-	bool non_anon_pages;
+	bool basic_ioctls;
 	unsigned long start, end, vma_end;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	 * Search for not compatible vmas.
 	 */
 	found = false;
-	non_anon_pages = false;
+	basic_ioctls = false;
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
 		cond_resched();
 
@@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		/*
 		 * Note vmas containing huge pages
 		 */
-		if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
-			non_anon_pages = true;
+		if (is_vm_hugetlb_page(cur))
+			basic_ioctls = true;
 
 		found = true;
 	}
@@ -1371,7 +1378,7 @@ out_unlock:
 		 * userland which ioctls methods are guaranteed to
 		 * succeed on this range.
 		 */
-		if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+		if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
 			     UFFD_API_RANGE_IOCTLS,
 			     &user_uffdio_register->ioctls))
 			ret = -EFAULT;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0debbc7e3f03..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else if (IS_DAX(inode))
-		ret = dax_pfn_mkwrite(vmf);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
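The fs/proc pieces above add a new per-process file, /proc/<pid>/smaps_rollup, whose single "[rollup]" record carries the smaps counters summed over every VMA of the process. A minimal userspace sketch for reading it follows; it is not part of the patch and assumes a kernel built with CONFIG_PROC_PAGE_MONITOR that already contains this series.

/* Dump /proc/<pid>/smaps_rollup (or /proc/self/smaps_rollup by default). */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char path[64];
	char line[256];
	FILE *f;

	/* default to the calling process; an explicit pid may be passed */
	snprintf(path, sizeof(path), "/proc/%s/smaps_rollup",
		 argc > 1 ? argv[1] : "self");

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}

	/* one "[rollup]" header line followed by the summed counters */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}

Because show_smap() returns SEQ_SKIP for every VMA except the last one in rollup mode, a single read yields one header plus summed Rss/Pss/Swap/... fields instead of one block per mapping.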
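show_smap_vma_flags() above also starts reporting VM_WIPEONFORK as the two-letter code "wf" in each mapping's VmFlags line. The sketch below is illustrative only: it marks an anonymous mapping with MADV_WIPEONFORK (introduced elsewhere in this same series) and then looks for the new flag in /proc/self/smaps. The fallback #define is an assumption for toolchains whose uapi headers predate the series; verify the value locally.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* assumed value; check your asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 4096 * 16;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char line[256];
	FILE *f;

	if (p == MAP_FAILED || madvise(p, len, MADV_WIPEONFORK)) {
		perror("mmap/madvise");
		return 1;
	}

	/* the affected VMA's "VmFlags:" line should now include "wf" */
	f = fopen("/proc/self/smaps", "r");
	while (f && fgets(line, sizeof(line), f))
		if (strncmp(line, "VmFlags:", 8) == 0 && strstr(line, " wf"))
			fputs(line, stdout);
	if (f)
		fclose(f);
	return 0;
}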
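The fs/userfaultfd.c hunks wire up two new API features: UFFD_FEATURE_SIGBUS, which makes handle_userfault() deliver SIGBUS instead of queueing a fault message, and UFFD_FEATURE_THREAD_ID, which lets userfault_msg() report the faulting thread's pid. A rough sketch of the userspace handshake follows; it assumes <linux/userfaultfd.h> from a kernel carrying this series (older headers do not define these feature bits) and trims most error handling.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_THREAD_ID,
	};
	struct uffdio_register reg;
	size_t len = 4096 * 4;
	void *area;
	int uffd;

	/* open a userfaultfd and negotiate the new feature bits */
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api)) {
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* register the range for missing-page faults */
	memset(&reg, 0, sizeof(reg));
	reg.range.start = (unsigned long)area;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("UFFDIO_REGISTER");
		return 1;
	}

	/* With UFFD_FEATURE_SIGBUS negotiated, this access raises SIGBUS in
	 * the faulting thread instead of blocking on the userfaultfd. */
	*(volatile char *)area = 1;
	return 0;
}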