diff options
Diffstat (limited to 'fs')
473 files changed, 13323 insertions, 9331 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 55e108e5e133..1c8dc696d516 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -49,22 +49,20 @@ int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, void v9fs_cache_inode_get_cookie(struct inode *inode) { - struct v9fs_inode *v9inode; + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; __le32 version; __le64 path; if (!S_ISREG(inode->i_mode)) return; - - v9inode = V9FS_I(inode); - if (WARN_ON(v9inode->fscache)) + if (WARN_ON(v9fs_inode_cookie(v9inode))) return; version = cpu_to_le32(v9inode->qid.version); path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = + v9inode->netfs_ctx.cache = fscache_acquire_cookie(v9fs_session_cache(v9ses), 0, &path, sizeof(path), @@ -72,5 +70,5 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) i_size_read(&v9inode->vfs_inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", - inode, v9inode->fscache); + inode, v9fs_inode_cookie(v9inode)); } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f65c40af4f..e28ddf763b3b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -623,9 +623,7 @@ static void v9fs_sysfs_cleanup(void) static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; -#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->vfs_inode); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index bc8b30205d36..ec0e8df3b2eb 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -9,6 +9,7 @@ #define FS_9P_V9FS_H #include <linux/backing-dev.h> +#include <linux/netfs.h> /** * enum p9_session_flags - option flags for each 9P session @@ -108,14 +109,15 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *fscache; -#endif + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; - struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) @@ -126,7 +128,7 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode) static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return v9inode->fscache; + return netfs_i_cookie(&v9inode->vfs_inode); #else return NULL; #endif @@ -163,6 +165,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern const struct netfs_request_ops v9fs_req_ops; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9a10e68c5f30..501128188343 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -28,12 +28,12 @@ #include "fid.h" /** - * v9fs_req_issue_op - Issue a read from 9P + * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make */ -static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) +static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; + struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; struct iov_iter to; loff_t pos = subreq->start + subreq->transferred; @@ -52,20 +52,21 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) } /** - * v9fs_init_rreq - Initialise a read request + * v9fs_init_request - Initialise a read request * @rreq: The read request * @file: The file being read from */ -static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { struct p9_fid *fid = file->private_data; refcount_inc(&fid->count); rreq->netfs_priv = fid; + return 0; } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq + * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request * @mapping: unused mapping of request to cleanup * @priv: private data to cleanup, a fid, guaranted non-null. */ @@ -77,21 +78,10 @@ static void v9fs_req_cleanup(struct address_space *mapping, void *priv) } /** - * v9fs_is_cache_enabled - Determine if caching is enabled for an inode - * @inode: The inode to check - */ -static bool v9fs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); - - return fscache_cookie_enabled(cookie) && cookie->cache_priv; -} - -/** * v9fs_begin_cache_operation - Begin a cache operation for a read * @rreq: The read request */ -static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) +static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) { #ifdef CONFIG_9P_FSCACHE struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); @@ -102,37 +92,14 @@ static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) #endif } -static const struct netfs_read_request_ops v9fs_req_ops = { - .init_rreq = v9fs_init_rreq, - .is_cache_enabled = v9fs_is_cache_enabled, +const struct netfs_request_ops v9fs_req_ops = { + .init_request = v9fs_init_request, .begin_cache_operation = v9fs_begin_cache_operation, - .issue_op = v9fs_req_issue_op, + .issue_read = v9fs_issue_read, .cleanup = v9fs_req_cleanup, }; /** - * v9fs_vfs_readpage - read an entire page in from 9P - * @file: file being read - * @page: structure to page - * - */ -static int v9fs_vfs_readpage(struct file *file, struct page *page) -{ - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &v9fs_req_ops, NULL); -} - -/** - * v9fs_vfs_readahead - read a set of pages from 9P - * @ractl: The readahead parameters - */ -static void v9fs_vfs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &v9fs_req_ops, NULL); -} - -/** * v9fs_release_page - release the private state associated with a page * @page: The page to be released * @gfp: The caller's allocation restrictions @@ -158,18 +125,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) return 1; } -/** - * v9fs_invalidate_page - Invalidate a page completely or partially - * @page: The page to be invalidated - * @offset: offset of the invalidated region - * @length: length of the invalidated region - */ - -static void v9fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +static void v9fs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(page); - folio_wait_fscache(folio); } @@ -249,16 +207,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) return retval; } -/** - * v9fs_launder_page - Writeback a dirty page - * @page: The page to be cleaned up - * - * Returns 0 on success. - */ - -static int v9fs_launder_page(struct page *page) +static int v9fs_launder_folio(struct folio *folio) { - struct folio *folio = page_folio(page); int retval; if (folio_clear_dirty_for_io(folio)) { @@ -325,8 +275,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata, - &v9fs_req_ops, NULL); + retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata); if (retval < 0) return retval; @@ -376,25 +325,25 @@ out: * Mark a page as having been made dirty and thus needing writeback. We also * need to pin the cache object to write back to. */ -static int v9fs_set_page_dirty(struct page *page) +static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct v9fs_inode *v9inode = V9FS_I(page->mapping->host); + struct v9fs_inode *v9inode = V9FS_I(mapping->host); - return fscache_set_page_dirty(page, v9fs_inode_cookie(v9inode)); + return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); } #else -#define v9fs_set_page_dirty __set_page_dirty_nobuffers +#define v9fs_dirty_folio filemap_dirty_folio #endif const struct address_space_operations v9fs_addr_operations = { - .readpage = v9fs_vfs_readpage, - .readahead = v9fs_vfs_readahead, - .set_page_dirty = v9fs_set_page_dirty, + .readpage = netfs_readpage, + .readahead = netfs_readahead, + .dirty_folio = v9fs_dirty_folio, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, .write_end = v9fs_write_end, .releasepage = v9fs_release_page, - .invalidatepage = v9fs_invalidate_page, - .launder_page = v9fs_launder_page, + .invalidate_folio = v9fs_invalidate_folio, + .launder_folio = v9fs_launder_folio, .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2a10242c79c7..55367ecb9442 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -228,12 +228,9 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) { struct v9fs_inode *v9inode; - v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL); + v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; -#endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); @@ -250,6 +247,14 @@ void v9fs_free_inode(struct inode *inode) kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } +/* + * Set parameters for the netfs library + */ +static void v9fs_set_netfs_context(struct inode *inode) +{ + netfs_i_context_init(inode, &v9fs_req_ops); +} + int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev) { @@ -338,6 +343,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } + + v9fs_set_netfs_context(inode); error: return err; diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb..30b751c7f11a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,7 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) + depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX help @@ -344,7 +344,7 @@ config LOCKD config LOCKD_V4 bool - depends on NFSD_V3 || NFS_V3 + depends on NFSD || NFS_V3 depends on FILE_LOCKING default y diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4d5ae61580aa..21c6332fa785 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -28,6 +28,16 @@ config BINFMT_ELF ld.so (check the file <file:Documentation/Changes> for location and latest version). +config BINFMT_ELF_KUNIT_TEST + bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS + depends on KUNIT=y && BINFMT_ELF=y + default KUNIT_ALL_TESTS + help + This builds the ELF loader KUnit tests, which try to gather + prior bug fixes into a regression test collection. This is really + only needed for debugging. Note that with CONFIG_COMPAT=y, the + compat_binfmt_elf KUnit test is also created. + config COMPAT_BINFMT_ELF def_bool y depends on COMPAT && BINFMT_ELF @@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_BINFMT_ELF_EXTRA_PHDRS + bool + config ARCH_HAVE_ELF_PROT bool diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5156821bfe6a..561bc748c04a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -73,7 +73,8 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations adfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = adfs_readpage, .writepage = adfs_writepage, .write_begin = adfs_write_begin, diff --git a/fs/adfs/super.c b/fs/adfs/super.c index bdbd26e571ed..e8bfc38239cd 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/file.c b/fs/affs/file.c index 75ebd2b576ca..b3f81d84ff4c 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -453,7 +453,8 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations affs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, @@ -834,7 +835,8 @@ err_bh: } const struct address_space_operations affs_aops_ofs = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = affs_readpage_ofs, //.writepage = affs_writepage_ofs, .write_begin = affs_write_begin_ofs, diff --git a/fs/affs/super.c b/fs/affs/super.c index c609005a9eaa..4c5f30a83336 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb) { struct affs_inode_info *i; - i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL); if (!i) return NULL; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index da9b4f8577a1..932e61e28e5d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -42,10 +42,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); -static void afs_dir_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); +static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, + size_t length); -static int afs_dir_set_page_dirty(struct page *page) +static bool afs_dir_dirty_folio(struct address_space *mapping, + struct folio *folio) { BUG(); /* This should never happen. */ } @@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .set_page_dirty = afs_dir_set_page_dirty, + .dirty_folio = afs_dir_dirty_folio, .releasepage = afs_dir_releasepage, - .invalidatepage = afs_dir_invalidatepage, + .invalidate_folio = afs_dir_invalidate_folio, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -2019,13 +2020,12 @@ static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags) /* * Invalidate part or all of a folio. */ -static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset, - unsigned int length) +static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(subpage); struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{%lu},%u,%u", folio_index(folio), offset, length); + _enter("{%lu},%zu,%zu", folio->index, offset, length); BUG_ON(!folio_test_locked(folio)); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index db832cc931c8..f120bcb8bf73 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); + netfs_i_context_init(inode, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index 720818a7c166..26292a110a8f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,13 +19,11 @@ #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_readpage(struct file *file, struct page *page); static int afs_symlink_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); +static void afs_invalidate_folio(struct folio *folio, size_t offset, + size_t length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static void afs_readahead(struct readahead_control *ractl); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); @@ -52,12 +50,12 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { - .readpage = afs_readpage, - .readahead = afs_readahead, - .set_page_dirty = afs_set_page_dirty, - .launder_page = afs_launder_page, + .readpage = netfs_readpage, + .readahead = netfs_readahead, + .dirty_folio = afs_dirty_folio, + .launder_folio = afs_launder_folio, .releasepage = afs_releasepage, - .invalidatepage = afs_invalidatepage, + .invalidate_folio = afs_invalidate_folio, .write_begin = afs_write_begin, .write_end = afs_write_end, .writepage = afs_writepage, @@ -67,7 +65,7 @@ const struct address_space_operations afs_file_aops = { const struct address_space_operations afs_symlink_aops = { .readpage = afs_symlink_readpage, .releasepage = afs_releasepage, - .invalidatepage = afs_invalidatepage, + .invalidate_folio = afs_invalidate_folio, }; static const struct vm_operations_struct afs_vm_ops = { @@ -240,7 +238,7 @@ void afs_put_read(struct afs_read *req) static void afs_fetch_data_notify(struct afs_operation *op) { struct afs_read *req = op->fetch.req; - struct netfs_read_subrequest *subreq = req->subreq; + struct netfs_io_subrequest *subreq = req->subreq; int error = op->error; if (error == -ECONNABORTED) @@ -310,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) return afs_do_sync_operation(op); } -static void afs_req_issue_op(struct netfs_read_subrequest *subreq) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct afs_read *fsreq; @@ -359,19 +357,13 @@ static int afs_symlink_readpage(struct file *file, struct page *page) return ret; } -static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { rreq->netfs_priv = key_get(afs_file_key(file)); + return 0; } -static bool afs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - - return fscache_cookie_enabled(cookie) && cookie->cache_priv; -} - -static int afs_begin_cache_operation(struct netfs_read_request *rreq) +static int afs_begin_cache_operation(struct netfs_io_request *rreq) { #ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); @@ -396,27 +388,14 @@ static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) key_put(netfs_priv); } -const struct netfs_read_request_ops afs_req_ops = { - .init_rreq = afs_init_rreq, - .is_cache_enabled = afs_is_cache_enabled, +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, - .issue_op = afs_req_issue_op, + .issue_read = afs_issue_read, .cleanup = afs_priv_cleanup, }; -static int afs_readpage(struct file *file, struct page *page) -{ - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &afs_req_ops, NULL); -} - -static void afs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &afs_req_ops, NULL); -} - int afs_write_inode(struct inode *inode, struct writeback_control *wbc) { fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); @@ -427,8 +406,8 @@ int afs_write_inode(struct inode *inode, struct writeback_control *wbc) * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. */ -static void afs_invalidate_dirty(struct folio *folio, unsigned int offset, - unsigned int length) +static void afs_invalidate_dirty(struct folio *folio, size_t offset, + size_t length) { struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); unsigned long priv; @@ -485,16 +464,14 @@ full_invalidate: * - release a page and clean up its private data if offset is 0 (indicating * the entire page) */ -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void afs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(page); - - _enter("{%lu},%u,%u", folio_index(folio), offset, length); + _enter("{%lu},%zu,%zu", folio->index, offset, length); - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (PagePrivate(page)) + if (folio_get_private(folio)) afs_invalidate_dirty(folio, offset, length); folio_wait_fscache(folio); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 5964f8aee090..2fe402483ad5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren } /* + * Set parameters for the netfs library + */ +static void afs_set_netfs_context(struct afs_vnode *vnode) +{ + netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); +} + +/* * Initialise an inode from the vnode status. */ static int afs_inode_init_from_status(struct afs_operation *op, @@ -128,6 +136,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, } afs_set_i_size(vnode, status->size); + afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); @@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ + vnode->netfs_ctx.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -420,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->cache = NULL; + vnode->netfs_ctx.cache = NULL; return; } @@ -430,12 +440,14 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); afs_set_cache_aux(vnode, &aux); - vnode->cache = fscache_acquire_cookie( - vnode->volume->cache, - vnode->status.type == AFS_FTYPE_FILE ? 0 : FSCACHE_ADV_SINGLE_CHUNK, - &key, sizeof(key), - &aux, sizeof(aux), - vnode->status.size); + afs_vnode_set_cache(vnode, + fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? + 0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + vnode->status.size)); #endif } @@ -528,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) vnode = AFS_FS_I(inode); vnode->cb_v_break = as->volume->cb_v_break, + afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); if (IS_ERR(op)) { @@ -786,11 +799,8 @@ void afs_evict_inode(struct inode *inode) afs_put_wb_key(wbk); } -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, + fscache_relinquish_cookie(afs_vnode_cache(vnode), test_bit(AFS_VNODE_DELETED, &vnode->flags)); - vnode->cache = NULL; -#endif afs_prune_wb_keys(vnode); afs_put_permits(rcu_access_pointer(vnode->permit_cache)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b6f02321fc09..7b7ef945dc78 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -207,7 +207,7 @@ struct afs_read { loff_t file_size; /* File size returned by server */ struct key *key; /* The key to use to reissue the read */ struct afs_vnode *vnode; /* The file being read into. */ - struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ + struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; unsigned int call_debug_id; @@ -619,15 +619,16 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct inode vfs_inode; /* the VFS's inode record */ + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ @@ -674,12 +675,20 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return vnode->cache; + return netfs_i_cookie(&vnode->vfs_inode); #else return NULL; #endif } +static inline void afs_vnode_set_cache(struct afs_vnode *vnode, + struct fscache_cookie *cookie) +{ +#ifdef CONFIG_AFS_FSCACHE + vnode->netfs_ctx.cache = cookie; +#endif +} + /* * cached security record for one user's attempt to access a vnode */ @@ -1063,7 +1072,7 @@ extern const struct address_space_operations afs_file_aops; extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; -extern const struct netfs_read_request_ops afs_req_ops; +extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); @@ -1521,9 +1530,9 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); * write.c */ #ifdef CONFIG_AFS_FSCACHE -extern int afs_set_page_dirty(struct page *); +bool afs_dirty_folio(struct address_space *, struct folio *); #else -#define afs_set_page_dirty __set_page_dirty_nobuffers +#define afs_dirty_folio filemap_dirty_folio #endif extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1537,7 +1546,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -extern int afs_launder_page(struct page *); +int afs_launder_folio(struct folio *); /* * xattr.c diff --git a/fs/afs/super.c b/fs/afs/super.c index 5ec9fd97eccc..1fea195b0b27 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -679,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) { struct afs_vnode *vnode; - vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); + vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL); if (!vnode) return NULL; @@ -688,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb) /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); + afs_vnode_set_cache(vnode, NULL); vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = NULL; -#endif vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e9157d0da29..4763132ca57e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -22,9 +22,10 @@ static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len * Mark a page as having been made dirty and thus needing writeback. We also * need to pin the cache object to write back to. */ -int afs_set_page_dirty(struct page *page) +bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) { - return fscache_set_page_dirty(page, afs_vnode_cache(AFS_FS_I(page->mapping->host))); + return fscache_dirty_folio(mapping, folio, + afs_vnode_cache(AFS_FS_I(mapping->host))); } static void afs_folio_start_fscache(bool caching, struct folio *folio) { @@ -59,8 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata, - &afs_req_ops, NULL); + ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata); if (ret < 0) return ret; @@ -354,9 +354,10 @@ static const struct afs_operation_ops afs_store_data_operation = { static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, bool laundering) { + struct netfs_i_context *ictx = &vnode->netfs_ctx; struct afs_operation *op; struct afs_wb_key *wbk = NULL; - loff_t size = iov_iter_count(iter), i_size; + loff_t size = iov_iter_count(iter); int ret = -ENOKEY; _enter("%s{%llx:%llu.%u},%llx,%llx", @@ -378,15 +379,13 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t return -ENOMEM; } - i_size = i_size_read(&vnode->vfs_inode); - afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; op->file[0].modification = true; op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, i_size); + op->store.i_size = max(pos + size, ictx->remote_i_size); op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; @@ -617,8 +616,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(afs_vnode_cache(vnode), - mapping, start, len, caching); + fscache_clear_page_bits(mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } @@ -703,7 +701,7 @@ static int afs_writepages_region(struct address_space *mapping, struct folio *folio; struct page *head_page; ssize_t ret; - int n; + int n, skips = 0; _enter("%llx,%llx,", start, end); @@ -754,8 +752,15 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif + } else { + start += folio_size(folio); } folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } continue; } @@ -972,9 +977,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* * Clean up a page during invalidation. */ -int afs_launder_page(struct page *subpage) +int afs_launder_folio(struct folio *folio) { - struct folio *folio = page_folio(subpage); struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; struct bio_vec bv[1]; @@ -982,7 +986,7 @@ int afs_launder_page(struct page *subpage) unsigned int f, t; int ret = 0; - _enter("{%lx}", folio_index(folio)); + _enter("{%lx}", folio->index); priv = (unsigned long)folio_get_private(folio); if (folio_clear_dirty_for_io(folio)) { @@ -478,7 +478,7 @@ out: #endif static const struct address_space_operations aio_ctx_aops = { - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, #if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, #endif @@ -1478,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; - req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then @@ -1553,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) return -EINVAL; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c1ba13d19024..b4b3567ac655 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb) { struct befs_inode_info *bi; - bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 7f8544abf636..03139344568f 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -188,7 +188,8 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations bfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = bfs_readpage, .writepage = bfs_writepage, .write_begin = bfs_write_begin, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fd691e4815c5..1926bec2c850 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; - bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9e11e6f13e83..63c7ebb0da89 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, +#ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) @@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) @@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1074,12 +1075,12 @@ out_free_interp: vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1116,11 +1117,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (interpreter || alignment > ELF_MIN_ALIGN) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; @@ -1135,14 +1136,25 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - } - /* - * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the - * initial mapping is performed.) - */ - if (!load_addr_set) { + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. + * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1159,16 +1171,25 @@ out_free_interp: goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } + + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. + */ + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1204,6 +1225,7 @@ out_free_interp: } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1267,8 +1289,8 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; @@ -1619,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1651,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1675,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1688,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1744,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task, static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, - long signr, size_t *total) + long signr, struct elf_note_info *info) { - unsigned int i; + unsigned int note_iter, view_iter; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1760,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); - *total += notesize(&t->notes[0]); + info->size += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); /* * Each other regset might generate a note too. For each regset - * that has no core_note_type or is inactive, we leave t->notes[i] - * all zero and we'll know to skip writing it later. + * that has no core_note_type or is inactive, skip it. */ - for (i = 1; i < view->n; ++i) { - const struct user_regset *regset = &view->regsets[i]; + note_iter = 1; + for (view_iter = 1; view_iter < view->n; ++view_iter) { + const struct user_regset *regset = &view->regsets[view_iter]; int note_type = regset->core_note_type; bool is_fpreg = note_type == NT_PRFPREG; void *data; @@ -1786,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (ret < 0) continue; + if (WARN_ON_ONCE(note_iter >= info->thread_notes)) + break; + if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", note_type, ret, data); - *total += notesize(&t->notes[i]); + info->size += notesize(&t->notes[note_iter]); + note_iter++; } return 1; @@ -1800,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1872,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info)) return 0; /* @@ -1881,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); - fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2029,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct core_thread *ct; struct elf_thread_status *ets; @@ -2050,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_for_each_entry(ets, &info->thread_list, list) { int sz; - sz = elf_dump_thread_status(siginfo->si_signo, ets); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, regs); + fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2072,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } /* Try to dump the FPU. */ - info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, - info->fpu); + info->prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); @@ -2169,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2178,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2201,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; @@ -2226,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2246,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2284,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm) /* Align to page */ dump_skip_to(cprm, dataoff); - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2302,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } @@ -2324,3 +2344,7 @@ static void __exit exit_elf_binfmt(void) core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); MODULE_LICENSE("GPL"); + +#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST +#include "binfmt_elf_test.c" +#endif diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c6f588dc4a9d..08d0c8797828 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = { .load_binary = load_elf_fdpic_binary, #ifdef CONFIG_ELF_CORE .core_dump = elf_fdpic_core_dump, -#endif .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; static int __init init_elf_fdpic_binfmt(void) @@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->signal->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dump_skip_to(cprm, dataoff); - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1652,7 +1647,6 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c new file mode 100644 index 000000000000..11d734fec366 --- /dev/null +++ b/fs/binfmt_elf_test.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> + +static void total_mapping_size_test(struct kunit *test) +{ + struct elf_phdr empty[] = { + { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, }, + { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, }, + }; + /* + * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \ + * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}' + */ + struct elf_phdr mount[] = { + { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, }, + { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, }, + { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, }, + { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, }, + { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, }, + }; + size_t mount_size = 0xE070; + /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */ + struct elf_phdr unordered[] = { + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + }; + + /* No headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0); + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0); + /* Empty headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0); + /* No PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0); + /* Empty PT_LOAD and non-PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0); + + /* Normal set of PT_LOADS, and expected size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size); + /* Unordered PT_LOADs result in same size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size); +} + +static struct kunit_case binfmt_elf_test_cases[] = { + KUNIT_CASE(total_mapping_size_test), + {}, +}; + +static struct kunit_suite binfmt_elf_test_suite = { + .name = KBUILD_MODNAME, + .test_cases = binfmt_elf_test_cases, +}; + +kunit_test_suite(binfmt_elf_test_suite); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d776f80ee50..626898150011 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -37,6 +37,7 @@ #include <linux/flat.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/coredump.h> #include <asm/byteorder.h> #include <asm/unaligned.h> @@ -97,13 +98,17 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *); +#ifdef CONFIG_COREDUMP static int flat_core_dump(struct coredump_params *cprm); +#endif static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, +#ifdef CONFIG_COREDUMP .core_dump = flat_core_dump, .min_coredump = PAGE_SIZE +#endif }; /****************************************************************************/ @@ -112,12 +117,14 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. */ +#ifdef CONFIG_COREDUMP static int flat_core_dump(struct coredump_params *cprm) { pr_warn("Process %s:%d received signr %d and should have core dumped\n", current->comm, current->pid, cprm->siginfo->si_signo); return 1; } +#endif /****************************************************************************/ /* diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 4188ba3fd8c3..99f9995670ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags) subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value obj-$(CONFIG_BTRFS_FS) := btrfs.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c9ee579bc5a6..ebc392ea1d74 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -1335,7 +1337,8 @@ again: if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 8202ad6aa131..0dd6de994199 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1522,8 +1522,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); return; + } /* * Long running balances can keep us blocked here for eternity, so @@ -1531,6 +1535,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1605,6 +1610,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -2006,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); @@ -2288,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2444,6 +2451,27 @@ next: btrfs_trans_release_chunk_metadata(trans); } +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. + */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. */ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) @@ -2464,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; @@ -2473,12 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2693,7 +2717,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, cache->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); @@ -2916,7 +2940,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2982,7 +3005,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3083,7 +3105,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3141,7 +3162,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3425,7 +3445,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3512,7 +3532,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3627,10 +3651,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3720,9 +3751,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5878b7ce3b78..e8308f2ad07d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,15 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, }; struct btrfs_caching_control { @@ -68,6 +72,7 @@ struct btrfs_block_group { u64 bytes_super; u64 flags; u64 cache_generation; + u64 global_root_id; /* * If the free space extent count exceeds this number, convert the block diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b3e46aabc3d8..47e72d72f7d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -14,6 +14,13 @@ #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -173,8 +180,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) spin_unlock(&inode->lock); } +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..abac86a75840 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 71e5b2e9a1ba..19bf36d8ffea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b bi_size += bvec->bv_len; if (bio->bi_status) - cb->errors = 1; + cb->status = bio->bi_status; ASSERT(bi_size && bi_size <= cb->compressed_len); last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, @@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b return last_io; } -static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; @@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi } /* Do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); + if (cb->status != BLK_STS_OK) { + cb->orig_bio->bi_status = cb->status; + bio_endio(cb->orig_bio); } else { struct bio_vec *bvec; struct bvec_iter_all iter_all; - ASSERT(bio); - ASSERT(!bio->bi_status); /* * We have verified the checksum already, set page checked so * the end_io handlers know about it */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); + ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { u64 bvec_start = page_offset(bvec->bv_page) + bvec->bv_offset; @@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio) * Some IO in this cb have failed, just skip checksum as there * is no way it could be correct. */ - if (cb->errors == 1) + if (cb->status != BLK_STS_OK) goto csum_failed; inode = cb->inode; @@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) - cb->errors = 1; - finish_compressed_bio_read(cb, bio); + cb->status = errno_to_blk_status(ret); + finish_compressed_bio_read(cb); out: bio_put(bio); } @@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode, unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; + const int errno = blk_status_to_errno(cb->status); int i; int ret; - if (cb->errors) - mapping_set_error(inode->i_mapping, -EIO); + if (errno) + mapping_set_error(inode->i_mapping, errno); while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode, continue; } for (i = 0; i < ret; i++) { - if (cb->errors) + if (errno) SetPageError(pages[i]); btrfs_page_clamp_clear_writeback(fs_info, pages[i], cb->start, cb->len); @@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) */ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, - !cb->errors); + cb->status == BLK_STS_OK); - end_compressed_writeback(inode, cb); + if (cb->writeback) + end_compressed_writeback(inode, cb); /* Note, our inode could be gone now */ /* @@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css) + struct cgroup_subsys_state *blkcg_css, + bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; @@ -524,16 +526,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!cb) return BLK_STS_RESOURCE; refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; + cb->writeback = writeback; cb->orig_bio = NULL; cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int index = offset >> PAGE_SHIFT; @@ -552,6 +558,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio = NULL; goto finish_cb; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -591,7 +599,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) goto finish_cb; } @@ -609,6 +617,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, return 0; finish_cb: + if (blkcg_css) + kthread_associate_blkcg(NULL); + if (bio) { bio->bi_status = ret; bio_endio(bio); @@ -808,7 +819,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - blk_status_t ret = BLK_STS_RESOURCE; + blk_status_t ret; int faili = 0; u8 *sums; @@ -821,17 +832,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); - if (!em) - return BLK_STS_IOERR; + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); - if (!cb) + if (!cb) { + ret = BLK_STS_RESOURCE; goto out; + } refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = inode; cb->mirror_num = mirror_num; sums = cb->sums; @@ -851,8 +866,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; goto fail1; + } for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); @@ -938,7 +955,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = NULL; } } - return 0; + return BLK_STS_OK; fail2: while (faili >= 0) { @@ -951,6 +968,8 @@ fail1: kfree(cb); out: free_extent_map(em); + bio->bi_status = ret; + bio_endio(bio); return ret; finish_cb: if (comp_bio) { @@ -970,7 +989,7 @@ finish_cb: */ ASSERT(refcount_read(&cb->pending_sectors)); /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb, NULL); + finish_compressed_bio_read(cb); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 56eef0821e3e..ac5b20731d2a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -22,6 +22,8 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) @@ -52,8 +54,11 @@ struct compressed_bio { /* The compression algorithm for this bio */ u8 compress_type; + /* Whether this is a write for writeback. */ + bool writeback; + /* IO errors */ - u8 errors; + blk_status_t status; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -95,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css); + struct cgroup_subsys_state *blkcg_css, + bool writeback); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b..0eecf98d0abb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -846,9 +846,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - eb = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } return eb; @@ -1436,13 +1438,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); - if (!ret) { - *eb_ret = tmp; - return 0; + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + *eb_ret = tmp; + return 0; } /* @@ -1460,19 +1462,19 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); - if (!IS_ERR(tmp)) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. - */ - if (!extent_buffer_uptodate(tmp)) - ret = -EIO; - free_extent_buffer(tmp); - } else { - ret = PTR_ERR(tmp); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + free_extent_buffer(tmp); btrfs_release_path(p); return ret; @@ -2990,16 +2992,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; @@ -3224,7 +3221,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -3235,12 +3231,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(left); - if (free_space < data_size) { - ret = 1; - goto out; - } - if (check_sibling_keys(left, right)) { ret = -EUCLEAN; goto out; @@ -4170,24 +4160,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - u32 last_off; - u32 dsize = 0; int ret = 0; int wret; - int i; u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4227,24 +4215,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, fixup_low_keys(path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); - - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. + */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8992e0096163..b7631b88426e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -148,6 +149,8 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT }; #define BTRFS_BACKREF_REV_MAX 256 @@ -274,8 +277,14 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* Extent tree v2 */ + __le64 block_group_root; + __le64 block_group_root_generation; + u8 block_group_root_level; + /* future expansion */ - __le64 reserved[28]; + u8 reserved8[7]; + __le64 reserved[25]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -300,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -314,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -602,6 +632,9 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -633,6 +666,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1027,6 +1061,8 @@ struct btrfs_fs_info { spinlock_t relocation_bg_lock; u64 data_reloc_bg; + u64 nr_global_roots; + spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; @@ -1106,8 +1142,15 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. @@ -1599,25 +1642,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -1648,8 +1691,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } @@ -1657,8 +1700,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -2318,6 +2361,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); +/* + * For extent tree v2 we overload the extent root with the block group root, as + * we will have multiple extent roots. + */ +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, + struct btrfs_root_backup, extent_root_level, 8); + /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2452,6 +2506,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, + block_group_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, + struct btrfs_super_block, + block_group_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, + block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); @@ -2829,7 +2890,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -3145,7 +3207,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig); + u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, @@ -3246,6 +3308,11 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; @@ -3291,7 +3358,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); @@ -3308,6 +3375,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3764,7 +3833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3876,5 +3945,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fb46a28f5065..bd8267c4687d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, } static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) { u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, @@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, *qgroup_reserve = nr_extents * fs_info->nodesize; } -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends @@ -329,8 +331,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) return ret; @@ -349,7 +351,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -454,7 +456,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62b9651ea662..71fd99b48283 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; struct rcu_string *name; @@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } rcu_assign_pointer(device->name, name); + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; @@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); *device_out = device; return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..126f244cdf88 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ @@ -999,41 +1013,40 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, - "page private not zero on page %llu", - (unsigned long long)page_offset(page)); - detach_page_private(page); + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_releasepage(&folio->page, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); } } -static int btree_set_page_dirty(struct page *page) -{ #ifdef DEBUG - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; - u64 page_start = page_offset(page); + u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(subpage->dirty_bitmap); while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { @@ -1059,18 +1072,20 @@ static int btree_set_page_dirty(struct page *page) cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); } -#endif - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } +#else +#define btree_dirty_folio filemap_dirty_folio +#endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, + .invalidate_folio = btree_invalidate_folio, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif - .set_page_dirty = btree_set_page_dirty, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1289,12 +1304,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, return root; } +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1305,7 +1341,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1522,7 +1558,8 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = PTR_ERR(root->node); root->node = NULL; goto fail; - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } @@ -1727,6 +1764,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1812,9 +1850,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; @@ -1925,8 +1964,7 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { - struct btrfs_root *root = arg; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; int again; while (1) { @@ -1959,7 +1997,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(fs_info); - again = btrfs_clean_one_deleted_snapshot(root); + again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* @@ -2095,8 +2133,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); - struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +2157,30 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + btrfs_set_backup_block_group_root(root_backup, + info->block_group_root->node->start); + btrfs_set_backup_block_group_root_gen(root_backup, + btrfs_header_generation(info->block_group_root->node)); + btrfs_set_backup_block_group_root_level(root_backup, + btrfs_header_level(info->block_group_root->node)); + } else { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2146,12 +2201,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(csum_root->node)); - btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, @@ -2269,6 +2318,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2504,11 +2554,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; - } else if (!extent_buffer_uptodate(log_tree_root->node)) { + } + if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2533,6 +2585,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; + u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, @@ -2568,6 +2621,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, break; btrfs_release_path(path); + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. + */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { @@ -2585,6 +2645,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, } btrfs_release_path(path); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); @@ -2930,6 +2993,56 @@ out: return ret; } +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, + root->root_key.objectid, gen, level, NULL); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + bytenr = btrfs_super_block_group_root(sb); + gen = btrfs_super_block_group_root_generation(sb); + level = btrfs_super_block_group_root_level(sb); + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); + if (ret) + btrfs_warn(fs_info, "couldn't read block group root"); + return ret; +} + static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); @@ -2939,10 +3052,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - u64 generation; - int level; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + struct btrfs_root *root; + + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, + GFP_KERNEL); + if (!root) + return -ENOMEM; + fs_info->block_group_root = root; + } + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); @@ -2967,29 +3087,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; } - generation = btrfs_super_generation(sb); - level = btrfs_super_root_level(sb); - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), - BTRFS_ROOT_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(tree_root->node)) { - handle_error = true; - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - btrfs_warn(fs_info, "couldn't read tree root"); - continue; - } else if (!extent_buffer_uptodate(tree_root->node)) { + ret = load_important_roots(fs_info); + if (ret) { handle_error = true; - ret = -EIO; - btrfs_warn(fs_info, "error while reading tree root"); continue; } - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user @@ -3009,8 +3113,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* All successful */ - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -3293,7 +3397,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(fs_info->tree_root); + ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3594,21 +3698,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - - chunk_root->node = read_tree_block(fs_info, - btrfs_super_chunk_root(disk_super), - BTRFS_CHUNK_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(chunk_root->node) || - !extent_buffer_uptodate(chunk_root->node)) { + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { btrfs_err(fs_info, "failed to read chunk root"); - if (!IS_ERR(chunk_root->node)) - free_extent_buffer(chunk_root->node); - chunk_root->node = NULL; goto fail_tree_roots; } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), @@ -3728,7 +3823,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; @@ -3813,6 +3908,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4029,8 +4128,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,7 +4142,6 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; @@ -4154,10 +4253,8 @@ static void write_dev_flush(struct btrfs_device *device) return; #endif - bio_reset(bio); + bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; @@ -4538,6 +4635,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..2e10514ecda8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -111,6 +111,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 04083ee5ae6e..c3eb52dbe61c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..6aa92f84f465 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -598,7 +598,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); - *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1072,8 +1071,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1121,7 +1119,6 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1166,8 +1163,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1181,21 +1177,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else ret = btrfs_del_item(trans, root, path); - } return ret; } @@ -2766,12 +2758,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2862,6 +2853,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + /* * Drop one or more refs of @node. * @@ -2943,7 +2963,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); extent_root = btrfs_extent_root(info, bytenr); @@ -3010,8 +3029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, is_data, - &last_ref); + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3136,8 +3154,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, is_data, - &last_ref); + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3182,7 +3199,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3191,28 +3207,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - struct btrfs_root *csum_root; - csum_root = btrfs_csum_root(info, bytenr); - ret = btrfs_del_csums(trans, csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); @@ -4087,7 +4082,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) @@ -4605,6 +4600,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, @@ -4665,18 +4682,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4694,7 +4700,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4704,12 +4709,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); @@ -4754,22 +4757,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -5622,6 +5610,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5653,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5839,6 +5830,13 @@ out_free: btrfs_free_path(path); out: /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 409bad3928db..724e8fe06aa0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1507,17 +1507,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } } @@ -2610,6 +2610,7 @@ static bool btrfs_check_repairable(struct inode *inode, * a good copy of the failed sector and if we succeed, we have setup * everything for repair_io_failure to do the rest for us. */ + ASSERT(failed_mirror); failrec->failed_mirror = failed_mirror; failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) @@ -2639,7 +2640,6 @@ int btrfs_repair_one_sector(struct inode *inode, const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); @@ -2678,13 +2678,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. + */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -3068,6 +3068,14 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { /* + * If we failed to submit the IO at all we'll have a + * mirror_num == 0, in which case we need to just mark + * the page with an error and unlock it and carry on. + */ + if (mirror == 0) + goto readpage_ok; + + /* * btrfs_submit_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. */ @@ -3143,7 +3151,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) struct bio *bio; ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); btrfs_bio_init(btrfs_bio(bio)); return bio; } @@ -3154,7 +3162,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3169,7 +3177,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); bbio = btrfs_bio(bio); @@ -3321,7 +3329,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bio_flags = bio_flags; bio->bi_end_io = end_io_func; bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) @@ -3534,7 +3541,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3563,7 +3570,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3608,9 +3614,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { unlock_extent(tree, cur, end); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; @@ -3721,9 +3728,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_bio_extent_readpage, 0, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { + if (ret) { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); goto out; @@ -3951,7 +3956,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); break; @@ -4048,6 +4053,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4068,8 +4074,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } @@ -4780,11 +4786,12 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ - btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + btrfs_put_block_group(cache); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -5218,17 +5225,17 @@ void extent_readahead(struct readahead_control *rac) } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5238,7 +5245,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, @@ -5390,7 +5397,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, break; len = ALIGN(len, sectorsize); em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) + if (IS_ERR(em)) return em; /* if this isn't a hole return it */ @@ -6841,14 +6848,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..151e9da5da2d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -118,7 +118,7 @@ struct btrfs_bio_ctrl { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5a36add21305..6fee14ce2e6b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); free_extent_map(merge); } } @@ -490,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) @@ -504,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *new, int modified) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e217337dff9..d2fa32ffe304 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -25,6 +25,8 @@ enum { EXTENT_FLAG_FILLING, /* filesystem extent mapping type */ EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, }; struct extent_map { @@ -40,6 +42,12 @@ struct extent_map { u64 ram_bytes; u64 block_start; u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. + */ u64 generation; unsigned long flags; /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90c5c38836ab..c828f971a346 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -305,7 +305,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } @@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_bio *bbio = NULL; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; @@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; + blk_status_t ret = BLK_STS_OK; if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_bio *bbio = btrfs_bio(bio); + bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); - if (count <= 0) { - /* - * Either we hit a critical error or we didn't find - * the csum. - * Either way, we put zero into the csums dst, and skip - * to the next sector. - */ + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio) + btrfs_bio_free_csum(bbio); + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. + */ + if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; - /* - * For data reloc inode, we need to mark the range - * NODATASUM so that balance won't report false csum - * error. - */ if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; @@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } btrfs_free_path(path); - return BLK_STS_OK; + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -612,32 +620,33 @@ fail: return ret; } -/* - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio +/** + * Calculate checksums of the data contained inside a bio + * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed - * @file_start: offset in file this bio begins to describe - * @contig: Boolean. If true/1 means all bio vecs in this bio are - * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contain potentially discontiguous bio vecs - * so the logical offset of each should be calculated separately. + * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the + * file offsets are determined from the page offsets in the bio. + * Otherwise, this is the starting file offset of the bio vecs in + * @bio, which must be contiguous. + * @one_ordered: If true, @bio only refers to one ordered extent. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig) + u64 offset, bool one_ordered) { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; + const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; int index; - int nr_sectors; + unsigned int blockcount; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; int i; - u64 offset; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -651,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - if (contig) - offset = file_start; - else - offset = 0; /* shut up gcc */ - sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!contig) + if (use_page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { @@ -681,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } } - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); - for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->num_bytes || - offset < ordered->file_offset) { + for (i = 0; i < blockcount; i++) { + if (!one_ordered && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { unsigned long bytes_left; sums->len = this_sum_bytes; @@ -1211,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, extent_start = key.offset; extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 11204dbbe053..380054c94e4b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -50,11 +50,14 @@ struct inode_defrag { /* root objectid */ u64 root; - /* last offset we were able to defrag */ - u64 last_offset; - - /* if we've wrapped around back to zero once already */ - int cycled; + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, + * thus needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, @@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; - if (defrag->last_offset > entry->last_offset) - entry->last_offset = defrag->last_offset; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); return -EEXIST; } } @@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { @@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, } /* - * Requeue the defrag object. If there is a defrag object that points to - * the same inode in the tree, we will merge them together (by - * __btrfs_add_inode_defrag()) and free the one that we want to requeue. - */ -static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - - if (!__need_auto_defrag(fs_info)) - goto out; - - /* - * Here we don't check the IN_DEFRAG flag, because we need merge - * them together. - */ - spin_lock(&fs_info->defrag_inodes_lock); - ret = __btrfs_add_inode_defrag(inode, defrag); - spin_unlock(&fs_info->defrag_inodes_lock); - if (ret) - goto out; - return; -out: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -} - -/* * pick the defragable inode that we want, if it doesn't exist, we will get * the next one. */ @@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_root *inode_root; struct inode *inode; struct btrfs_ioctl_defrag_range_args range; - int num_defrag; - int ret; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; /* get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); @@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, goto cleanup; } + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; - range.start = defrag->last_offset; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; sb_start_write(fs_info->sb); - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == BTRFS_DEFRAG_BATCH) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - iput(inode); - return 0; + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; @@ -718,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs; int found = 0; - int leafs_visited = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; @@ -756,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; - leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -768,7 +739,6 @@ next_slot: ret = 0; break; } - leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -1014,7 +984,7 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && args->replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { @@ -1749,7 +1719,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, + reserve_bytes); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -2066,12 +2037,43 @@ out: return err < 0 ? err : written; } -static ssize_t btrfs_file_write_iter(struct kiocb *iocb, - struct iov_iter *from) +static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t count; + ssize_t ret; + + btrfs_inode_lock(inode, 0); + count = encoded->len; + ret = generic_write_checks_count(iocb, &count); + if (ret == 0 && count != encoded->len) { + /* + * The write got truncated by generic_write_checks_count(). We + * can't do a partial encoded write. + */ + ret = -EFBIG; + } + if (ret || encoded->len == 0) + goto out; + + ret = btrfs_write_check(iocb, from, encoded->len); + if (ret < 0) + goto out; + + ret = btrfs_do_encoded_write(iocb, from, encoded); +out: + btrfs_inode_unlock(inode, 0); + return ret; +} + +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); - ssize_t num_written = 0; + ssize_t num_written, num_sync; const bool sync = iocb->ki_flags & IOCB_DSYNC; /* @@ -2082,22 +2084,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; if (sync) atomic_inc(&inode->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) - num_written = btrfs_direct_write(iocb, from); - else - num_written = btrfs_buffered_write(iocb, from); + if (encoded) { + num_written = btrfs_encoded_write(iocb, from, encoded); + num_sync = encoded->len; + } else if (iocb->ki_flags & IOCB_DIRECT) { + num_written = num_sync = btrfs_direct_write(iocb, from); + } else { + num_written = num_sync = btrfs_buffered_write(iocb, from); + } btrfs_set_inode_last_sub_trans(inode); - if (num_written > 0) - num_written = generic_write_sync(iocb, num_written); + if (num_sync > 0) { + num_sync = generic_write_sync(iocb, num_sync); + if (num_sync < 0) + num_written = num_sync; + } if (sync) atomic_dec(&inode->sync_writers); @@ -2106,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return num_written; } +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + return btrfs_do_write_iter(iocb, from, NULL); +} + int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; @@ -2501,7 +2514,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2522,8 +2535,7 @@ out: } while (ret == -EEXIST); free_extent_map(hole_em); if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } return 0; @@ -2877,7 +2889,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; @@ -2945,8 +2957,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2978,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3418,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); + return btrfs_punch_hole(file, offset, len); /* * Only trigger disk allocation, don't trigger qgroup reserve @@ -3440,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 655aad0f9e1c..0ae54d8c10d6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root( .offset = 0, }; + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) + key.offset = block_group->global_root_id; return btrfs_global_root(block_group->fs_info, &key); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b2403b6127f..5082b9c70f8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,11 @@ struct btrfs_dio_data { struct extent_changeset *data_reserved; }; +struct btrfs_rename_ctx { + /* Output field. Stores the index number of the old directory entry. */ + u64 index; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, bool extent_inserted, - struct btrfs_root *root, struct inode *inode, - u64 start, size_t size, size_t compressed_size, + struct btrfs_path *path, + struct btrfs_inode *inode, bool extent_inserted, + size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; - unsigned long offset; + u64 i_size; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); @@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; size_t datasize; - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = start; + key.objectid = btrfs_ino(inode); + key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); @@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->i_mapping, - start >> PAGE_SHIFT); + page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = offset_in_page(start); - write_extent_buffer(leaf, kaddr + offset, ptr, size); + write_extent_buffer(leaf, kaddr, ptr, size); kunmap_atomic(kaddr); put_page(page); } @@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * We align size to sectorsize for inline extents just for simplicity * sake. */ - size = ALIGN(size, root->fs_info->sectorsize); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + ret = btrfs_inode_set_file_extent_range(inode, 0, + ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* - * we're an inline extent, so nobody can - * extend the file past i_size without locking - * a page we already have locked. + * We're an inline extent, so nobody can extend the file past i_size + * without locking a page we already have locked. * - * We must do any isize and inode updates - * before we unlock the pages. Otherwise we - * could end up racing with unlink. + * We must do any i_size and inode updates before we unlock the pages. + * Otherwise we could end up racing with unlink. */ - BTRFS_I(inode)->disk_i_size = inode->i_size; + i_size = i_size_read(&inode->vfs_inode); + if (update_i_size && size > i_size) { + i_size_write(&inode->vfs_inode, size); + i_size = size; + } + inode->disk_i_size = i_size; + fail: return ret; } @@ -339,35 +348,31 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, - u64 end, size_t compressed_size, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(&inode->vfs_inode); - u64 actual_end = min(end + 1, isize); - u64 inline_len = actual_end - start; - u64 aligned_end = ALIGN(end, fs_info->sectorsize); - u64 data_len = inline_len; + u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; - if (compressed_size) - data_len = compressed_size; - - if (start > 0 || - actual_end > fs_info->sectorsize || + /* + * We can create an inline extent if it ends at or beyond the current + * i_size, is no larger than a sector (decompressed), and the (possibly + * compressed) data fits in a leaf and the configured maximum inline + * size. + */ + if (size < i_size_read(&inode->vfs_inode) || + size > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - (!compressed_size && - (actual_end & (fs_info->sectorsize - 1)) == 0) || - end + 1 < isize || - data_len > fs_info->max_inline) { + data_len > fs_info->max_inline) return 1; - } path = btrfs_alloc_path(); if (!path) @@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, trans->block_rsv = &inode->block_rsv; drop_args.path = path; - drop_args.start = start; - drop_args.end = aligned_end; + drop_args.start = 0; + drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; - - if (compressed_size && compressed_pages) - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - compressed_size); - else - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - inline_len); - + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - if (isize > actual_end) - inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, drop_args.extent_inserted, - root, &inode->vfs_inode, start, - inline_len, compressed_size, - compress_type, compressed_pages); + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, + size, compressed_size, compress_type, + compressed_pages, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -560,12 +555,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, } static inline void inode_should_defrag(struct btrfs_inode *inode, - u64 start, u64 end, u64 num_bytes, u64 small_write) + u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + btrfs_add_inode_defrag(NULL, inode, small_write); } /* @@ -624,7 +619,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED / PAGE_SIZE); @@ -735,14 +729,15 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 0, BTRFS_COMPRESS_NONE, - NULL); + NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, total_compressed, - compress_type, pages); + compress_type, pages, + false); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | @@ -981,11 +976,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ - ins.objectid, /* disk_bytenr */ - async_extent->ram_size, /* num_bytes */ - ins.offset, /* disk_num_bytes */ - async_extent->compress_type); + ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + async_extent->ram_size, /* ram_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* disk_num_bytes */ + 0, /* offset */ + 1 << BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, start, end, 0); goto out_free_reserve; @@ -1003,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, - async_chunk->blkcg_css)) { + async_chunk->blkcg_css, true)) { const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1130,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } @@ -1152,9 +1149,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * So here we skip inline extent creation completely. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), + end + 1); + /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the @@ -1234,9 +1234,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, - BTRFS_ORDERED_REGULAR); + ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, + ins.objectid, cur_alloc_size, 0, + 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); if (ret) goto out_drop_extent_cache; @@ -1895,10 +1896,11 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_PREALLOC); + ret = btrfs_add_ordered_extent(inode, + cur_offset, num_bytes, num_bytes, + disk_bytenr, num_bytes, 0, + 1 << BTRFS_ORDERED_PREALLOC, + BTRFS_COMPRESS_NONE); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, cur_offset + num_bytes - 1, @@ -1907,9 +1909,11 @@ out_check: } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, + num_bytes, num_bytes, disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_NOCOW); + 0, + 1 << BTRFS_ORDERED_NOCOW, + BTRFS_COMPRESS_NONE); if (ret) goto error; } @@ -2012,8 +2016,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * to use run_delalloc_nocow() here, like for regular * preallocated inodes. */ - ASSERT(!zoned || - (zoned && btrfs_is_data_reloc_root(inode->root))); + ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@ -2310,7 +2313,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } /* @@ -2538,10 +2541,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { + /* + * btrfs_submit_compressed_read will handle completing + * the bio if there were any errors, so just return + * here. + */ ret = btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); - goto out; + goto out_no_endio; } else { /* * Lookup bio sums does extra checks around whether we @@ -2562,7 +2570,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); if (ret) goto out; } @@ -2575,6 +2583,7 @@ out: bio->bi_status = ret; bio_endio(bio); } +out_no_endio: return ret; } @@ -2870,6 +2879,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; @@ -2944,7 +2954,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos, qgroup_reserved, &ins); + file_pos - offset, + qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2970,20 +2981,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; - u64 logical_len; bool update_inode_bytes; + u64 num_bytes = oe->num_bytes; + u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - logical_len = oe->truncated_len; - else - logical_len = oe->num_bytes; - btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); - btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + num_bytes = ram_bytes = oe->truncated_len; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ @@ -2994,6 +3005,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), @@ -3028,7 +3040,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); @@ -4062,7 +4075,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const char *name, int name_len, + struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4118,15 +4132,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } skip_backref: + if (rename_ctx) + rename_ctx->index = index; + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unncessary btree + * operations on the log tree, increasing latency for applications. + */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); + } /* * If we have a pending delayed iput we could end up with the final iput @@ -4158,7 +4184,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4460,6 +4486,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -4565,14 +4598,21 @@ out_up_write: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } return btrfs_delete_subvolume(dir, dentry); + } trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4611,7 +4651,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); + btrfs_btree_balance_dirty(fs_info); return err; } @@ -4664,7 +4704,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -4876,8 +4916,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; @@ -5046,16 +5085,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } /* - * While truncating the inode pages during eviction, we get the VFS calling - * btrfs_invalidatepage() against each page of the inode. This is slow because - * the calls to btrfs_invalidatepage() result in a huge amount of calls to - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting - * extent_state structures over and over, wasting lots of time. + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. * - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all - * those expensive operations on a per page basis and do only the ordered io - * finishing, while we release here the extent_map and extent_state structures, - * without the excessive merging and splitting. + * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { @@ -5121,7 +5161,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. - * (Refer to comment in btrfs_invalidatepage, case 2) + * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ @@ -5584,21 +5624,17 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); +static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); +static_assert(BTRFS_FT_DIR == FT_DIR); +static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); +static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); +static_assert(BTRFS_FT_FIFO == FT_FIFO); +static_assert(BTRFS_FT_SOCK == FT_SOCK); +static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); + static inline u8 btrfs_inode_type(struct inode *inode) { - /* - * Compile-time asserts that generic FT_* types still match - * BTRFS_FT_* types - */ - BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); - BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); - BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); - BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); - BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); - BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); - BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); - BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); - return fs_umode_to_ftype(inode->i_mode); } @@ -5971,14 +6007,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) goto out; ret = 0; - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ if (path->slots[0] == 0) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -5989,7 +6019,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -6140,7 +6170,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; @@ -6537,7 +6567,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: @@ -7040,8 +7070,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, - block_len, type); + ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, + block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); if (ret) { if (em) { free_extent_map(em); @@ -7410,6 +7443,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, u64 block_start, orig_start, orig_block_len, ram_bytes; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7437,11 +7471,12 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, can_nocow = true; } + prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); @@ -7466,8 +7501,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; } } else { - const u64 prev_len = len; - /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; @@ -7498,7 +7531,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered @@ -7600,6 +7633,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, len); @@ -7803,7 +7864,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct bio *bio) @@ -7860,7 +7921,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); if (ret) goto err; } else { @@ -8076,8 +8137,13 @@ int btrfs_readpage(struct file *file, struct page *page) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) - ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); + if (bio_ctrl.bio) { + int ret2; + + ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); + if (ret == 0) + ret = ret2; + } return ret; } @@ -8118,8 +8184,8 @@ static void btrfs_readahead(struct readahead_control *rac) } /* - * For releasepage() and invalidatepage() we have a race window where - * end_page_writeback() is called but the subpage spinlock is not yet released. + * For releasepage() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. @@ -8195,48 +8261,48 @@ static int btrfs_migratepage(struct address_space *mapping, } #endif -static void btrfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* - * We have page locked so no new ordered extent can be created on this - * page, nor bio can be submitted for this page. + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. * - * But already submitted bio can still be finished on this page. - * Furthermore, endio function won't skip page which has Ordered + * But already submitted bio can still be finished on this folio. + * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and - * invalidatepage to do the same ordered extent accounting twice - * on one page. + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. * * So here we wait for any submitted bios to finish, so that we won't - * do double ordered extent accounting on the same page. + * do double ordered extent accounting on the same folio. */ - wait_on_page_writeback(page); - wait_subpage_spinlock(page); + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. - * If the range doesn't cover the full page, we don't need to and - * shouldn't clear page extent mapped, as page->private can still + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * - * For cases that can invalidate the full even the range doesn't - * cover the full page, like invalidating the last page, we're + * For cases that invalidate the full folio even the range doesn't + * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ - if (!(offset == 0 && length == PAGE_SIZE)) { - btrfs_releasepage(page, GFP_NOFS); + if (!(offset == 0 && length == folio_size(folio))) { + btrfs_releasepage(&folio->page, GFP_NOFS); return; } @@ -8277,7 +8343,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8287,7 +8353,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, delete_states = false; goto next; } - btrfs_page_clear_ordered(fs_info, page, cur, range_len); + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8357,11 +8423,11 @@ next: * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ - ASSERT(!PageOrdered(page)); - btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(page, GFP_NOFS); - clear_page_extent_mapped(page); + __btrfs_releasepage(&folio->page, GFP_NOFS); + clear_page_extent_mapped(&folio->page); } /* @@ -8706,7 +8772,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); return ret; } @@ -8759,7 +8825,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -9002,14 +9068,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); + struct btrfs_rename_ctx old_rename_ctx; + struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; - bool root_log_pinned = false; - bool dest_log_pinned = false; bool need_abort = false; /* @@ -9112,29 +9178,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } - /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. - * - * We pin the logs even if at this precise moment none of the inodes was - * logged before. This is because right after we checked for that, some - * other task fsyncing some other inode not involved with this rename - * operation could log that one of our inodes exists. - * - * We don't need to pin the logs before the above calls to - * btrfs_insert_inode_ref(), since those don't ever need to change a log. - */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(root); - root_log_pinned = true; - } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; - } - /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9142,7 +9185,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9158,7 +9202,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, - new_dentry->d_name.len); + new_dentry->d_name.len, + &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9188,46 +9233,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (root_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), - old_dentry->d_parent); - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } -out_fail: /* - * If we have pinned a log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. */ - if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) - btrfs_set_log_full_commit(trans); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); - if (root_log_pinned) { - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } - } + /* Do the log updates for all inodes. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + old_rename_ctx.index, new_dentry->d_parent); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), + new_rename_ctx.index, old_dentry->d_parent); + + /* Now unpin the logs. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9302,11 +9332,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); + struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); - bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9411,29 +9441,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { - /* - * Now pin the log. We do it to ensure that no other task can - * sync the log while we are in progress with the rename, as - * that could result in an inconsistency in case any of the - * inodes that are part of this rename operation were logged - * before. - * - * We pin the log even if at this precise moment none of the - * inodes was logged before. This is because right after we - * checked for that, some other task fsyncing some other inode - * not involved with this rename operation could log that one of - * our inodes exists. - * - * We don't need to pin the logs before the above call to - * btrfs_insert_inode_ref(), since that does not need to change - * a log. - */ - btrfs_pin_log_trans(root); - log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9475,12 +9487,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - log_pinned = false; - } + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, @@ -9492,28 +9501,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } } out_fail: - /* - * If we have pinned the log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. - */ - if (ret && log_pinned) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(trans); - - btrfs_end_log_trans(root); - log_pinned = false; - } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9993,8 +9980,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10076,11 +10062,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { @@ -10182,6 +10163,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } +static int btrfs_encoded_io_compression_from_extent( + struct btrfs_fs_info *fs_info, + int compress_type) +{ + switch (compress_type) { + case BTRFS_COMPRESS_NONE: + return BTRFS_ENCODED_IO_COMPRESSION_NONE; + case BTRFS_COMPRESS_ZLIB: + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; + case BTRFS_COMPRESS_LZO: + /* + * The LZO format depends on the sector size. 64K is the maximum + * sector size that we support. + */ + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) + return -EINVAL; + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + + (fs_info->sectorsize_bits - 12); + case BTRFS_COMPRESS_ZSTD: + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; + default: + return -EUCLEAN; + } +} + +static ssize_t btrfs_encoded_read_inline( + struct kiocb *iocb, + struct iov_iter *iter, u64 start, + u64 lockend, + struct extent_state **cached_state, + u64 extent_start, size_t count, + struct btrfs_ioctl_encoded_io_args *encoded, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *item; + u64 ram_bytes; + unsigned long ptr; + void *tmp; + ssize_t ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + extent_start, 0); + if (ret) { + if (ret > 0) { + /* The extent item disappeared? */ + ret = -EIO; + } + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + ptr = btrfs_file_extent_inline_start(item); + + encoded->len = min_t(u64, extent_start + ram_bytes, + inode->vfs_inode.i_size) - iocb->ki_pos; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, item)); + if (ret < 0) + goto out; + encoded->compression = ret; + if (encoded->compression) { + size_t inline_size; + + inline_size = btrfs_file_extent_inline_item_len(leaf, + path->slots[0]); + if (inline_size > count) { + ret = -ENOBUFS; + goto out; + } + count = inline_size; + encoded->unencoded_len = ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - extent_start; + } else { + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + ptr += iocb->ki_pos - extent_start; + } + + tmp = kmalloc(count, GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(leaf, tmp, ptr, count); + btrfs_release_path(path); + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + ret = copy_to_iter(tmp, count, iter); + if (ret != count) + ret = -EFAULT; + kfree(tmp); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_encoded_read_private { + struct btrfs_inode *inode; + u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; + bool skip_csum; +}; + +static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, + struct bio *bio, int mirror_num) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + blk_status_t ret; + + if (!priv->skip_csum) { + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); + if (ret) + return ret; + } + + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) { + btrfs_bio_free_csum(bbio); + return ret; + } + + atomic_inc(&priv->pending); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) { + atomic_dec(&priv->pending); + btrfs_bio_free_csum(bbio); + } + return ret; +} + +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +{ + const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); + struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; + struct btrfs_inode *inode = priv->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + u64 start = priv->file_offset; + u32 bio_offset = 0; + + if (priv->skip_csum || !uptodate) + return bbio->bio.bi_status; + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + unsigned int i, nr_sectors, pgoff; + + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + for (i = 0; i < nr_sectors; i++) { + ASSERT(pgoff < PAGE_SIZE); + if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff, start)) + return BLK_STS_IOERR; + start += sectorsize; + bio_offset += sectorsize; + pgoff += sectorsize; + } + } + return BLK_STS_OK; +} + +static void btrfs_encoded_read_endio(struct bio *bio) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + blk_status_t status; + + status = btrfs_encoded_read_verify_csum(bbio); + if (status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ + WRITE_ONCE(priv->status, status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); + btrfs_bio_free_csum(bbio); + bio_put(bio); +} + +static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, + u64 disk_bytenr, + u64 disk_io_size, + struct page **pages) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; + int ret; + + init_waitqueue_head(&priv.wait); + /* + * Submit bios for the extent, splitting due to bio or stripe limits as + * necessary. + */ + while (cur < disk_io_size) { + struct extent_map *em; + struct btrfs_io_geometry geom; + struct bio *bio = NULL; + u64 remaining; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, + disk_io_size - cur); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + } else { + ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, + disk_bytenr + cur, &geom); + free_extent_map(em); + } + if (ret) { + WRITE_ONCE(priv.status, errno_to_blk_status(ret)); + break; + } + remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio->bi_iter.bi_sector = + (disk_bytenr + cur) >> SECTOR_SHIFT; + bio->bi_end_io = btrfs_encoded_read_endio; + bio->bi_private = &priv; + bio->bi_opf = REQ_OP_READ; + } + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { + blk_status_t status; + + status = submit_encoded_read_bio(inode, bio, 0); + if (status) { + WRITE_ONCE(priv.status, status); + bio_put(bio); + goto out; + } + bio = NULL; + continue; + } + + i++; + cur += bytes; + remaining -= bytes; + } + } + +out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + return blk_status_to_errno(READ_ONCE(priv.status)); +} + +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, + struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + unsigned long nr_pages, i; + u64 cur; + size_t page_offset; + ssize_t ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + ret = -ENOMEM; + goto out; + } + } + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages); + if (ret) + goto out; + + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + if (compressed) { + i = 0; + page_offset = 0; + } else { + i = (iocb->ki_pos - start) >> PAGE_SHIFT; + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < count) { + size_t bytes = min_t(size_t, count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(pages[i], page_offset, bytes, + iter) != bytes) { + ret = -EFAULT; + goto out; + } + i++; + cur += bytes; + page_offset = 0; + } + ret = count; +out: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + return ret; +} + +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + ssize_t ret; + size_t count = iov_iter_count(iter); + u64 start, lockend, disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + bool unlocked = false; + + file_accessed(iocb->ki_filp); + + btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + + if (iocb->ki_pos >= inode->vfs_inode.i_size) { + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return 0; + } + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); + /* + * We don't know how long the extent containing iocb->ki_pos is, but if + * it's compressed we know that it won't be longer than this. + */ + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + lock_extent_bits(io_tree, start, lockend, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, lockend, &cached_state); + cond_resched(); + } + + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_extent; + } + + if (em->block_start == EXTENT_MAP_INLINE) { + u64 extent_start = em->start; + + /* + * For inline extents we get everything we need out of the + * extent item. + */ + free_extent_map(em); + em = NULL; + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, + &cached_state, extent_start, + count, encoded, &unlocked); + goto out; + } + + /* + * We only want to return up to EOF even if the extent extends beyond + * that. + */ + encoded->len = min_t(u64, extent_map_end(em), + inode->vfs_inode.i_size) - iocb->ki_pos; + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + disk_bytenr = EXTENT_MAP_HOLE; + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + disk_bytenr = em->block_start; + /* + * Bail if the buffer isn't large enough to return the whole + * compressed extent. + */ + if (em->block_len > count) { + ret = -ENOBUFS; + goto out_em; + } + disk_io_size = count = em->block_len; + encoded->unencoded_len = em->ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + em->compress_type); + if (ret < 0) + goto out_em; + encoded->compression = ret; + } else { + disk_bytenr = em->block_start + (start - em->start); + if (encoded->len > count) + encoded->len = count; + /* + * Don't read beyond what we locked. This also limits the page + * allocations that we'll do. + */ + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + disk_io_size - iocb->ki_pos; + encoded->len = count; + encoded->unencoded_len = count; + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + } + free_extent_map(em); + em = NULL; + + if (disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent_cached(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlocked = true; + ret = iov_iter_zero(count, iter); + if (ret != count) + ret = -EFAULT; + } else { + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + encoded->compression, + &unlocked); + } + +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; +out_em: + free_extent_map(em); +out_unlock_extent: + if (!unlocked) + unlock_extent_cached(io_tree, start, lockend, &cached_state); +out_unlock_inode: + if (!unlocked) + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return ret; +} + +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. */ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. + */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap(pages[i]); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap(pages[i]); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent_bits(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. */ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent_cached(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, NULL, + false)) { + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); + ret = -EIO; + goto out_pages; + } + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. + */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent_cached(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a @@ -10390,8 +11112,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); @@ -10638,12 +11375,12 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, - .invalidatepage = btrfs_invalidatepage, + .invalidate_folio = btrfs_invalidate_folio, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION .migratepage = btrfs_migratepage, #endif - .set_page_dirty = btrfs_set_page_dirty, + .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 927771d1853f..be6c24577dbe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> +#include <linux/sched/xacct.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 { #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ @@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, } } -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { - struct inode *inode = file_inode(file); - return put_user(inode->i_generation, arg); } @@ -753,6 +770,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; + /* We do not support snapshotting right now. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -1012,8 +1036,155 @@ out: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - bool locked) + u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -1028,6 +1199,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; @@ -1035,7 +1220,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* get the big lock and read metadata off disk */ if (!locked) lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) unlock_extent_cached(io_tree, start, end, &cached); @@ -1046,23 +1231,50 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } +static u32 get_extent_max_capacity(const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return BTRFS_MAX_EXTENT_SIZE; +} + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { struct extent_map *next; - bool ret = true; + bool ret = false; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - ret = false; - else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - ret = false; + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; +out: free_extent_map(next); return ret; } @@ -1186,8 +1398,10 @@ struct defrag_target_range { static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list) + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) { + bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1197,7 +1411,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool next_mergeable = true; u64 range_len; - em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, + newer_than, locked); if (!em) break; @@ -1254,8 +1470,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (range_len >= extent_thresh) goto next; + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(em)) + goto next; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; @@ -1272,6 +1495,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } add: + last_is_target = true; range_len = min(extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into @@ -1315,10 +1539,22 @@ next: kfree(entry); } } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } return ret; } #define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); /* * Defrag one contiguous target range. @@ -1373,7 +1609,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress) + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; @@ -1419,7 +1656,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, - &target_list); + &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; @@ -1454,7 +1691,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, - unsigned long max_sectors) + unsigned long max_sectors, + u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; @@ -1462,10 +1700,9 @@ static int defrag_one_cluster(struct btrfs_inode *inode, LIST_HEAD(target_list); int ret; - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, - &target_list); + &target_list, NULL); if (ret < 0) goto out; @@ -1482,6 +1719,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode, range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + if (ra) page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, @@ -1494,7 +1740,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress); + extent_thresh, newer_than, do_compress, + last_scanned_ret); if (ret < 0) break; *sectors_defragged += range_len >> @@ -1505,6 +1752,8 @@ out: list_del_init(&entry->list); kfree(entry); } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } @@ -1590,11 +1839,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; u64 cluster_end; - /* The cluster size 256K should always be page aligned */ - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - if (btrfs_defrag_cancelled(fs_info)) { ret = -EAGAIN; break; @@ -1619,8 +1866,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, BTRFS_I(inode)->defrag_compress = compress_type; ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, - §ors_defragged, max_to_defrag); + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1628,7 +1875,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_unlock(inode, 0); if (ret < 0) break; - cur = cluster_end + 1; + cur = max(cluster_end + 1, last_scanned); if (ret > 0) { ret = 0; break; @@ -2011,10 +2258,9 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -2344,12 +2590,11 @@ err: return ret; } -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) { struct btrfs_ioctl_search_args __user *uargs; struct btrfs_ioctl_search_key sk; - struct inode *inode; int ret; size_t buf_size; @@ -2363,7 +2608,6 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, buf_size = sizeof(uargs->buf); - inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* @@ -2378,12 +2622,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct file *file, +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg; struct btrfs_ioctl_search_args_v2 args; - struct inode *inode; int ret; size_t buf_size; const size_t buf_limit = SZ_16M; @@ -2402,7 +2645,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, if (buf_size > buf_limit) buf_size = buf_limit; - inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) @@ -2653,25 +2895,22 @@ out: return ret; } -static noinline int btrfs_ioctl_ino_lookup(struct file *file, +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); - inode = file_inode(file); - /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; + args->treeid = root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2683,7 +2922,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, goto out; } - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); @@ -2739,7 +2978,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ -static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; @@ -2751,7 +2990,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; - struct inode *inode; int slot; int ret = 0; @@ -2765,7 +3003,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) return -ENOMEM; } - inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ @@ -2859,15 +3096,14 @@ out_free: * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ -static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; - struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct inode *inode; u64 objectid; int slot; int ret; @@ -2883,15 +3119,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) return PTR_ERR(rootrefs); } - inode = file_inode(file); - root = BTRFS_I(inode)->root->fs_info->tree_root; - objectid = BTRFS_I(inode)->root->root_key.objectid; - + objectid = root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; + root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; @@ -2971,6 +3205,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int err = 0; bool destroy_parent = false; + /* We don't support snapshots with extent tree v2 yet. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) @@ -3246,6 +3487,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -3771,6 +4017,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); @@ -3870,6 +4121,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); @@ -4931,7 +5187,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4961,11 +5217,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(file, arg); + ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4976,7 +5415,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); + return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: @@ -4996,7 +5435,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); + return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5017,14 +5456,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); + return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(file, argp); + return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); + return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: @@ -5071,10 +5508,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(file, argp, false); + return _btrfs_ioctl_send(inode, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(file, argp, true); + return _btrfs_ioctl_send(inode, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -5101,15 +5538,25 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: - return btrfs_ioctl_get_subvol_info(file, argp); + return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: - return btrfs_ioctl_get_subvol_rootref(file, argp); + return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0fb90cbe7669..430ad36b8b08 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,6 +55,9 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ +#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) +#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); - workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap(cur_page); cur_in += LZO_LEN; + if (seg_len > WORKSPACE_CBUF_LENGTH) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + ret = -EIO; + goto out; + } + /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); @@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); + size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; char *kaddr; unsigned long bytes; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6b51fd2ec5ac..1957b14b329a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Allocate and add a new ordered_extent into the per-inode tree. +/** + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. * - * The tree is given a single reference on the ordered extent that was - * inserted. + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted. + * + * Return: 0 or -ENOMEM. */ -static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, int dio, - int compress_type) +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset struct btrfs_ordered_extent *entry; int ret; - if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) @@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return -ENOMEM; entry->file_offset = file_offset; - entry->disk_bytenr = disk_bytenr; entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; entry->bytes_left = num_bytes; entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; @@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->qgroup_rsv = ret; entry->physical = (u64)-1; - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC || - type == BTRFS_ORDERED_COMPRESSED); - set_bit(type, &entry->flags); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); + entry->flags = flags; percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, fs_info->delalloc_batch); - if (dio) - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return 0; } -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 1, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type) -{ - ASSERT(compress_type != BTRFS_COMPRESS_NONE); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, - BTRFS_ORDERED_COMPRESSED, 0, - compress_type); -} - /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, - false); + if (root != fs_info->tree_root) { + u64 release; + + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) + release = entry->disk_num_bytes; + else + release = entry->num_bytes; + btrfs_delalloc_release_metadata(btrfs_inode, release, false); + } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); @@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; - u64 num_bytes = len; - u64 disk_num_bytes = len; - int type; - unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); - int compress_type = ordered->compress_type; - unsigned long weight; - int ret; - - weight = hweight_long(flags_masked); - WARN_ON_ONCE(weight > 1); - if (!weight) - type = 0; - else - type = __ffs(flags_masked); + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; /* - * The splitting extent is already counted and will be added again - * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid - * double counting. + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { - WARN_ON_ONCE(1); - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), - file_offset, disk_bytenr, num_bytes, - disk_num_bytes, compress_type); - } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } else { - ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } - - return ret; + WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4194e960ff61..ecad67a2c745 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,8 +74,18 @@ enum { BTRFS_ORDERED_LOGGED_CSUM, /* We wait for this extent to complete in the current transaction */ BTRFS_ORDERED_PENDING, + /* BTRFS_IOC_ENCODED_WRITE */ + BTRFS_ORDERED_ENCODED, }; +/* BTRFS_ORDERED_* flags that specify the type of the extent. */ +#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED) | \ + (1UL << BTRFS_ORDERED_DIRECT) | \ + (1UL << BTRFS_ORDERED_ENCODED)) + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -84,9 +94,11 @@ struct btrfs_ordered_extent { * These fields directly correspond to the same fields in * btrfs_file_extent_item. */ - u64 disk_bytenr; u64 num_bytes; + u64 ram_bytes; + u64 disk_bytenr; u64 disk_num_bytes; + u64 offset; /* number of bytes that still need writing */ u64 bytes_left; @@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type); -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type); + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0775ae9f4419..dd8777872143 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = { { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, }; @@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); - if (IS_ERR(next)) { + if (IS_ERR(next)) continue; - } else if (!extent_buffer_uptodate(next)) { + if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); continue; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f12dc687350c..1866b1f0da01 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,18 +25,6 @@ #include "sysfs.h" #include "tree-mod-log.h" -/* TODO XXX FIXME - * - subvol delete -> delete when ref goes to 0? delete limits also? - * - reorganize keys - * - compressed - * - sync - * - copy also limits on subvol creation - * - limit - * - caches for ulists - * - performance benchmarks - * - check all ioctl parameters - */ - /* * Helpers to access qgroup reservation * @@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return 0; } -/* must be called with qgroup_lock held */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, - u64 memberid, u64 parentid) +/* + * Add relation specified by two qgroups. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the qgroups is NULL + * <0 other errors + */ +static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { - struct btrfs_qgroup *member; - struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; - member = find_qgroup_rb(fs_info, memberid); - parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; @@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, return 0; } -/* must be called with qgroup_lock held */ +/* + * Add relation specified by two qgoup ids. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the ids does not exist + * <0 other errors + */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + + return __add_relation_rb(member, parent); +} + +/* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { @@ -948,6 +959,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) */ lockdep_assert_held_write(&fs_info->subvol_sem); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "qgroups are currently unsupported in extent tree v2"); + return -EINVAL; + } + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1197,13 +1214,20 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) goto out; /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may * deadlock with transaction by the qgroup rescan worker. */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); - mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 For the root item @@ -1444,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = add_relation_rb(fs_info, src, dst); + ret = __add_relation_rb(member, parent); if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; @@ -3261,7 +3285,8 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3291,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = PTR_ERR(trans); break; } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - err = -EINTR; - } else { - err = qgroup_rescan_leaf(trans, path); - } + + err = qgroup_rescan_leaf(trans, path); + if (err > 0) btrfs_commit_transaction(trans); else @@ -3309,7 +3332,7 @@ out: if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0) { + } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index a3930da4eb3f..998e3f180d90 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -277,7 +277,7 @@ copy_inline_extent: path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { @@ -494,7 +494,8 @@ process_slot: &clone_info, &trans); if (ret) goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { + } else { + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can @@ -505,8 +506,12 @@ process_slot: */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); - if (key.offset != 0 || datal > fs_info->sectorsize) - return -EUCLEAN; + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || + WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, @@ -518,17 +523,22 @@ process_slot: btrfs_release_path(path); /* - * If this is a new extent update the last_reflink_trans of both - * inodes. This is used by fsync to make sure it does not log - * multiple checksum items with overlapping ranges. For older - * extents we don't need to do it since inode logging skips the - * checksums for older extents. Also ignore holes and inline - * extents because they don't have checksums in the csum tree. + * Whenever we share an extent we update the last_reflink_trans + * of each inode to the current transaction. This is needed to + * make sure fsync does not log multiple checksum items with + * overlapping ranges (because some extent items might refer + * only to sections of the original extent). For the destination + * inode we do this regardless of the generation of the extents + * or even if they are inline extents or explicit holes, to make + * sure a full fsync does not skip them. For the source inode, + * we only need to update last_reflink_trans in case it's a new + * extent that is not a hole or an inline extent, to deal with + * the checksums problem on fsync. */ - if (extent_gen == trans->transid && disko > 0) { + if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; - BTRFS_I(inode)->last_reflink_trans = trans->transid; - } + + BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); @@ -575,8 +585,7 @@ process_slot: * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); @@ -636,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); @@ -730,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); @@ -772,9 +781,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (btrfs_root_readonly(root_out)) return -EROFS; - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; + ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5465197996d..fdc2c4b411f0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2599,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, block->owner, block->key.offset, block->level, NULL); - if (IS_ERR(eb)) { + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2997,7 +2997,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len); + clamped_len, clamped_len); if (ret) goto release_page; @@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; @@ -4110,9 +4123,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) * this function resumes merging reloc trees with corresponding fs trees. * this is important for keeping the sharing of tree blocks */ -int btrfs_recover_relocation(struct btrfs_root *root) +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; @@ -4153,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_tree_root(root, &key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3d68d2dcd83e..ca7426ef61c8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2e9a322773f2..11089568b287 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3190,7 +3190,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 generation; int mirror_num; struct btrfs_key key; - u64 increment = map->stripe_len; + u64 increment; u64 offset; u64 extent_logical; u64 extent_physical; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 201eb2628aea..7d1642937274 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -528,17 +528,12 @@ out: static int fs_path_copy(struct fs_path *p, struct fs_path *from) { - int ret; - p->reversed = from->reversed; fs_path_reset(p); - ret = fs_path_add_path(p, from); - - return ret; + return fs_path_add_path(p, from); } - static void fs_path_unreverse(struct fs_path *p) { char *tmp; @@ -7477,10 +7472,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) root->root_key.objectid, root->dedupe_in_progress); } -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; + struct btrfs_root *send_root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 23bcefc84e49..08602fdd600a 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -126,7 +126,7 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); #endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 294242c194d8..b87931a458eb 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -737,6 +737,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh = div_factor_fine(space_info->total_bytes, 90); u64 used; + lockdep_assert_held(&space_info->lock); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -1061,7 +1063,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1092,6 +1093,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 29bd8c7a7706..ef7ae20d2b77 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers)) + if (atomic_read(&subpage->writers) == 0) /* No writers, locked by plain lock_page() */ return unlock_page(page); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4d947ba32da9..b228efe8ab6e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem sate. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + /* * Generally the error codes correspond to their respective errors, but there * are a few special cases. @@ -128,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; const char *errstr; #endif @@ -140,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function #ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; va_list args; @@ -148,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.fmt = fmt; vaf.va = &args; - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, function, line, errno, errstr, &vaf); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); va_end(args); } else { - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n", - sb->s_id, function, line, errno, errstr); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); } #endif @@ -240,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . vaf.va = &args; if (__ratelimit(ratelimit)) { - if (fs_info) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info->sb->s_id, &vaf); - else + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } } va_end(args); @@ -861,6 +913,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_space_cache: case Opt_space_cache_version: + /* + * We already set FREE_SPACE_TREE above because we have + * compat_ro(FREE_SPACE_TREE) set, and we aren't going + * to allow v1 to be set for extent tree v2, simply + * ignore this setting if we're extent tree v2. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(info->mount_opt, @@ -881,6 +941,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: + /* + * We cannot operate without the free space tree with + * extent tree v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (btrfs_test_opt(info, SPACE_CACHE)) { btrfs_clear_and_info(info, SPACE_CACHE, "disabling disk space caching"); @@ -896,6 +962,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: + /* + * We cannot clear the free space tree with extent tree + * v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; @@ -2383,6 +2455,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_device *device = NULL; + dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -2402,7 +2475,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_FORGET_DEV: - ret = btrfs_forget_devices(vol->name); + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index beb7f72d50b8..17389a42a3ab 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -/* Remove once support for zoned allocation is feature complete */ #ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for zoned allocation is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid1c34), #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -1104,6 +1107,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == + ARRAY_SIZE(btrfs_feature_attrs)); +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == + ARRAY_SIZE(btrfs_feature_attrs[0])); + static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, @@ -1272,11 +1280,6 @@ static void init_feature_attrs(void) struct btrfs_feature_attr *fa; int set, i; - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != - ARRAY_SIZE(btrfs_feature_attrs)); - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != - ARRAY_SIZE(btrfs_feature_attrs[0])); - memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 319fed82d741..c5b3a631bf4f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; + write_lock(&em_tree->lock); while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); @@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #endif free_extent_map(em); } + write_unlock(&em_tree->lock); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c3cfdfd8de9b..b008c5110958 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -1320,6 +1350,32 @@ again: } /* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + +/* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to * be deleted @@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } @@ -1850,6 +1911,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + root_item = &fs_info->block_group_root->root_item; + + super->block_group_root = root_item->bytenr; + super->block_group_root_generation = root_item->generation; + super->block_group_root_level = root_item->level; + } } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -2301,6 +2370,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_set_root_node(&fs_info->block_group_root->root_item, + fs_info->block_group_root->node); + list_add_tail(&fs_info->block_group_root->dirty_list, + &cur_trans->switch_commits); + } + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); @@ -2429,10 +2505,10 @@ cleanup_transaction: * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. */ -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { + struct btrfs_root *root; int ret; - struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9402d8d94484..970ff316069d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -216,7 +216,8 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9fd145f1c4bc..e56c0107eea3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot, static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size(leaf, slot); + u64 chunk_objectid; u64 flags; u64 type; @@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + /* + * We don't init the nr_global_roots until we load the global + * roots, so this could be 0 at mount time. If it's 0 we'll + * just assume we're fine, and later we'll check against our + * actual value. + */ + if (unlikely(fs_info->nr_global_roots && + chunk_objectid >= fs_info->nr_global_roots)) { + block_group_err(leaf, slot, + "invalid block group global root id, have %llu, needs to be <= %llu", + chunk_objectid, + fs_info->nr_global_roots); + return -EUCLEAN; + } + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -1648,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* These trees must never be empty */ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { @@ -1657,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; } + + /* EXTENT_TREE_V2 can have empty extent trees. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + return 0; } @@ -1682,6 +1711,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + u64 item_data_end; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1696,6 +1726,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the @@ -1706,11 +1738,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, - "unexpected item end, have %u expect %u", - btrfs_item_data_end(leaf, slot), - item_end_expected); + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); return -EUCLEAN; } @@ -1719,12 +1750,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (unlikely(btrfs_item_data_end(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info))) { + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, - "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_data_end(leaf, slot), - BTRFS_LEAF_DATA_SIZE(fs_info)); + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3ee014c06b82..571dae8ad65e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, @@ -294,16 +288,6 @@ struct walk_control { */ int free; - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? Also used - * while flushing the log tree to disk for a sync - */ - int wait; - /* pin only walk, we record which extents on disk belong to the * log trees */ @@ -354,17 +338,15 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } - if (wc->pin) + if (wc->pin) { ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); + if (ret) + return ret; - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->pin && btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); } return ret; } @@ -917,6 +899,26 @@ out: return ret; } +static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const char *name, + int name_len) +{ + int ret; + + ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + if (ret) + return ret; + /* + * Whenever we need to check if a name exists or not, we check the + * fs/subvolume tree. So after an unlink we must run delayed items, so + * that future checks for a name during log replay see that the name + * does not exists anymore. + */ + return btrfs_run_delayed_items(trans); +} + /* * when cleaning up conflicts between the directory names in the * subvolume, directory names in the log and directory names in the @@ -959,12 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, name_len); - if (ret) - goto out; - else - ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1124,14 +1122,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, dir, inode, + ret = unlink_inode_for_log_replay(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans); - if (ret) - return ret; *search_done = 1; goto again; } @@ -1196,14 +1191,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), inode, victim_name, victim_name_len); - if (!ret) - ret = btrfs_run_delayed_items( - trans); } iput(victim_parent); kfree(victim_name); @@ -1358,7 +1350,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1457,8 +1449,8 @@ static int add_link(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); if (ret) goto out; /* @@ -1467,10 +1459,6 @@ static int add_link(struct btrfs_trans_handle *trans, */ if (other_inode->i_nlink == 0) inc_nlink(other_inode); - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; add_link: ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen, 0, ref_index); @@ -1603,7 +1591,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -2350,15 +2338,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, goto out; inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len); - if (ret) - goto out; - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; - + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), + name, name_len); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -3477,35 +3458,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. This may often - * return some false positives, because logged_trans is an in memory only field, - * not persisted anywhere. This is meant to be used in contexts where a false - * positive has no functional consequences. + * Check if an inode was logged in the current transaction. This correctly deals + * with the case where the inode was logged but has a logged_trans of 0, which + * happens if the inode is evicted and loaded again, as logged_trans is an in + * memory only field (not persisted). + * + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, + * and < 0 on error. */ -static bool inode_logged(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static int inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path_in) { + struct btrfs_path *path = path_in; + struct btrfs_key key; + int ret; + if (inode->logged_trans == trans->transid) - return true; + return 1; - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) - return false; + /* + * If logged_trans is not 0, then we know the inode logged was not logged + * in this transaction, so we can return false right away. + */ + if (inode->logged_trans > 0) + return 0; /* - * The inode's logged_trans is always 0 when we load it (because it is - * not persisted in the inode item or elsewhere). So if it is 0, the - * inode was last modified in the current transaction then the inode may - * have been logged before in the current transaction, then evicted and - * loaded again in the current transaction - or may have never been logged - * in the current transaction, but since we can not be sure, we have to - * assume it was, otherwise our callers can leave an inconsistent log. + * If no log tree was created for this root in this transaction, then + * the inode can not have been logged in this transaction. In that case + * set logged_trans to anything greater than 0 and less than the current + * transaction's ID, to avoid the search below in a future call in case + * a log tree gets created after this. */ - if (inode->logged_trans == 0 && - inode->last_trans == trans->transid && - !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) - return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * We have a log tree and the inode's logged_trans is 0. We can't tell + * for sure if the inode was logged before in this transaction by looking + * only at logged_trans. We could be pessimistic and assume it was, but + * that can lead to unnecessarily logging an inode during rename and link + * operations, and then further updating the log in followup rename and + * link operations, specially if it's a directory, which adds latency + * visible to applications doing a series of rename or link operations. + * + * A logged_trans of 0 here can mean several things: + * + * 1) The inode was never logged since the filesystem was mounted, and may + * or may have not been evicted and loaded again; + * + * 2) The inode was logged in a previous transaction, then evicted and + * then loaded again; + * + * 3) The inode was logged in the current transaction, then evicted and + * then loaded again. + * + * For cases 1) and 2) we don't want to return true, but we need to detect + * case 3) and return true. So we do a search in the log root for the inode + * item. + */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + + if (path_in) + btrfs_release_path(path); + else + btrfs_free_path(path); + + /* + * Logging an inode always results in logging its inode item. So if we + * did not find the item we know the inode was not logged for sure. + */ + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* + * Set logged_trans to a value greater than 0 and less then the + * current transaction to avoid doing the search in future calls. + */ + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * The inode was previously logged and then evicted, set logged_trans to + * the current transacion's ID, to avoid future tree searches as long as + * the inode is not evicted again. + */ + inode->logged_trans = trans->transid; + + /* + * If it's a directory, then we must set last_dir_index_offset to the + * maximum possible value, so that the next attempt to log the inode does + * not skip checking if dir index keys found in modified subvolume tree + * leaves have been logged before, otherwise it would result in attempts + * to insert duplicate dir index keys in the log tree. This must be done + * because last_dir_index_offset is an in-memory only field, not persisted + * in the inode item or any other on-disk structure, so its value is lost + * once the inode is evicted. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_dir_index_offset = (u64)-1; + + return 1; +} + +/* + * Delete a directory entry from the log if it exists. + * + * Returns < 0 on error + * 1 if the entry does not exists + * 0 if the entry existed and was successfully deleted + */ +static int del_logged_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dir_ino, + const char *name, int name_len, + u64 index) +{ + struct btrfs_dir_item *di; + + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. + */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) + return PTR_ERR(di); + else if (!di) + return 1; - return false; + /* + * We do not need to update the size field of the directory's + * inode item because on log replay we update the field to reflect + * all existing entries in the directory (see overwrite_item()). + */ + return btrfs_delete_one_dir_name(trans, log, path, di); } /* @@ -3534,15 +3636,16 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, u64 index) { - struct btrfs_root *log; - struct btrfs_dir_item *di; struct btrfs_path *path; int ret; - int err = 0; - u64 dir_ino = btrfs_ino(dir); - if (!inode_logged(trans, dir)) + ret = inode_logged(trans, dir, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); return; + } ret = join_running_log_trans(root); if (ret) @@ -3550,41 +3653,18 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, mutex_lock(&dir->log_mutex); - log = root->log_root; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_unlock; } - /* - * We only log dir index items of a directory, so we don't need to look - * for dir item keys. - */ - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - - /* - * We do not need to update the size field of the directory's inode item - * because on log replay we update the field to reflect all existing - * entries in the directory (see overwrite_item()). - */ -fail: + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), + name, name_len, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err < 0) + if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); } @@ -3599,8 +3679,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (!inode_logged(trans, inode)) + ret = inode_logged(trans, inode, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); return; + } ret = join_running_log_trans(root); if (ret) @@ -3725,19 +3810,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; struct extent_buffer *src = path->nodes[0]; const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); - const bool inode_logged_before = inode_logged(trans, inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3748,7 +3834,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); ctx->last_dir_item_offset = key.offset; + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. + */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + /* + * -EEXIST should never happen because when we + * log a directory in full mode (LOG_INODE_ALL) + * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from + * the log tree. + */ + ASSERT(ret != -EEXIST); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3772,25 +3885,23 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * resulting in -ENOTEMPTY errors. */ if (!ctx->log_new_dentries) { - struct btrfs_dir_item *di; struct btrfs_key di_key; - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(src, di, &di_key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - di_key.type != BTRFS_ROOT_ITEM_KEY) + if (di_key.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true; } - if (!inode_logged_before) + if (!ctx->logged_before) goto add_to_batch; /* * If we were logged before and have logged dir items, we can skip * checking if any item with a key offset larger than the last one * we logged is in the log tree, saving time and avoiding adding - * contention on the log tree. + * contention on the log tree. We can only rely on the value of + * last_dir_index_offset when we know for sure that the inode was + * previously logged in the current transaction. */ if (key.offset > inode->last_dir_index_offset) goto add_to_batch; @@ -3860,7 +3971,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - u64 first_offset = min_offset; + u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3894,10 +4005,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = max(min_offset, tmp.offset) + 1; + last_old_dentry_offset = tmp.offset; } goto done; } @@ -3906,17 +4018,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; } btrfs_release_path(path); @@ -3938,7 +4051,8 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); if (ret != 0) { if (ret < 0) err = ret; @@ -3964,14 +4078,16 @@ search: goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ctx->last_dir_item_offset = min_key.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &min_key); - if (ret) - err = ret; - else - last_offset = min_key.offset; + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; goto done; } if (need_resched()) { @@ -3987,13 +4103,21 @@ done: if (err == 0) { *last_offset_ret = last_offset; /* - * insert the log range keys to indicate where the log - * is valid + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. */ - ret = insert_dir_log_key(trans, log, path, ino, first_offset, - last_offset); - if (ret) - err = ret; + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) { + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + if (ret) + err = ret; + } } return err; } @@ -4020,22 +4144,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 max_key; int ret; - /* - * If this is the first time we are being logged in the current - * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the value - * of the last logged key offset. Note that we don't use the helper - * function inode_logged() here - that is because the function returns - * true after an inode eviction, assuming the worst case as it can not - * know for sure if the inode was logged before. So we can not skip key - * searches in the case the inode was evicted, because it may not have - * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir index offset to (u64)-1. - */ - if (inode->logged_trans != trans->transid) - inode->last_dir_index_offset = (u64)-1; - - min_key = 0; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -4071,9 +4180,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_key found_key; int start_slot; - if (!inode_logged(trans, inode)) - return 0; - key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -4293,23 +4399,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = trans->fs_info; - unsigned long src_offset; - unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - int ret; + int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; int i; - struct list_head ordered_sums; - int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - - INIT_LIST_HEAD(&ordered_sums); + int dst_index; + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); + const u64 i_size = i_size_read(&inode->vfs_inode); ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4321,28 +4422,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; batch.total_data_size = 0; - batch.nr = nr; + batch.nr = 0; + dst_index = 0; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size(src, i + start_slot); - batch.total_data_size += ins_sizes[i]; - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + const int src_slot = start_slot + i; + struct btrfs_root *csum_root; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_sum *sums_next; + LIST_HEAD(ordered_sums); + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + u64 extent_num_bytes; + bool is_old_extent; + + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); + + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) + goto add_to_batch; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + is_old_extent = (btrfs_file_extent_generation(src, extent) < + trans->transid); + + /* + * Don't copy extents from past generations. That would make us + * log a lot more metadata for common cases like doing only a + * few random writes into a file and then fsync it for the first + * time or after the full sync flag is set on the inode. We can + * get leaves full of extent items, most of which are from past + * generations, so we can skip them - as long as the inode has + * not been the target of a reflink operation in this transaction, + * as in that case it might have had file extent items with old + * generations copied into it. We also must always log prealloc + * extents that start at or beyond eof, otherwise we would lose + * them on log replay. + */ + if (is_old_extent && + ins_keys[dst_index].offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + + if (skip_csum) + goto add_to_batch; + + /* Only regular extents have checksums. */ + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) + goto add_to_batch; + + /* + * If it's an extent created in a past transaction, then its + * checksums are already accessible from the committed csum tree, + * no need to log them. + */ + if (is_old_extent) + goto add_to_batch; + + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); + /* If it's an explicit hole, there are no checksums. */ + if (disk_bytenr == 0) + goto add_to_batch; + + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); + + if (btrfs_file_extent_compression(src, extent)) { + extent_offset = 0; + extent_num_bytes = disk_num_bytes; + } else { + extent_offset = btrfs_file_extent_offset(src, extent); + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); + } + + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); + disk_bytenr += extent_offset; + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0); + if (ret) + goto out; + + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { + if (!ret) + ret = log_csums(trans, inode, log, sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + +add_to_batch: + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); + batch.total_data_size += ins_sizes[dst_index]; + batch.nr++; + dst_index++; } + + /* + * We have a leaf full of old extent items that don't need to be logged, + * so we don't need to do anything. + */ + if (batch.nr == 0) + goto out; + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); - if (ret) { - kfree(ins_data); - return ret; - } + if (ret) + goto out; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + const int dst_slot = dst_path->slots[0] + dst_index; + struct btrfs_key key; + unsigned long src_offset; + unsigned long dst_offset; + + /* + * We're done, all the remaining items in the source leaf + * correspond to old file extent items. + */ + if (dst_index >= batch.nr) + break; + + btrfs_item_key_to_cpu(src, &key, src_slot); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto copy_item; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + /* See the comment in the previous loop, same logic. */ + if (btrfs_file_extent_generation(src, extent) < trans->transid && + key.offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; - for (i = 0; i < nr; i++, dst_path->slots[0]++) { - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], - dst_path->slots[0]); +copy_item: + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); + src_offset = btrfs_item_ptr_offset(src, src_slot); - src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if (key.type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *inode_item; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(dst_path->nodes[0], - dst_path->slots[0], + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, &inode->vfs_inode, @@ -4350,71 +4575,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); + src_offset, ins_sizes[dst_index]); } - /* take a reference on file data extents so that truncates - * or deletes of this inode don't have to relog the inode - * again - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && - !skip_csum) { - int found_type; - extent = btrfs_item_ptr(src, start_slot + i, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(src, extent) < trans->transid) - continue; - - found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - struct btrfs_root *csum_root; - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent); - if (btrfs_file_extent_compression(src, - extent)) { - cs = 0; - cl = dl; - } - - csum_root = btrfs_csum_root(fs_info, ds); - ret = btrfs_lookup_csums_range(csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums, 0); - if (ret) - break; - } - } + dst_index++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); btrfs_release_path(dst_path); +out: kfree(ins_data); - /* - * we have to do this after the loop above to avoid changing the - * log tree while trying to change the log tree. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - if (!ret) - ret = log_csums(trans, inode, log, sums); - list_del(&sums->list); - kfree(sums); - } - return ret; } @@ -4550,14 +4721,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = inode->root->log_root; - struct btrfs_file_extent_item *fi; + struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; - struct btrfs_map_token token; struct btrfs_key key; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + btrfs_set_stack_file_extent_generation(&fi, trans->transid); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); + else + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - + extent_offset); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } + + btrfs_set_stack_file_extent_offset(&fi, extent_offset); + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4571,12 +4762,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * are small, with a root at level 2 or 3 at most, due to their short * life span. */ - if (inode_logged(trans, inode)) { + if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; drop_args.end = em->start + em->len; drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); + drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; @@ -4588,44 +4779,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, key.offset = em->start; ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); + sizeof(fi)); if (ret) return ret; } leaf = path->nodes[0]; - btrfs_init_map_token(&token, leaf); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(&token, fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_PREALLOC); - else - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_REG); - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start - - extent_offset); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); - } - - btrfs_set_token_file_extent_offset(&token, fi, extent_offset); - btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); - btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); - btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); - btrfs_set_token_file_extent_encryption(&token, fi, 0); - btrfs_set_token_file_extent_other_encoding(&token, fi, 0); + write_extent_buffer(leaf, &fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(fi)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4635,7 +4796,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -4839,7 +5000,6 @@ process: WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); if (ret) @@ -5414,6 +5574,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5434,13 +5595,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5471,10 +5640,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5526,10 +5693,29 @@ next_key: } else { break; } + + /* + * We may process many leaves full of items for our inode, so + * avoid monopolizing a cpu for too long by rescheduling while + * not holding locks on any tree. + */ + cond_resched(); } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } @@ -5558,8 +5744,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = inode->root->log_root; - int err = 0; - int ret = 0; + int ret; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5568,6 +5753,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool xattrs_logged = false; bool recursive_logging = false; bool inode_item_dropped = true; + const bool orig_logged_before = ctx->logged_before; path = btrfs_alloc_path(); if (!path) @@ -5601,8 +5787,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * and figure out which index ranges have to be logged. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { - err = btrfs_commit_inode_delayed_items(trans, inode); - if (err) + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) goto out; } @@ -5618,6 +5804,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * Before logging the inode item, cache the value returned by + * inode_logged(), because after that we have the need to figure out if + * the inode was previously logged in this transaction. + */ + ret = inode_logged(trans, inode, path); + if (ret < 0) + goto out_unlock; + ctx->logged_before = (ret == 1); + ret = 0; + + /* * This is for cases where logging a directory could result in losing a * a file after replaying the log. For example, if we move a file from a * directory A to a directory B, then fsync directory A, we have no way @@ -5628,7 +5825,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - err = 1; + ret = 1; goto out_unlock; } @@ -5642,9 +5839,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, max_key_type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5658,22 +5857,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - err = logged_inode_size(log, inode, path, &logged_isize); - if (err) + ret = logged_inode_size(log, inode, path, &logged_isize); + if (ret) goto out_unlock; } if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, + inode, max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_logged(trans, inode)) + if (ctx->logged_before) ret = truncate_inode_items(trans, log, inode, 0, 0); } @@ -5683,8 +5883,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5693,37 +5894,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } - if (ret) { - err = ret; + if (ret) goto out_unlock; - } - err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, recursive_logging, inode_only, ctx, &need_log_inode_item); - if (err) + if (ret) goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, inode, path); - if (err) + ret = btrfs_log_holes(trans, inode, path); + if (ret) goto out_unlock; } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); - if (err) + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); + if (ret) goto out_unlock; /* * If we are doing a fast fsync and the inode was logged before @@ -5734,18 +5933,16 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } else if (inode_only == LOG_INODE_ALL) { struct extent_map *em, *n; @@ -5757,10 +5954,8 @@ log_extents: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } spin_lock(&inode->lock); @@ -5799,12 +5994,24 @@ log_extents: if (inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); + + /* + * Reset the last_reflink_trans so that the next fsync does not need to + * go through the slower path when logging extents and their checksums. + */ + if (inode_only == LOG_INODE_ALL) + inode->last_reflink_trans = 0; + out_unlock: mutex_unlock(&inode->log_mutex); out: btrfs_free_path(path); btrfs_free_path(dst_path); - return err; + + if (recursive_logging) + ctx->logged_before = orig_logged_before; + + return ret; } /* @@ -5889,7 +6096,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *log = root->log_root; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5931,7 +6137,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, min_key.offset = 0; again: btrfs_release_path(path); - ret = btrfs_search_forward(log, &min_key, path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret < 0) { goto next_dir_inode; } else if (ret > 0) { @@ -5939,7 +6145,6 @@ again: goto next_dir_inode; } -process_leaf: leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); for (i = path->slots[0]; i < nritems; i++) { @@ -5957,8 +6162,7 @@ process_leaf: di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid && - type != BTRFS_FT_DIR) + if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); if (di_key.type == BTRFS_ROOT_ITEM_KEY) @@ -5996,16 +6200,6 @@ process_leaf: } break; } - if (i == nritems) { - ret = btrfs_next_leaf(log, path); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - goto process_leaf; - } if (min_key.offset < (u64)-1) { min_key.offset++; goto again; @@ -6736,15 +6930,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. +/** + * Update the log after adding a new name for an inode. + * + * @trans: Transaction handle. + * @old_dentry: The dentry associated with the old name and the old + * parent directory. + * @old_dir: The inode of the previous parent directory for the case + * of a rename. For a link operation, it must be NULL. + * @old_dir_index: The index number associated with the old name, meaningful + * only for rename operations (when @old_dir is not NULL). + * Ignored for link operations. + * @parent: The dentry associated with the directory under which the + * new name is located. + * + * Call this after adding a new name for an inode, as a result of a link or + * rename operation, and it will properly update the log to reflect the new name. */ void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent) { + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6757,26 +6968,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (!inode_logged(trans, inode) && - (!old_dir || !inode_logged(trans, old_dir))) - return; + ret = inode_logged(trans, inode, NULL); + if (ret < 0) { + goto out; + } else if (ret == 0) { + if (!old_dir) + return; + /* + * If the inode was not logged and we are doing a rename (old_dir is not + * NULL), check if old_dir was logged - if it was not we can return and + * do nothing. + */ + ret = inode_logged(trans, old_dir, NULL); + if (ret < 0) + goto out; + else if (ret == 0) + return; + } + ret = 0; /* * If we are doing a rename (old_dir is not NULL) from a directory that - * was previously logged, make sure the next log attempt on the directory - * is not skipped and logs the inode again. This is because the log may - * not currently be authoritative for a range including the old - * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we - * do not end up with both the new and old dentries around (in case the - * inode is a directory we would have a directory with two hard links and - * 2 inode references for different parents). The next log attempt of - * old_dir will happen at btrfs_log_all_parents(), called through - * btrfs_log_inode_parent() below, because we have previously set - * inode->last_unlink_trans to the current transaction ID, either here or - * at btrfs_record_unlink_dir() in case the inode is a directory. + * was previously logged, make sure that on log replay we get the old + * dir entry deleted. This is needed because we will also log the new + * name of the renamed inode, so we need to make sure that after log + * replay we don't end up with both the new and old dir entries existing. */ - if (old_dir) - old_dir->logged_trans = 0; + if (old_dir && old_dir->logged_trans == trans->transid) { + struct btrfs_root *log = old_dir->root->log_root; + struct btrfs_path *path; + + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. + */ + log_pinned = true; + btrfs_pin_log_trans(root); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * Other concurrent task might be logging the old directory, + * as it can be triggered when logging other inode that had or + * still has a dentry in the old directory. So take the old + * directory's log_mutex to prevent getting an -EEXIST when + * logging a key to record the deletion, or having that other + * task logging the old directory get an -EEXIST if it attempts + * to log the same key after we just did it. In both cases that + * would result in falling back to a transaction commit. + */ + mutex_lock(&old_dir->log_mutex); + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), + old_dentry->d_name.name, + old_dentry->d_name.len, old_dir_index); + if (ret > 0) { + /* + * The dentry does not exist in the log, so record its + * deletion. + */ + btrfs_release_path(path); + ret = insert_dir_log_key(trans, log, path, + btrfs_ino(old_dir), + old_dir_index, old_dir_index); + } + mutex_unlock(&old_dir->log_mutex); + + btrfs_free_path(path); + if (ret < 0) + goto out; + } btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; @@ -6788,5 +7056,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); +out: + /* + * If an error happened mark the log for a full commit because it's not + * consistent and up to date or we couldn't find out if one of the + * inodes was logged before in this transaction. Do it before unpinning + * the log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + if (log_pinned) + btrfs_end_log_trans(root); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3df38a..1620f8170629 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Indicate if the inode being logged was logged before. */ + bool logged_before; /* Tracks the last logged dir item/index key offset. */ u64 last_dir_item_offset; struct inode *inode; @@ -32,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); @@ -86,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent); + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b07d382d53a8..a8cc736731fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -534,30 +534,20 @@ error: return ret; } -static bool device_path_matched(const char *path, struct btrfs_device *device) -{ - int found; - - rcu_read_lock(); - found = strcmp(rcu_str_deref(device->name), path); - rcu_read_unlock(); - - return found == 0; -} - -/* - * Search and remove all stale (devices which are not mounted) devices. +/** + * Search and remove all stale devices (which are not mounted). * When both inputs are NULL, it will search and release all stale devices. - * path: Optional. When provided will it release all unmounted devices - * matching this path only. - * skip_dev: Optional. Will skip this device when searching for the stale + * + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. + * @skip_device: Optional. Will skip this device when searching for the stale * devices. - * Return: 0 for success or if @path is NULL. - * -EBUSY if @path is a mounted device. - * -ENOENT if @path does not match any device in the list. + * + * Return: 0 for success or if @devt is 0. + * -EBUSY if @devt is a mounted device. + * -ENOENT if @devt does not match any device in the list. */ -static int btrfs_free_stale_devices(const char *path, - struct btrfs_device *skip_device) +static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; @@ -565,7 +555,7 @@ static int btrfs_free_stale_devices(const char *path, lockdep_assert_held(&uuid_mutex); - if (path) + if (devt) ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { @@ -575,13 +565,11 @@ static int btrfs_free_stale_devices(const char *path, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; - if (path && !device->name) - continue; - if (path && !device_path_matched(path, device)) + if (devt && devt != device->devt) continue; if (fs_devices->opened) { /* for an already deleted device return 0 */ - if (path && ret != 0) + if (devt && ret != 0) ret = -EBUSY; break; } @@ -614,7 +602,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) { - struct request_queue *q; struct block_device *bdev; struct btrfs_super_block *disk_super; u64 devid; @@ -656,8 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - q = bdev_get_queue(bdev); - if (!blk_queue_nonrot(q)) + if (!blk_queue_nonrot(bdev_get_queue(bdev))) fs_devices->rotating = true; device->bdev = bdev; @@ -781,11 +767,17 @@ static noinline struct btrfs_device *device_list_add(const char *path, struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_t path_devt; + int error; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + error = lookup_bdev(path, &path_devt); + if (error) + return ERR_PTR(error); + if (fsid_change_in_progress) { if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); @@ -868,6 +860,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); + device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; @@ -928,25 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted + * + * NOTE: the device->fs_info may not be reliable here so pass + * in a NULL to message helpers instead. This avoids a possible + * use-after-free when the fs_info and fs_info->sb are already + * torn down. */ if (device->bdev) { - int error; - dev_t path_dev; - - error = lookup_bdev(path, &path_dev); - if (error) { + if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(error); - } - - if (device->bdev->bd_dev != path_dev) { - mutex_unlock(&fs_devices->device_list_mutex); - /* - * device->fs_info may not be reliable here, so - * pass in a NULL instead. This avoids a - * possible use-after-free when the fs_info and - * fs_info->sb are already torn down. - */ btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, @@ -954,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(device->fs_info, + btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), path, current->comm, @@ -972,6 +955,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } + device->devt = path_devt; } /* @@ -1331,12 +1315,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return disk_super; } -int btrfs_forget_devices(const char *path) +int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); - ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; @@ -1385,10 +1369,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, } device = device_list_add(path, disk_super, &new_device_added); - if (!IS_ERR(device)) { - if (new_device_added) - btrfs_free_stale_devices(path, device); - } + if (!IS_ERR(device) && new_device_added) + btrfs_free_stale_devices(device->devt, device); btrfs_release_disk_super(disk_super); @@ -1914,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1941,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2096,12 +2064,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted @@ -2111,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2119,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2152,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2242,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2253,7 +2231,7 @@ error_undo: device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) @@ -2606,7 +2584,6 @@ error: int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; - struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -2668,6 +2645,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) @@ -2679,7 +2659,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_free_zone; } - q = bdev_get_queue(bdev); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2727,7 +2706,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(q)) + if (!blk_queue_nonrot(bdev_get_queue(bdev))) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2814,7 +2793,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. */ - btrfs_forget_devices(device_path); + btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); @@ -3251,6 +3230,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) u64 length; int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "relocate: not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * Prevent races with automatic removal of unused block groups. * After we relocate and before we remove the chunk with offset @@ -4445,10 +4430,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } @@ -7060,6 +7047,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, } #endif +static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, uuid, true); + return ERR_PTR(-ENOENT); + } + + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); + if (IS_ERR(dev)) { + btrfs_err(fs_info, "failed to init missing device %llu: %ld", + devid, PTR_ERR(dev)); + return dev; + } + btrfs_report_missing_device(fs_info, devid, uuid, false); + + return dev; +} + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -7147,28 +7155,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!map->stripes[i].dev && - !btrfs_test_opt(fs_info, DEGRADED)) { - free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid, true); - return -ENOENT; - } if (!map->stripes[i].dev) { - map->stripes[i].dev = - add_missing_dev(fs_info->fs_devices, devid, - uuid); + map->stripes[i].dev = handle_missing_device(fs_info, + devid, uuid); if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - btrfs_err(fs_info, - "failed to init missing dev %llu: %ld", - devid, PTR_ERR(map->stripes[i].dev)); return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid, false); } + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); - } write_lock(&map_tree->lock); @@ -8299,10 +8296,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); + sb_end_write(fs_info->sb); return -EBUSY; } @@ -8330,6 +8329,7 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 005c9e2a491a..bd297f23d19e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -72,6 +72,11 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. + */ + dev_t devt; unsigned long dev_state; blk_status_t last_flush_error; @@ -505,7 +510,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); -int btrfs_forget_devices(const char *path); +int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f559d517c7c4..1b1b310c3c51 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -652,8 +652,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (model == BLK_ZONED_HM || (model == BLK_ZONED_HA && incompat_zoned) || (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info = - device->zone_info; + struct btrfs_zoned_device_info *zone_info; zone_info = device->zone_info; zoned_devices++; @@ -1215,12 +1214,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; - u64 physical = 0; int ret; int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; u64 *caps = NULL; + u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1264,6 +1263,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if (!physical) { + ret = -ENOMEM; + goto out; + } + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; @@ -1277,14 +1282,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; - physical = map->stripes[i].physical; + physical[i] = map->stripes[i].physical; if (device->bdev == NULL) { alloc_offsets[i] = WP_MISSING_DEV; continue; } - is_sequential = btrfs_dev_is_sequential(device, physical); + is_sequential = btrfs_dev_is_sequential(device, physical[i]); if (is_sequential) num_sequential++; else @@ -1299,21 +1304,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * This zone will be used for allocation, so mark this zone * non-empty. */ - btrfs_dev_clear_zone_empty(device, physical); + btrfs_dev_clear_zone_empty(device, physical[i]); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. */ - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical, &zone); + ret = btrfs_get_dev_zone(device, physical[i], &zone); memalloc_nofs_restore(nofs_flag); if (ret == -EIO || ret == -EOPNOTSUPP) { ret = 0; @@ -1339,7 +1344,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical >> device->zone_info->zone_size_shift, + physical[i] >> device->zone_info->zone_size_shift, rcu_str_deref(device->name), device->devid); alloc_offsets[i] = WP_MISSING_DEV; break; @@ -1404,7 +1409,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (alloc_offsets[0] == WP_MISSING_DEV) { btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", - physical); + physical[0]); ret = -EIO; goto out; } @@ -1413,6 +1418,42 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + cache->zone_is_active = test_bit(0, active); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID0: case BTRFS_BLOCK_GROUP_RAID10: @@ -1465,6 +1506,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); + kfree(physical); kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1759,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1781,50 +1822,55 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) struct btrfs_device *device; u64 physical; bool ret; + int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return true; spin_lock(&block_group->lock); - if (block_group->zone_is_active) { ret = true; goto out_unlock; } - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; - if (!btrfs_dev_set_active_zone(device, physical)) { - /* Cannot activate the zone */ - ret = false; - goto out_unlock; - } + if (device->zone_info->max_active_zones == 0) + continue; - /* Successfully activated all the zones */ - block_group->zone_is_active = 1; + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } + + /* Successfully activated all the zones */ + if (i == map->num_stripes - 1) + block_group->zone_is_active = 1; + + + } spin_unlock(&block_group->lock); - /* For the active block group list */ - btrfs_get_block_group(block_group); + if (block_group->zone_is_active) { + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(list_empty(&block_group->active_bg_list)); - list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } return true; @@ -1840,19 +1886,12 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) struct btrfs_device *device; u64 physical; int ret = 0; + int i; if (!btrfs_is_zoned(fs_info)) return 0; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return 0; spin_lock(&block_group->lock); if (!block_group->zone_is_active) { @@ -1904,41 +1943,48 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); - btrfs_dec_block_group_ro(block_group); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; - if (!ret) { - btrfs_dev_clear_active_zone(device, physical); + if (device->zone_info->max_active_zones == 0) + continue; - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + + if (ret) + return ret; - /* For active_bg_list */ - btrfs_put_block_group(block_group); + btrfs_dev_clear_active_zone(device, physical); } + btrfs_dec_block_group_ro(block_group); - return ret; + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + return 0; } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1950,7 +1996,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..2b5561ae5d0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -53,7 +53,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint hint, struct writeback_control *wbc); + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -613,17 +613,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. */ -int __set_page_dirty_buffers(struct page *page) +bool block_dirty_folio(struct address_space *mapping, struct folio *folio) { - int newly_dirty; - struct address_space *mapping = page_mapping(page); - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); + struct buffer_head *head; + bool newly_dirty; spin_lock(&mapping->private_lock); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + head = folio_buffers(folio); + if (head) { struct buffer_head *bh = head; do { @@ -635,21 +632,21 @@ int __set_page_dirty_buffers(struct page *page) * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); + folio_memcg_lock(folio); + newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __folio_mark_dirty(folio, mapping, 1); - unlock_page_memcg(page); + folio_memcg_unlock(folio); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(block_dirty_folio); /* * Write out and wait upon a list of buffers. @@ -1235,16 +1232,18 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + bh_lru_lock(); + /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) + if (lru_cache_disabled()) { + bh_lru_unlock(); return; - - bh_lru_lock(); + } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { @@ -1482,41 +1481,40 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part or all of a buffer-backed page - * - * @page: the page which is affected + * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. + * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * - * block_invalidatepage() is called when all or part of the page has become + * block_invalidate_folio() is called when all or part of the folio has been * invalidated by a truncate operation. * - * block_invalidatepage() does not have to release all buffers, but it must + * block_invalidate_folio() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void block_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; - unsigned int curr_off = 0; - unsigned int stop = length + offset; + size_t curr_off = 0; + size_t stop = length + offset; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; + BUG_ON(!folio_test_locked(folio)); /* * Check for overflow */ - BUG_ON(stop > PAGE_SIZE || stop < length); + BUG_ON(stop > folio_size(folio) || stop < length); + + head = folio_buffers(folio); + if (!head) + return; - head = page_buffers(page); bh = head; do { - unsigned int next_off = curr_off + bh->b_size; + size_t next_off = curr_off + bh->b_size; next = bh->b_this_page; /* @@ -1535,21 +1533,21 @@ void block_invalidatepage(struct page *page, unsigned int offset, } while (bh != head); /* - * We release buffers only if the entire page is being invalidated. + * We release buffers only if the entire folio is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ - if (length == PAGE_SIZE) - try_to_release_page(page, 0); + if (length == folio_size(folio)) + filemap_release_folio(folio, 0); out: return; } -EXPORT_SYMBOL(block_invalidatepage); +EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1724,12 +1722,12 @@ int __block_write_full_page(struct inode *inode, struct page *page, (1 << BH_Dirty)|(1 << BH_Uptodate)); /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ @@ -1806,8 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1861,8 +1858,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2206,29 +2202,27 @@ int generic_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(generic_write_end); /* - * block_is_partially_uptodate checks whether buffers within a page are + * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * - * Returns true if all buffers which correspond to a file portion - * we want to read are uptodate. + * Returns true if all buffers which correspond to the specified part + * of the folio are uptodate. */ -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; - int ret = 1; + bool ret = true; - if (!page_has_buffers(page)) - return 0; - - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) + return false; blocksize = head->b_size; - to = min_t(unsigned, PAGE_SIZE - from, count); + to = min_t(unsigned, folio_size(folio) - from, count); to = from + to; - if (from < blocksize && to > PAGE_SIZE - blocksize) - return 0; + if (from < blocksize && to > folio_size(folio) - blocksize) + return false; bh = head; block_start = 0; @@ -2236,7 +2230,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { - ret = 0; + ret = false; break; } if (block_end >= to) @@ -2358,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_CONT_EXPAND, &page, &fsdata); + err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (err) goto out; @@ -3008,7 +3001,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint write_hint, struct writeback_control *wbc) + struct writeback_control *wbc) { struct bio *bio; @@ -3024,13 +3017,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); - bio = bio_alloc(GFP_NOIO, 1); + if (buffer_meta(bh)) + op_flags |= REQ_META; + if (buffer_prio(bh)) + op_flags |= REQ_PRIO; + + bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); - bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3038,12 +3034,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - if (buffer_meta(bh)) - op_flags |= REQ_META; - if (buffer_prio(bh)) - op_flags |= REQ_PRIO; - bio_set_op_attrs(bio, op, op_flags); - /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); @@ -3058,7 +3048,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3185,7 +3175,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * total exclusion from block_dirty_folio(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. @@ -3252,7 +3242,7 @@ int try_to_free_buffers(struct page *page) * the page also. * * private_lock must be held over this entire operation in order - * to synchronise against __set_page_dirty_buffers and prevent the + * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 51c968cd00a6..ae93cee9d25d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -254,7 +254,7 @@ static bool cachefiles_shorten_object(struct cachefiles_object *object, ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, - new_size, dio_size); + new_size, dio_size - new_size); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 753986ea1583..9dc81e781f2b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -138,7 +138,6 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos + skipped; ki->iocb.ki_flags = IOCB_DIRECT; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->skipped = skipped; ki->object = object; @@ -313,7 +312,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos; ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->object = object; ki->inval_counter = cres->inval_counter; @@ -382,18 +380,18 @@ presubmission_error: * Prepare a read operation, shortening it to a cached/uncached * boundary as appropriate. */ -static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, loff_t i_size) { enum cachefiles_prepare_read_trace why; - struct netfs_read_request *rreq = subreq->rreq; + struct netfs_io_request *rreq = subreq->rreq; struct netfs_cache_resources *cres = &rreq->cache_resources; struct cachefiles_object *object; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); - enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER; + enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; @@ -406,7 +404,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); why = cachefiles_trace_read_no_data; goto out_no_object; } @@ -475,7 +473,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque goto out; download_and_store: - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); out: cachefiles_end_secure(cache, saved_cred); out_no_object: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f256c8aff7bb..ca9f3e4ec4b3 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -57,6 +57,16 @@ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, trace_cachefiles_mark_inactive(object, inode); } +static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct inode *inode = d_backing_inode(dentry); + + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, dentry); + inode_unlock(inode); +} + /* * Unmark a backing inode and tell cachefilesd that there's something that can * be culled. @@ -68,9 +78,7 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct inode *inode = file_inode(file); if (inode) { - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, file->f_path.dentry); - inode_unlock(inode); + cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { atomic_long_add(inode->i_blocks, &cache->b_released); @@ -484,7 +492,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) object, d_backing_inode(path.dentry), ret, cachefiles_trace_trunc_error); file = ERR_PTR(ret); - goto out_dput; + goto out_unuse; } } @@ -494,15 +502,20 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), PTR_ERR(file), cachefiles_trace_open_error); - goto out_dput; + goto out_unuse; } if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); file = ERR_PTR(-EINVAL); + goto out_unuse; } + goto out_dput; + +out_unuse: + cachefiles_do_unmark_inode_in_use(object, path.dentry); out_dput: dput(path.dentry); out: @@ -590,14 +603,16 @@ static bool cachefiles_open_file(struct cachefiles_object *object, check_failed: fscache_cookie_lookup_negative(object->cookie); cachefiles_unmark_inode_in_use(object, file); - if (ret == -ESTALE) { - fput(file); - dput(dentry); + fput(file); + dput(dentry); + if (ret == -ESTALE) return cachefiles_create_file(object); - } + return false; + error_fput: fput(file); error: + cachefiles_do_unmark_inode_in_use(object, dentry); dput(dentry); return false; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 83f41bd0c3a9..00b087c14995 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -28,6 +28,11 @@ struct cachefiles_xattr { static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; +struct cachefiles_vol_xattr { + __be32 reserved; /* Reserved, should be 0 */ + __u8 data[]; /* netfs volume coherency data */ +} __packed; + /* * set the state xattr on a cache file */ @@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie) */ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) { + struct cachefiles_vol_xattr *buf; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; struct dentry *dentry = volume->dentry; @@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) _enter("%x,#%d", volume->vcookie->debug_id, len); + len += sizeof(*buf); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + buf->reserved = cpu_to_be32(0); + memcpy(buf->data, p, volume->vcookie->coherency_len); + ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - p, len, 0); + buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, cachefiles_trace_setxattr_error); @@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) cachefiles_coherency_vol_set_ok); } + kfree(buf); _leave(" = %d", ret); return ret == 0; } @@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) */ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - struct cachefiles_xattr *buf; + struct cachefiles_vol_xattr *buf; struct dentry *dentry = volume->dentry; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; @@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) _enter(""); + len += sizeof(*buf); buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) "Failed to read xattr with error %zd", xlen); } why = cachefiles_coherency_vol_check_xattr; - } else if (memcmp(buf->data, p, len) != 0) { + } else if (buf->reserved != cpu_to_be32(0)) { + why = cachefiles_coherency_vol_check_resv; + } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) { why = cachefiles_coherency_vol_check_cmp; } else { why = cachefiles_coherency_vol_check_ok; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c98e5238a1b6..aa25bffd4823 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page) * Dirty a page. Optimistically adjust accounting, on the assumption * that we won't race with invalidate. If we do, readjust. */ -static int ceph_set_page_dirty(struct page *page) +static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - if (PageDirty(page)) { - dout("%p set_page_dirty %p idx %lu -- already dirty\n", - mapping->host, page, page->index); - BUG_ON(!PagePrivate(page)); - return 0; + if (folio_test_dirty(folio)) { + dout("%p dirty_folio %p idx %lu -- already dirty\n", + mapping->host, folio, folio->index); + BUG_ON(!folio_get_private(folio)); + return false; } inode = mapping->host; @@ -111,56 +110,56 @@ static int ceph_set_page_dirty(struct page *page) if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; - dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", - mapping->host, page, page->index, + mapping->host, folio, folio->index, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. + * Reference snap context in folio->private. Also set + * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(PagePrivate(page)); - attach_page_private(page, snapc); + BUG_ON(folio_get_private(folio)); + folio_attach_private(folio, snapc); - return ceph_fscache_set_page_dirty(page); + return ceph_fscache_dirty_folio(mapping, folio); } /* - * If we are truncating the full page (i.e. offset == 0), adjust the - * dirty page counters appropriately. Only called if there is private - * data on the page. + * If we are truncating the full folio (i.e. offset == 0), adjust the + * dirty folio counters appropriately. Only called if there is private + * data on the folio. */ -static void ceph_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void ceph_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - inode = page->mapping->host; + inode = folio->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != thp_size(page)) { - dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", - inode, page, page->index, offset, length); + if (offset != 0 || length != folio_size(folio)) { + dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n", + inode, folio->index, offset, length); return; } - WARN_ON(!PageLocked(page)); - if (PagePrivate(page)) { - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); + WARN_ON(!folio_test_locked(folio)); + if (folio_get_private(folio)) { + dout("%p invalidate_folio idx %lu full dirty page\n", + inode, folio->index); - snapc = detach_page_private(page); + snapc = folio_detach_private(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); } - wait_on_page_fscache(page); + folio_wait_fscache(folio); } static int ceph_releasepage(struct page *page, gfp_t gfp) @@ -183,9 +182,9 @@ static int ceph_releasepage(struct page *page, gfp_t gfp) return 1; } -static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) +static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) { - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; u32 blockoff; @@ -200,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) rreq->len = roundup(rreq->len, lo->stripe_unit); } -static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { - struct inode *inode = subreq->rreq->mapping->host; + struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; @@ -219,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) { struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - struct netfs_read_subrequest *subreq = req->r_priv; + struct netfs_io_subrequest *subreq = req->r_priv; int num_pages; int err = req->r_result; @@ -245,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req) iput(req->r_inode); } -static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_info_in *iinfo; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct iov_iter iter; + ssize_t err = 0; + size_t len; + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + + if (subreq->start >= inode->i_size) + goto out; + + /* We need to fetch the inline data. */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_ino1 = ci->i_vino; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + iinfo = &rinfo->targeti; + if (iinfo->inline_version == CEPH_INLINE_NONE) { + /* The data got uninlined */ + ceph_mdsc_put_request(req); + return false; + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); + if (err == 0) + err = -EFAULT; + + ceph_mdsc_put_request(req); +out: + netfs_subreq_terminated(subreq, err, false); + return true; +} + +static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; - struct inode *inode = rreq->mapping->host; + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; @@ -259,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ci->i_inline_version != CEPH_INLINE_NONE && + ceph_netfs_issue_op_inline(subreq)) + return; + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, @@ -297,6 +353,45 @@ out: dout("%s: result %d\n", __func__, err); } +static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) +{ + struct inode *inode = rreq->inode; + int got = 0, want = CEPH_CAP_FILE_CACHE; + int ret = 0; + + if (rreq->origin != NETFS_READAHEAD) + return 0; + + if (file) { + struct ceph_rw_context *rw_ctx; + struct ceph_file_info *fi = file->private_data; + + rw_ctx = ceph_find_rw_context(fi); + if (rw_ctx) + return 0; + } + + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). + */ + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + return ret; + } + + if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + return -EACCES; + } + if (ret == 0) + return -EACCES; + + rreq->netfs_priv = (void *)(uintptr_t)got; + return 0; +} + static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) { struct inode *inode = mapping->host; @@ -307,78 +402,16 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) ceph_put_cap_refs(ci, got); } -static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .is_cache_enabled = ceph_is_cache_enabled, +const struct netfs_request_ops ceph_netfs_ops = { + .init_request = ceph_init_request, .begin_cache_operation = ceph_begin_cache_operation, - .issue_op = ceph_netfs_issue_op, + .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, .cleanup = ceph_readahead_cleanup, }; -/* read a single page, without unlocking it. */ -static int ceph_readpage(struct file *file, struct page *subpage) -{ - struct folio *folio = page_folio(subpage); - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vino vino = ceph_vino(inode); - size_t len = folio_size(folio); - u64 off = folio_file_pos(folio); - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) { - folio_unlock(folio); - return -EINVAL; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); - - return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); -} - -static void ceph_readahead(struct readahead_control *ractl) -{ - struct inode *inode = file_inode(ractl->file); - struct ceph_file_info *fi = ractl->file->private_data; - struct ceph_rw_context *rw_ctx; - int got = 0; - int ret = 0; - - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return; - - rw_ctx = ceph_find_rw_context(fi); - if (!rw_ctx) { - /* - * readahead callers do not necessarily hold Fcb caps - * (e.g. fadvise, madvise). - */ - int want = CEPH_CAP_FILE_CACHE; - - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); - if (ret < 0) - dout("start_read %p, error getting cap\n", inode); - else if (!(got & want)) - dout("start_read %p, no cache cap\n", inode); - - if (ret <= 0) - return; - } - netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); -} - #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { @@ -516,6 +549,7 @@ static u64 get_writepages_data_length(struct inode *inode, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -550,8 +584,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { - dout("%p page eof %llu\n", page, ceph_wbc.i_size); - page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); + dout("folio at %lu beyond eof %llu\n", folio->index, + ceph_wbc.i_size); + folio_invalidate(folio, 0, folio_size(folio)); return 0; } @@ -563,7 +598,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = true; req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, @@ -623,7 +658,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = false; return err; } @@ -635,6 +670,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); + if (wbc->sync_mode == WB_SYNC_NONE && + ceph_inode_to_client(inode)->write_congested) + return AOP_WRITEPAGE_ACTIVATE; + wait_on_page_fscache(page); err = writepage_nounlock(page, wbc); @@ -707,8 +746,7 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); + fsc->write_congested = false; ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); @@ -760,6 +798,10 @@ static int ceph_writepages_start(struct address_space *mapping, bool done = false; bool caching = ceph_is_cache_enabled(inode); + if (wbc->sync_mode == WB_SYNC_NONE && + fsc->write_congested) + return 0; + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -867,14 +909,16 @@ get_more_pages: continue; } if (page_offset(page) >= ceph_wbc.i_size) { - dout("%p page eof %llu\n", - page, ceph_wbc.i_size); + struct folio *folio = page_folio(page); + + dout("folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || - page_offset(page) >= i_size_read(inode)) && - clear_page_dirty_for_io(page)) - mapping->a_ops->invalidatepage(page, - 0, thp_size(page)); - unlock_page(page); + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, + folio_size(folio)); + folio_unlock(folio); continue; } if (strip_unit_end && (page->index > strip_unit_end)) { @@ -954,11 +998,8 @@ get_more_pages: if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) { - set_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - } - + fsc->mount_options->congestion_kb)) + fsc->write_congested = true; pages[locked_pages++] = page; pvec.pages[i] = NULL; @@ -1274,45 +1315,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; - pgoff_t index = pos >> PAGE_SHIFT; int r; - /* - * Uninlining should have already been done and everything updated, EXCEPT - * for inline_version sent to the MDS. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) { - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - /* - * The inline_version on a new inode is set to 1. If that's the - * case, then the folio is brand new and isn't yet Uptodate. - */ - r = 0; - if (index == 0 && ci->i_inline_version != 1) { - if (!folio_test_uptodate(folio)) { - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", - ci->i_inline_version); - r = -EINVAL; - } - goto out; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - goto out; - } - - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, - &ceph_netfs_read_ops, NULL); -out: + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL); if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1366,14 +1372,14 @@ out: } const struct address_space_operations ceph_aops = { - .readpage = ceph_readpage, - .readahead = ceph_readahead, + .readpage = netfs_readpage, + .readahead = netfs_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, - .set_page_dirty = ceph_set_page_dirty, - .invalidatepage = ceph_invalidatepage, + .dirty_folio = ceph_dirty_folio, + .invalidate_folio = ceph_invalidate_folio, .releasepage = ceph_releasepage, .direct_IO = noop_direct_IO, }; @@ -1508,19 +1514,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - struct page *locked_page = NULL; - if (off == 0) { - lock_page(page); - locked_page = page; - } - err = ceph_uninline_data(vma->vm_file, locked_page); - if (locked_page) - unlock_page(locked_page); - if (err < 0) - goto out_free; - } - if (off + thp_size(page) <= size) len = thp_size(page); else @@ -1577,11 +1570,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_snap_context(snapc); } while (err == 0); - if (ret == VM_FAULT_LOCKED || - ci->i_inline_version != CEPH_INLINE_NONE) { + if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1645,16 +1636,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } -int ceph_uninline_data(struct file *filp, struct page *locked_page) +int ceph_uninline_data(struct file *file) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - struct page *page = NULL; - u64 len, inline_version; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; + struct page *pages[1]; int err = 0; - bool from_pagecache = false; + u64 len; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; @@ -1665,45 +1670,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) - goto out; + goto out_unlock; - if (locked_page) { - page = locked_page; - WARN_ON(!PageUptodate(page)); - } else if (ceph_caps_issued(ci) & - (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { - page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - from_pagecache = true; - lock_page(page); - } else { - put_page(page); - page = NULL; - } - } - } - - if (page) { - len = i_size_read(inode); - if (len > PAGE_SIZE) - len = PAGE_SIZE; - } else { - page = __page_cache_alloc(GFP_NOFS); - if (!page) { - err = -ENOMEM; - goto out; - } - err = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, true); - if (err < 0) { - /* no inline data */ - if (err == -ENODATA) - err = 0; - goto out; - } - len = err; - } + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, @@ -1711,7 +1682,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } req->r_mtime = inode->i_mtime; @@ -1720,7 +1691,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) - goto out; + goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, @@ -1729,10 +1700,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); @@ -1742,7 +1714,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) - goto out_put; + goto out_put_req; } { @@ -1753,7 +1725,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) "inline_version", xattr_buf, xattr_len, 0, 0); if (err) - goto out_put; + goto out_put_req; } req->r_mtime = inode->i_mtime; @@ -1764,19 +1736,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); -out_put: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + } +out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; +out_unlock: + folio_unlock(folio); + folio_put(folio); out: - if (page && page != locked_page) { - if (from_pagecache) { - unlock_page(page); - put_page(page); - } else - __free_pages(page, 0); - } - + ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", inode, ceph_vinop(inode), inline_version, err); return err; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 7d22850623ef..ddea99922073 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,26 +29,25 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!(inode->i_state & I_NEW)) return; - WARN_ON_ONCE(ci->fscache); + WARN_ON_ONCE(ci->netfs_ctx.cache); - ci->fscache = fscache_acquire_cookie(fsc->fscache, 0, - &ci->i_vino, sizeof(ci->i_vino), - &ci->i_version, sizeof(ci->i_version), - i_size_read(inode)); + ci->netfs_ctx.cache = + fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); } -void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) { - struct fscache_cookie *cookie = ci->fscache; - - fscache_relinquish_cookie(cookie, false); + fscache_relinquish_cookie(ceph_fscache_cookie(ci), false); } void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) { struct ceph_inode_info *ci = ceph_inode(inode); - fscache_use_cookie(ci->fscache, will_modify); + fscache_use_cookie(ceph_fscache_cookie(ci), will_modify); } void ceph_fscache_unuse_cookie(struct inode *inode, bool update) @@ -58,9 +57,10 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update) if (update) { loff_t i_size = i_size_read(inode); - fscache_unuse_cookie(ci->fscache, &ci->i_version, &i_size); + fscache_unuse_cookie(ceph_fscache_cookie(ci), + &ci->i_version, &i_size); } else { - fscache_unuse_cookie(ci->fscache, NULL, NULL); + fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL); } } @@ -69,14 +69,14 @@ void ceph_fscache_update(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); loff_t i_size = i_size_read(inode); - fscache_update_cookie(ci->fscache, &ci->i_version, &i_size); + fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size); } void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { struct ceph_inode_info *ci = ceph_inode(inode); - fscache_invalidate(ceph_inode(inode)->fscache, + fscache_invalidate(ceph_fscache_cookie(ci), &ci->i_version, i_size_read(inode), dio_write ? FSCACHE_INVAL_DIO_WRITE : 0); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 09164389fa66..7255b790a4c1 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -26,14 +26,9 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update); void ceph_fscache_update(struct inode *inode); void ceph_fscache_invalidate(struct inode *inode, bool dio_write); -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; -} - static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return ci->fscache; + return netfs_i_cookie(&ci->vfs_inode); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) @@ -54,15 +49,15 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode, fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); } -static inline int ceph_fscache_set_page_dirty(struct page *page) +static inline int ceph_fscache_dirty_folio(struct address_space *mapping, + struct folio *folio) { - struct inode *inode = page->mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_info *ci = ceph_inode(mapping->host); - return fscache_set_page_dirty(page, ceph_fscache_cookie(ci)); + return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); @@ -91,10 +86,6 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { } -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ -} - static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } @@ -133,9 +124,10 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode, { } -static inline int ceph_fscache_set_page_dirty(struct page *page) +static inline int ceph_fscache_dirty_folio(struct address_space *mapping, + struct folio *folio) { - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } static inline bool ceph_is_cache_enabled(struct inode *inode) @@ -143,7 +135,7 @@ static inline bool ceph_is_cache_enabled(struct inode *inode) return false; } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { return -ENOBUFS; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b472cd066d1c..f1ad6884d4da 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ceph_get_mds_session(session); spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + /* Don't send messages until we get async create reply */ + spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); + return; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: @@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { + err = ceph_wait_on_async_create(inode); + if (err) + return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, @@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, u64 first_tid = 0; u64 last_snap_flush = 0; + /* Don't do anything until create reply comes in */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) + return; + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { @@ -4152,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); @@ -4178,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } goto flush_cap_releases; } + ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3cf7c9c1085b..bec3c4549c07 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; - s64 total, sum, avg, min, max, sq; + s64 total, avg, min, max, sq; int i; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; - sum = m->latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + avg = m->latency_avg; min = m->latency_min; max = m->latency_max; sq = m->latency_sq_sum; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 133dbd9338e7..eae417d71136 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_mutex, no need to use page lock */ + i_rwsem, no need to use page lock */ unlock_page(cache_ctl->page); cache_ctl->dentries = kmap(cache_ctl->page); } @@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, rcu_read_lock(); spin_lock(&parent->d_lock); /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ + * marked as complete while not holding the i_rwsem. */ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) dentry = cache_ctl->dentries[cache_ctl->index]; else @@ -478,8 +478,11 @@ more: 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ more: if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ dout("filldir stopping us...\n"); return 0; } @@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ /* .snap dir? */ if (ceph_snap(parent) == CEPH_NOSNAP && diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bbed3224ad68..6c9e837aa1d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_mount_options *opt = ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; + int ret; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, inode->i_mode, isdir ? "dir" : "regular"); @@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + if ((file->f_mode & FMODE_WRITE) && + ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file); + if (ret < 0) + goto error; + } + return 0; + +error: + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return ret; } /* @@ -516,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino) } } +static void wake_async_create_waiters(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); +} + static void ceph_async_create_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct inode *dinode = d_inode(dentry); + struct inode *tinode = req->r_target_inode; int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); + WARN_ON_ONCE(dinode && tinode && dinode != tinode); + + /* MDS changed -- caller must resubmit */ if (result == -EJUKEBOX) goto out; mapping_set_error(req->r_parent->i_mapping, result); if (result) { - struct dentry *dentry = req->r_dentry; - struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) d_drop(dentry); - ceph_inode_shutdown(inode); - - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + if (dinode) { + mapping_set_error(dinode->i_mapping, result); + ceph_inode_shutdown(dinode); + wake_async_create_waiters(dinode, req->r_session); + } } - if (req->r_target_inode) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - u64 ino = ceph_vino(req->r_target_inode).ino; + if (tinode) { + u64 ino = ceph_vino(tinode).ino; if (req->r_deleg_ino != ino) pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", __func__, req->r_err, req->r_deleg_ino, ino); - mapping_set_error(req->r_target_inode->i_mapping, result); - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); - } - ceph_kick_flushing_inode_caps(req->r_session, ci); - spin_unlock(&ci->i_ceph_lock); + mapping_set_error(tinode->i_mapping, result); + wake_async_create_waiters(tinode, req->r_session); } else if (!result) { pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, req->r_deleg_ino); @@ -1041,7 +1072,6 @@ static void ceph_aio_complete(struct inode *inode, } spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &aio_req->prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1778,12 +1808,6 @@ retry_snap: if (err) goto out; - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(file, NULL); - if (err < 0) - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) @@ -1845,7 +1869,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_perform_write(file, from, pos); + written = generic_perform_write(iocb, from); if (likely(written >= 0)) iocb->ki_pos = pos + written; ceph_end_io_write(inode); @@ -1855,7 +1879,6 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2109,12 +2132,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ci->i_inline_version != CEPH_INLINE_NONE) { - ret = ceph_uninline_data(file, NULL); - if (ret < 0) - goto unlock; - } - size = i_size_read(inode); /* Are we punching a hole beyond EOF? */ @@ -2139,7 +2156,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2532,7 +2548,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); - dst_ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&dst_ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4a980a7bf3..63113e2a4890 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent) if (!S_ISDIR(parent->i_mode)) { pr_warn_once("bad snapdir parent type (mode=0%o)\n", parent->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once("bad snapdir inode type (mode=0%o)\n", inode->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } inode->i_mode = parent->i_mode; @@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode->i_state & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(-ENOTDIR); } const struct inode_operations ceph_file_iops = { @@ -447,12 +453,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb) struct ceph_inode_info *ci; int i; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); if (!ci) return NULL; dout("alloc_inode %p\n", &ci->vfs_inode); + /* Set parameters for the netfs library */ + netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); + spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; @@ -538,9 +547,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - - ceph_fscache_inode_init(ci); - return &ci->vfs_inode; } @@ -1201,7 +1207,7 @@ out_unlock: /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. + * caller must hold directory i_rwsem for this to be safe. */ static int splice_dentry(struct dentry **pdn, struct inode *in) { @@ -1598,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, return idx == 0 ? -ENOMEM : 0; } /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ + * i_rwsem, no need to use page lock */ unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); if (idx == 0) @@ -2301,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, return err; } +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if (size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + dout("do_getvxattr result=%d\n", err); + return err; +} + /* * Check inode permissions. We verify we have a valid value for diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d1f154aec249..3e2843e86e27 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; - if (wait) - req->r_wait_for_completion = ceph_lock_wait_for_completion; - - err = ceph_mdsc_do_request(mdsc, inode, req); + err = ceph_mdsc_submit_request(mdsc, inode, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, wait ? + ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c30eefc0ac19..fa38c013126d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -555,6 +555,28 @@ bad: return -EIO; } +static int parse_reply_info_getvxattr(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 value_len; + + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ + ceph_decode_skip_32(p, end, bad); /* skip payload length */ + + ceph_decode_32_safe(p, end, value_len, bad); + + if (value_len == end - *p) { + info->xattr_info.xattr_value = *p; + info->xattr_info.xattr_value_len = value_len; + *p = end; + return value_len; + } +bad: + return -EIO; +} + /* * parse extra results */ @@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end, return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); + else if (op == CEPH_MDS_OP_GETVXATTR) + return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } @@ -2178,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, + __GFP_NOWARN | + __GFP_ZERO, order); if (rinfo->dir_entries) break; @@ -2946,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, return err; } -static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func) { int err; /* wait */ dout("do_request waiting\n"); - if (!req->r_timeout && req->r_wait_for_completion) { - err = req->r_wait_for_completion(mdsc, req); + if (wait_func) { + err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, @@ -3011,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) - err = ceph_mdsc_wait_request(mdsc, req); + err = ceph_mdsc_wait_request(mdsc, req, NULL); dout("do_request %p done, result %d\n", req, err); return err; } @@ -3097,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); - /* - * Handle an ESTALE - * if we're not talking to the authority, send to them - * if the authority has changed while we weren't looking, - * send to new authority - * Otherwise we just have to return an ESTALE - */ - if (result == -ESTALE) { - dout("got ESTALE on request %llu\n", req->r_tid); - req->r_resend_mds = -1; - if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now\n"); - req->r_direct_mode = USE_AUTH_MDS; - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } else { - int mds = __choose_mds(mdsc, req, NULL); - if (mds >= 0 && mds != req->r_session->s_mds) { - dout("but auth changed, so resending\n"); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } - } - dout("have to return ESTALE on request %llu\n", req->r_tid); - } - - if (head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); @@ -4841,7 +4838,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); - ceph_cleanup_empty_realms(mdsc); + ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 97c7f7bfa55f..33497846e47e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry { loff_t offset; }; +struct ceph_mds_reply_xattr { + char *xattr_value; + size_t xattr_value_len; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed { char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; + struct ceph_mds_reply_xattr xattr_info; /* extra */ union { @@ -274,8 +280,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - const struct cred *r_cred; int r_request_release_offset; + const struct cred *r_cred; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -296,12 +302,11 @@ struct ceph_mds_request { struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - + u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; - u32 r_readdir_offset; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -329,7 +334,6 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; - ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; @@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 0fcba68f9a99..c47347d2e84e 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -8,6 +8,12 @@ #include "metric.h" #include "mds_client.h" +static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) +{ + struct timespec64 t = ktime_to_timespec64(val); + ceph_encode_timespec64(ts, &t); +} + static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { @@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; - struct timespec64 ts; s64 sum; s32 items = 0; s32 len; @@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->header.ver = 1; + read->header.ver = 2; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; - jiffies_to_timespec64(sum, &ts); - read->sec = cpu_to_le32(ts.tv_sec); - read->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&read->lat, sum); + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); + read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->header.ver = 1; + write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; - jiffies_to_timespec64(sum, &ts); - write->sec = cpu_to_le32(ts.tv_sec); - write->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&write->lat, sum); + ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->header.ver = 1; + meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; - jiffies_to_timespec64(sum, &ts); - meta->sec = cpu_to_le32(ts.tv_sec); - meta->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&meta->lat, sum); + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ @@ -250,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m) metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; + metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; @@ -307,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m) max = new; \ } -static inline void __update_stdev(ktime_t total, ktime_t lsum, - ktime_t *sq_sump, ktime_t lat) +static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, + ktime_t *sq_sump, ktime_t lat) { - ktime_t avg, sq; - - if (unlikely(total == 1)) - return; - - /* the sq is (lat - old_avg) * (lat - new_avg) */ - avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); - sq = lat - avg; - avg = DIV64_U64_ROUND_CLOSEST(lsum, total); - sq = sq * (lat - avg); - *sq_sump += sq; + ktime_t avg; + + if (unlikely(total == 1)) { + *lavg = lat; + } else { + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = *lavg + div64_s64(lat - *lavg, total); + *sq_sump += (lat - *lavg)*(lat - avg); + *lavg = avg; + } } void ceph_update_metrics(struct ceph_metric *m, @@ -339,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m, METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); - __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, + lat); spin_unlock(&m->lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index bb45608181e7..0d0c44bd3332 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -2,7 +2,7 @@ #ifndef _FS_CEPH_MDS_METRIC_H #define _FS_CEPH_MDS_METRIC_H -#include <linux/types.h> +#include <linux/ceph/types.h> #include <linux/percpu_counter.h> #include <linux/ktime.h> @@ -19,27 +19,39 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_INODES, CLIENT_METRIC_TYPE_READ_IO_SIZES, CLIENT_METRIC_TYPE_WRITE_IO_SIZES, - - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, }; /* * This will always have the highest metric bit value * as the last element of the array. */ -#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ - CLIENT_METRIC_TYPE_CAP_INFO, \ - CLIENT_METRIC_TYPE_READ_LATENCY, \ - CLIENT_METRIC_TYPE_WRITE_LATENCY, \ - CLIENT_METRIC_TYPE_METADATA_LATENCY, \ - CLIENT_METRIC_TYPE_DENTRY_LEASE, \ - CLIENT_METRIC_TYPE_OPENED_FILES, \ - CLIENT_METRIC_TYPE_PINNED_ICAPS, \ - CLIENT_METRIC_TYPE_OPENED_INODES, \ - CLIENT_METRIC_TYPE_READ_IO_SIZES, \ - CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ - \ - CLIENT_METRIC_TYPE_MAX, \ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ } struct ceph_metric_header { @@ -60,22 +72,28 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric dentry lease header */ @@ -140,6 +158,7 @@ struct ceph_metric { u64 size_min; u64 size_max; ktime_t latency_sum; + ktime_t latency_avg; ktime_t latency_sq_sum; ktime_t latency_min; ktime_t latency_max; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b41e6724c591..322ee5add942 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 1); /* for caller */ + /* Do not release the global dummy snaprealm until unmouting */ + if (ino == CEPH_INO_GLOBAL_SNAPREALM) + atomic_set(&realm->nref, 2); + else + atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; - dout("create_snap_realm %llx %p\n", realm->ino, realm); + dout("%s %llx %p\n", __func__, realm->ino, realm); return realm; } @@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("lookup_snap_realm %llx %p\n", r->ino, r); + dout("%s %llx %p\n", __func__, r->ino, r); return r; } } @@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + dout("%s %p %llx\n", __func__, realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; @@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_empty_lock); } -void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { + struct ceph_snap_realm *global_realm; + down_write(&mdsc->snap_rwsem); + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); + if (global_realm) + ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } @@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", - realm->ino, realm, realm->parent_ino, realm->parent, - parentino, parent); + dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, + realm, realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); @@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b) * build the snap context for a given realm. */ static int build_snap_context(struct ceph_snap_realm *realm, - struct list_head* dirty_realms) + struct list_head *realm_queue, + struct list_head *dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm, */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent, dirty_realms); - if (err) - goto fail; + /* add to the queue head */ + list_add(&parent->rebuild_item, realm_queue); + return 1; } num += parent->cached_context->num_snaps; } @@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" - " (unchanged)\n", - realm->ino, realm, realm->cached_context, + dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", + __func__, realm->ino, realm, realm->cached_context, realm->cached_context->seq, (unsigned int)realm->cached_context->num_snaps); return 0; @@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", - realm->ino, realm, snapc, snapc->seq, - (unsigned int) snapc->num_snaps); + dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, + realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; @@ -409,8 +417,7 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, - realm, err); + pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); return err; } @@ -420,13 +427,50 @@ fail: static void rebuild_snap_realms(struct ceph_snap_realm *realm, struct list_head *dirty_realms) { - struct ceph_snap_realm *child; + LIST_HEAD(realm_queue); + int last = 0; + bool skip = false; + + list_add_tail(&realm->rebuild_item, &realm_queue); + + while (!list_empty(&realm_queue)) { + struct ceph_snap_realm *_realm, *child; + + _realm = list_first_entry(&realm_queue, + struct ceph_snap_realm, + rebuild_item); + + /* + * If the last building failed dues to memory + * issue, just empty the realm_queue and return + * to avoid infinite loop. + */ + if (last < 0) { + list_del_init(&_realm->rebuild_item); + continue; + } + + last = build_snap_context(_realm, &realm_queue, dirty_realms); + dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); + + /* is any child in the list ? */ + list_for_each_entry(child, &_realm->children, child_item) { + if (!list_empty(&child->rebuild_item)) { + skip = true; + break; + } + } - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm, dirty_realms); + if (!skip) { + list_for_each_entry(child, &_realm->children, child_item) + list_add_tail(&child->rebuild_item, &realm_queue); + } - list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child, dirty_realms); + /* last == 1 means need to build parent first */ + if (last <= 0) + list_del_init(&_realm->rebuild_item); + } } @@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -static void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap **pcapsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); - if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); - return; - } - capsnap->cap_flush.is_capsnap = true; - INIT_LIST_HEAD(&capsnap->cap_flush.i_list); - INIT_LIST_HEAD(&capsnap->cap_flush.g_list); - spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p already pending\n", inode); + dout("%s %p %llx.%llx already pending\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); + dout("%s %p %llx.%llx nothing dirty|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } @@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { - dout("queue_cap_snap %p " - "no new_snap|dirty_page|writing\n", inode); + dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", - inode, capsnap, old_snapc, ceph_cap_string(dirty), - capsnap->need_flush ? "" : "no_flush"); + dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", + __func__, inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); - refcount_set(&capsnap->nref, 1); - INIT_LIST_HEAD(&capsnap->ci_item); - capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; @@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, + dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", __func__, inode, ceph_vinop(inode), capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. */ __ceph_finish_cap_snap(ci, capsnap); } - capsnap = NULL; + *pcapsnap = NULL; old_snapc = NULL; update_snapc: - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ci->i_head_snapc = NULL; - } else { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); - kfree(capsnap); ceph_put_snap_context(old_snapc); } @@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size, - capsnap->dirty_pages); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); return 0; } /* Fb cap still in use, delay it */ if (ci->i_wb_ref) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "used WRBUFFER, delaying\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "used WRBUFFER, delaying\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); capsnap->writing = 1; return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", - inode, capsnap, capsnap->context, + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); @@ -671,8 +706,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; + struct ceph_cap_snap *capsnap = NULL; - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + dout("%s %p %llx inode\n", __func__, realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { @@ -682,13 +718,35 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci); + + /* + * Allocate the capsnap memory outside of ceph_queue_cap_snap() + * to reduce very possible but unnecessary frequently memory + * allocate/free in this loop. + */ + if (!capsnap) { + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", + inode); + return; + } + } + capsnap->cap_flush.is_capsnap = true; + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + INIT_LIST_HEAD(&capsnap->ci_item); + + ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); + dout("%s %p %llx done\n", __func__, realm, realm->ino); } /* @@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; - int invalidate = 0; + struct ceph_snap_realm *realm_to_rebuild = NULL; + int rebuild_snapcs; int err = -ENOMEM; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("update_snap_trace deletion=%d\n", deletion); + dout("%s deletion=%d\n", __func__, deletion); more: + rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); @@ -738,10 +798,10 @@ more: err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; - invalidate += err; + rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", + dout("%s updating %llx %p %lld -> %lld\n", __func__, realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); @@ -763,22 +823,30 @@ more: if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; - invalidate = 1; + rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("update_snap_trace %llx %p seq %lld new\n", + dout("%s %llx %p seq %lld new\n", __func__, realm->ino, realm, realm->seq); - invalidate = 1; + rebuild_snapcs = 1; } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", + dout("%s %llx %p seq %lld unchanged\n", __func__, realm->ino, realm, realm->seq); } - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, - realm, invalidate, p, e); + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); + + /* + * this will always track the uppest parent realm from which + * we need to rebuild the snapshot contexts _downward_ in + * hierarchy. + */ + if (rebuild_snapcs) + realm_to_rebuild = realm; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate && p >= e) - rebuild_snap_realms(realm, &dirty_realms); + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; @@ -814,7 +882,7 @@ fail: ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); - pr_err("update_snap_trace error %d\n", err); + pr_err("%s error %d\n", __func__, err); return err; } @@ -831,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) struct inode *inode; struct ceph_mds_session *session = NULL; - dout("flush_snaps\n"); + dout("%s\n", __func__); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, @@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); - dout("flush_snaps done\n"); + dout("%s done\n", __func__); } /** @@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, - ceph_snap_op_name(op), split, trace_len); + dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, + mds, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); inc_session_sequence(session); @@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p in newer realm %llx %p\n", - inode, ci->i_snap_realm->ino, + dout(" leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, ci->i_snap_realm); goto skip_inode; } - dout(" will move %p to split realm %llx %p\n", - inode, realm->ino, realm); + dout(" will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); @@ -1038,7 +1106,7 @@ skip_inode: return; bad: - pr_err("corrupt snap message from mds%d\n", mds); + pr_err("%s corrupt snap message from mds%d\n", __func__, mds); ceph_msg_dump(msg); out: if (locked_rwsem) @@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, } spin_unlock(&mdsc->snapid_map_lock); if (exist) { - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } @@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, if (exist) { free_anon_bdev(sm->dev); kfree(sm); - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } - dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + dout("%s create snapid map %llx -> %x\n", __func__, + sm->snap, sm->dev); return sm; } diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 573bb9556fb5..e36e8948e728 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf79f369aec6..e6987d295079 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -802,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); + fsc->write_congested = false; err = -ENOMEM; /* @@ -864,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_snap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -892,6 +894,9 @@ static int __init init_caches(void) ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); if (!ceph_cap_cachep) goto bad_cap; + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + if (!ceph_cap_snap_cachep) + goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (!ceph_cap_flush_cachep) @@ -931,6 +936,8 @@ bad_file: bad_dentry: kmem_cache_destroy(ceph_cap_flush_cachep); bad_cap_flush: + kmem_cache_destroy(ceph_cap_snap_cachep); +bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -947,6 +954,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_snap_cachep); kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 67f145e1ae7a..20ceab74e871 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -17,13 +17,11 @@ #include <linux/posix_acl.h> #include <linux/refcount.h> #include <linux/security.h> +#include <linux/netfs.h> +#include <linux/fscache.h> #include <linux/ceph/libceph.h> -#ifdef CONFIG_CEPH_FSCACHE -#include <linux/fscache.h> -#endif - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ @@ -121,6 +119,7 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; atomic_long_t writeback_count; + bool write_congested; struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; @@ -230,7 +229,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); - kfree(capsnap); + kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } @@ -317,6 +316,11 @@ struct ceph_inode_xattrs_info { * Ceph inode. */ struct ceph_inode_info { + struct { + /* These must be contiguous */ + struct inode vfs_inode; + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -427,11 +431,6 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; - -#ifdef CONFIG_CEPH_FSCACHE - struct fscache_cookie *fscache; -#endif - struct inode vfs_inode; /* at end */ }; static inline struct ceph_inode_info * @@ -883,6 +882,8 @@ struct ceph_snap_realm { struct list_head dirty_item; /* if realm needs new context */ + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; @@ -938,7 +939,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); -extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); @@ -1048,6 +1049,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode) /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); @@ -1212,8 +1214,9 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci, /* addr.c */ extern const struct address_space_operations ceph_aops; +extern const struct netfs_request_ops ceph_netfs_ops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); -extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index fcf7dfdecf96..afec84088471 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_xattr *xattr; - struct ceph_vxattr *vxattr = NULL; + struct ceph_vxattr *vxattr; int req_mask; ssize_t err; + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto handle_non_vxattrs; + /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { @@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, err = -ERANGE; } return err; + } else { + err = ceph_do_getvxattr(inode, name, value, size); + /* this would happen with a new client and old server combo */ + if (err == -EOPNOTSUPP) + err = -ENODATA; + return err; } - +handle_non_vxattrs: req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index ea00e1a91250..9d334816eac0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -94,7 +94,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), - tcon->tidStatus); + tcon->status); if (dev_type == FILE_DEVICE_DISK) seq_puts(m, " type: DISK "); else if (dev_type == FILE_DEVICE_CD_ROM) diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index cdce1609c5c2..180c234c2f46 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -498,7 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, false); + cifs_signal_cifsd_for_reconnect(tcon->ses->server, false); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 082c21478686..2b1a1c029c75 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -210,6 +210,9 @@ cifs_read_super(struct super_block *sb) if (rc) goto out_no_root; /* tune readahead according to rsize if readahead size not set on mount */ + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx); if (cifs_sb->ctx->rasize) sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; else @@ -254,26 +257,33 @@ static void cifs_kill_sb(struct super_block *sb) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct cached_fid *cfid; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; /* * We ned to release all dentries for the cached directories * before we kill the sb. */ if (cifs_sb->root) { + for (node = rb_first(root); node; node = rb_next(node)) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); + if (IS_ERR(tcon)) + continue; + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + + /* finally release root dentry */ dput(cifs_sb->root); cifs_sb->root = NULL; } - tcon = cifs_sb_master_tcon(cifs_sb); - if (tcon) { - cfid = &tcon->crfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { - - dput(cfid->dentry); - cfid->dentry = NULL; - } - mutex_unlock(&cfid->fid_mutex); - } kill_anon_super(sb); cifs_umount(cifs_sb); @@ -354,7 +364,7 @@ static struct inode * cifs_alloc_inode(struct super_block *sb) { struct cifsInodeInfo *cifs_inode; - cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); + cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ @@ -691,14 +701,14 @@ static void cifs_umount_begin(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); spin_lock(&cifs_tcp_ses_lock); - if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { + if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) { /* we have other mounts to same share or we have already tried to force umount this and woken up all waiting network requests, nothing to do */ spin_unlock(&cifs_tcp_ses_lock); return; } else if (tcon->tc_count == 1) - tcon->tidStatus = CifsExiting; + tcon->status = TID_EXITING; spin_unlock(&cifs_tcp_ses_lock); /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ @@ -936,7 +946,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 15a5c5db038b..c0542bdcd06b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,5 +153,5 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ #define SMB3_PRODUCT_BUILD 35 -#define CIFS_VERSION "2.35" +#define CIFS_VERSION "2.36" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 48b343d03430..8de977c359b1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,6 +16,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/utsname.h> +#include <linux/netfs.h> #include "cifs_fs_sb.h" #include "cifsacl.h" #include <crypto/internal/hash.h> @@ -115,10 +116,18 @@ enum statusEnum { CifsInNegotiate, CifsNeedSessSetup, CifsInSessSetup, - CifsNeedTcon, - CifsInTcon, - CifsNeedFilesInvalidate, - CifsInFilesInvalidate +}; + +/* associated with each tree connection to the server */ +enum tid_status_enum { + TID_NEW = 0, + TID_GOOD, + TID_EXITING, + TID_NEED_RECON, + TID_NEED_TCON, + TID_IN_TCON, + TID_NEED_FILES_INVALIDATE, /* currently unused */ + TID_IN_FILES_INVALIDATE }; enum securityEnum { @@ -852,13 +861,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) -/* - * The default wsize is 1M. find_get_pages seems to return a maximum of 256 - * pages in a single call. With PAGE_SIZE == 4k, this means we can fill - * a single wsize request with a single call. - */ #define CIFS_DEFAULT_IOSIZE (1024 * 1024) -#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) /* * Windows only supports a max of 60kb reads and 65535 byte writes. Default to @@ -1038,7 +1041,7 @@ struct cifs_tcon { char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ __u16 Flags; /* optional support bits */ - enum statusEnum tidStatus; + enum tid_status_enum status; atomic_t num_smbs_sent; union { struct { @@ -1402,6 +1405,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ /* @@ -1432,10 +1440,6 @@ struct cifsInodeInfo { u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ -#ifdef CONFIG_CIFS_FSCACHE - struct fscache_cookie *fscache; -#endif - struct inode vfs_inode; struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 68b9a436af4b..aeba371c4c70 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -123,18 +123,6 @@ */ #define CIFS_SESS_KEY_SIZE (16) -/* - * Size of the smb3 signing key - */ -#define SMB3_SIGN_KEY_SIZE (16) - -/* - * Size of the smb3 encryption/decryption key storage. - * This size is big enough to store any cipher key types. - */ -#define SMB3_ENC_DEC_KEY_SIZE (32) - -#define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) #define CIFS_CPHTXT_SIZE (16) @@ -1658,7 +1646,7 @@ struct smb_t2_rsp { #define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 #define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 #define SMB_FIND_FILE_UNIX 0x202 -#define SMB_FIND_FILE_POSIX_INFO 0x064 +/* #define SMB_FIND_FILE_POSIX_INFO 0x064 */ typedef struct smb_com_transaction2_qpi_req { struct smb_hdr hdr; /* wct = 14+ */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d3701295402d..0df3b24a0bf4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -132,6 +132,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); void +cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, + bool all_channels); +void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session); extern int cifs_reconnect(struct TCP_Server_Info *server, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 071e2f21a7db..47e927c4ff8d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -75,12 +75,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || - tcon->tidStatus != CifsNeedReconnect) { + if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) { spin_unlock(&cifs_tcp_ses_lock); return; } - tcon->tidStatus = CifsInFilesInvalidate; + tcon->status = TID_IN_FILES_INVALIDATE; spin_unlock(&cifs_tcp_ses_lock); /* list all files open on tree connection and mark them invalid */ @@ -100,8 +99,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_unlock(&tcon->crfid.fid_mutex); spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInFilesInvalidate) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_FILES_INVALIDATE) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); /* @@ -136,7 +135,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * have tcon) are allowed as we start force umount */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsExiting) { + if (tcon->status == TID_EXITING) { if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { @@ -597,7 +596,7 @@ CIFSSMBNegotiate(const unsigned int xid, set_credits(server, server->maxReq); /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); - /* set up max_read for readpages check */ + /* set up max_read for readahead check */ server->max_read = server->maxBuf; server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 053cb449eb16..42e14f408856 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -163,10 +163,50 @@ static void cifs_resolve_server(struct work_struct *work) } /* + * Update the tcpStatus for the server. + * This is used to signal the cifsd thread to call cifs_reconnect + * ONLY cifsd thread should call cifs_reconnect. For any other + * thread, use this function + * + * @server: the tcp ses for which reconnect is needed + * @all_channels: if this needs to be done for all channels + */ +void +cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, + bool all_channels) +{ + struct TCP_Server_Info *pserver; + struct cifs_ses *ses; + int i; + + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + spin_lock(&cifs_tcp_ses_lock); + if (!all_channels) { + pserver->tcpStatus = CifsNeedReconnect; + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + spin_lock(&ses->chan_lock); + for (i = 0; i < ses->chan_count; i++) + ses->chans[i].server->tcpStatus = CifsNeedReconnect; + spin_unlock(&ses->chan_lock); + } + spin_unlock(&cifs_tcp_ses_lock); +} + +/* * Mark all sessions and tcons for reconnect. + * IMPORTANT: make sure that this gets called only from + * cifsd thread. For any other thread, use + * cifs_signal_cifsd_for_reconnect * + * @server: the tcp ses for which reconnect is needed * @server needs to be previously set to CifsNeedReconnect. - * + * @mark_smb_session: whether even sessions need to be marked */ void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, @@ -205,7 +245,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; - tcon->tidStatus = CifsNeedReconnect; + tcon->status = TID_NEED_RECON; } if (ses->tcon_ipc) ses->tcon_ipc->need_reconnect = true; @@ -413,9 +453,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ return rc; } -static int -reconnect_dfs_server(struct TCP_Server_Info *server, - bool mark_smb_session) +static int reconnect_dfs_server(struct TCP_Server_Info *server) { int rc = 0; const char *refpath = server->current_fullpath + 1; @@ -439,7 +477,12 @@ reconnect_dfs_server(struct TCP_Server_Info *server, if (!cifs_tcp_ses_needs_reconnect(server, num_targets)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + /* + * Unconditionally mark all sessions & tcons for reconnect as we might be connecting to a + * different server or share during failover. It could be improved by adding some logic to + * only do that in case it connects to a different server or share, though. + */ + cifs_mark_tcp_ses_conns_for_reconnect(server, true); cifs_abort_connection(server); @@ -491,13 +534,20 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); - if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { + if (!server->is_dfs_conn) { spin_unlock(&cifs_tcp_ses_lock); return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); - return reconnect_dfs_server(server, mark_smb_session); + mutex_lock(&server->refpath_lock); + if (!server->origin_fullpath || !server->leaf_fullpath) { + mutex_unlock(&server->refpath_lock); + return __cifs_reconnect(server, mark_smb_session); + } + mutex_unlock(&server->refpath_lock); + + return reconnect_dfs_server(server); } #else int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) @@ -1006,7 +1056,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_hdr_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", @@ -2167,7 +2217,7 @@ get_ses_fail: static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { - if (tcon->tidStatus == CifsExiting) + if (tcon->status == TID_EXITING) return 0; if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) return 0; @@ -3473,6 +3523,9 @@ static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path, struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; char *oldmnt = cifs_sb->ctx->mount_options; + cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path, + dfs_cache_get_tgt_name(tit)); + rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref); if (rc) goto out; @@ -3571,13 +3624,18 @@ static int __follow_dfs_link(struct mount_ctx *mnt_ctx) if (rc) goto out; - /* Try all dfs link targets */ + /* Try all dfs link targets. If an I/O fails from currently connected DFS target with an + * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as + * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than + * STATUS_PATH_NOT_COVERED." + */ for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl); tit; tit = dfs_cache_get_next_tgt(&tl, tit)) { rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit); if (!rc) { rc = is_path_remote(mnt_ctx); - break; + if (!rc || rc == -EREMOTE) + break; } } @@ -3624,9 +3682,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx) { struct TCP_Server_Info *server = mnt_ctx->server; + mutex_lock(&server->refpath_lock); server->origin_fullpath = mnt_ctx->origin_fullpath; server->leaf_fullpath = mnt_ctx->leaf_fullpath; server->current_fullpath = mnt_ctx->leaf_fullpath; + mutex_unlock(&server->refpath_lock); mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL; } @@ -3651,7 +3711,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; rc = is_path_remote(&mnt_ctx); - if (rc == -EREMOTE) + if (rc) rc = follow_dfs_link(&mnt_ctx); if (rc) goto error; @@ -3924,7 +3984,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus != CifsNeedSessSetup) { + if ((server->tcpStatus != CifsNeedSessSetup) && + (ses->status == CifsGood)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -4416,7 +4477,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - cifs_reconnect(tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(server, true); } dfs_cache_free_tgts(tl); @@ -4437,12 +4498,12 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + (tcon->status != TID_NEW && + tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - tcon->tidStatus = CifsInTcon; + tcon->status = TID_IN_TCON; spin_unlock(&cifs_tcp_ses_lock); tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); @@ -4483,13 +4544,13 @@ out: if (rc) { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsGood; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_GOOD; spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; } @@ -4505,24 +4566,24 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + (tcon->status != TID_NEW && + tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - tcon->tidStatus = CifsInTcon; + tcon->status = TID_IN_TCON; spin_unlock(&cifs_tcp_ses_lock); rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); if (rc) { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsGood; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_GOOD; spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 831f42458bf6..956f8e5cf3e7 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(tcon->ses->server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ @@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool struct TCP_Server_Info *server = tcon->ses->server; mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); - return 0; } @@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions) list_del_init(&tcon->ulist); mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); cifs_put_tcon(tcon); } } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e7af802dcfa6..d511a78383c3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3740,6 +3740,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, break; } + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tlink_tcon(open_file->tlink), + cifs_sb->ctx); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) @@ -4205,13 +4210,19 @@ cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + /* Wait for the page to be written to the cache before we allow it to + * be modified. We then assume the entire page will need writing back. + */ #ifdef CONFIG_CIFS_FSCACHE if (PageFsCache(page) && wait_on_page_fscache_killable(page) < 0) return VM_FAULT_RETRY; #endif - lock_page(page); + wait_on_page_writeback(page); + + if (lock_page_killable(page) < 0) + return VM_FAULT_RETRY; return VM_FAULT_LOCKED; } @@ -4474,6 +4485,11 @@ static void cifs_readahead(struct readahead_control *ractl) } } + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tlink_tcon(open_file->tlink), + cifs_sb->ctx); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) @@ -4754,17 +4770,17 @@ static int cifs_release_page(struct page *page, gfp_t gfp) return true; } -static void cifs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +static void cifs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - wait_on_page_fscache(page); + folio_wait_fscache(folio); } -static int cifs_launder_page(struct page *page) +static int cifs_launder_folio(struct folio *folio) { int rc = 0; - loff_t range_start = page_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); + loff_t range_start = folio_pos(folio); + loff_t range_end = range_start + folio_size(folio); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, @@ -4772,12 +4788,12 @@ static int cifs_launder_page(struct page *page) .range_end = range_end, }; - cifs_dbg(FYI, "Launder page: %p\n", page); + cifs_dbg(FYI, "Launder page: %lu\n", folio->index); - if (clear_page_dirty_for_io(page)) - rc = cifs_writepage_locked(page, &wbc); + if (folio_clear_dirty_for_io(folio)) + rc = cifs_writepage_locked(&folio->page, &wbc); - wait_on_page_fscache(page); + folio_wait_fscache(folio); return rc; } @@ -4939,12 +4955,13 @@ static void cifs_swap_deactivate(struct file *file) * need to pin the cache object to write back to. */ #ifdef CONFIG_CIFS_FSCACHE -static int cifs_set_page_dirty(struct page *page) +static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) { - return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host)); + return fscache_dirty_folio(mapping, folio, + cifs_inode_cookie(mapping->host)); } #else -#define cifs_set_page_dirty __set_page_dirty_nobuffers +#define cifs_dirty_folio filemap_dirty_folio #endif const struct address_space_operations cifs_addr_ops = { @@ -4954,11 +4971,11 @@ const struct address_space_operations cifs_addr_ops = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = cifs_set_page_dirty, + .dirty_folio = cifs_dirty_folio, .releasepage = cifs_release_page, .direct_IO = cifs_direct_io, - .invalidatepage = cifs_invalidate_page, - .launder_page = cifs_launder_page, + .invalidate_folio = cifs_invalidate_folio, + .launder_folio = cifs_launder_folio, /* * TODO: investigate and if useful we could add an cifs_migratePage * helper (under an CONFIG_MIGRATION) in the future, and also @@ -4979,8 +4996,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = cifs_set_page_dirty, + .dirty_folio = cifs_dirty_folio, .releasepage = cifs_release_page, - .invalidatepage = cifs_invalidate_page, - .launder_page = cifs_launder_page, + .invalidate_folio = cifs_invalidate_folio, + .launder_folio = cifs_launder_folio, }; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 33af72e0ac0c..a638b29e9062 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -103,7 +103,7 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - cifsi->fscache = + cifsi->netfs_ctx.cache = fscache_acquire_cookie(tcon->fscache, 0, &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), @@ -126,22 +126,15 @@ void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) void cifs_fscache_release_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifs_inode_cookie(inode); - if (cifsi->fscache) { - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_relinquish_cookie(cifsi->fscache, false); - cifsi->fscache = NULL; + if (cookie) { + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie); + fscache_relinquish_cookie(cookie, false); + cifsi->netfs_ctx.cache = NULL; } } -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - /* * Fallback page reading interface. */ diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 55129908e2c1..52355c0912ae 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -61,7 +61,7 @@ void cifs_fscache_fill_coherency(struct inode *inode, static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - return CIFS_I(inode)->fscache; + return netfs_i_cookie(inode); } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 60d853c92f6a..2f9e7d2f81b6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode) inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ + /* check if server can support readahead */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read < PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 852e54ee82c2..bbdf3281559c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 56598f7dbe00..afaf59c22193 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -116,7 +116,7 @@ tconInfoAlloc(void) } atomic_inc(&tconInfoAllocCount); - ret_buf->tidStatus = CifsNew; + ret_buf->status = TID_NEW; ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index ebe236b9d9f5..235aa1b395eb 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,7 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - cifs_reconnect(mid->server, false); + cifs_signal_cifsd_for_reconnect(mid->server, false); } } diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 298458404252..55758b9ec877 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -107,7 +107,7 @@ struct negotiate_message { SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ struct ntlmssp_version Version; /* SECURITY_BUFFER */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __packed; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index b2fb7bd11936..c71c9a44bef4 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -228,7 +228,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) spin_unlock(&GlobalMid_Lock); if (reconnect) { - cifs_mark_tcp_ses_conns_for_reconnect(server, false); + cifs_signal_cifsd_for_reconnect(server, false); } return mid; diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 4125fd113cfb..82e916ad167c 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -41,15 +41,4 @@ #define END_OF_CHAIN 4 #define RELATED_REQUEST 8 -#define SMB2_SIGNATURE_SIZE (16) -#define SMB2_NTLMV2_SESSKEY_SIZE (16) -#define SMB2_HMACSHA256_SIZE (32) -#define SMB2_CMACAES_SIZE (16) -#define SMB3_SIGNKEY_SIZE (16) -#define SMB3_GCM128_CRYPTKEY_SIZE (16) -#define SMB3_GCM256_CRYPTKEY_SIZE (32) - -/* Maximum buffer size value we can send with 1 credit */ -#define SMB2_MAX_BUFFER_SIZE 65536 - #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b25623e3fe3d..3fe47a88f47d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -150,16 +150,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; struct cifs_ses *ses = NULL; + struct cifs_ses *iter; /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { - if (ses->Suid == le64_to_cpu(thdr->SessionId)) + list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) { + if (iter->Suid == le64_to_cpu(thdr->SessionId)) { + ses = iter; break; + } } spin_unlock(&cifs_tcp_ses_lock); - if (list_entry_is_head(ses, &srvr->smb_ses_list, - smb_ses_list)) { + if (!ses) { cifs_dbg(VFS, "no decryption - session id not found\n"); return 1; } @@ -203,7 +205,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 || - pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { + pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { /* error packets have 9 byte structure size */ cifs_dbg(VFS, "Invalid response size %u for command %d\n", le16_to_cpu(pdu->StructureSize2), command); @@ -303,7 +305,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) /* error responses do not have data area */ if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)shdr)->StructureSize) == - SMB2_ERROR_STRUCTURE_SIZE2) + SMB2_ERROR_STRUCTURE_SIZE2_LE) return NULL; /* @@ -478,11 +480,11 @@ smb2_get_lease_state(struct cifsInodeInfo *cinode) __le32 lease = 0; if (CIFS_CACHE_WRITE(cinode)) - lease |= SMB2_LEASE_WRITE_CACHING; + lease |= SMB2_LEASE_WRITE_CACHING_LE; if (CIFS_CACHE_HANDLE(cinode)) - lease |= SMB2_LEASE_HANDLE_CACHING; + lease |= SMB2_LEASE_HANDLE_CACHING_LE; if (CIFS_CACHE_READ(cinode)) - lease |= SMB2_LEASE_READ_CACHING; + lease |= SMB2_LEASE_READ_CACHING_LE; return lease; } @@ -832,8 +834,8 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve rc = __smb2_handle_cancelled_cmd(tcon, le16_to_cpu(hdr->Command), le64_to_cpu(hdr->MessageId), - le64_to_cpu(rsp->PersistentFileId), - le64_to_cpu(rsp->VolatileFileId)); + rsp->PersistentFileId, + rsp->VolatileFileId); if (rc) cifs_put_tcon(tcon); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index af5d0830bc8a..d6aaeff4a30a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -25,6 +25,7 @@ #include "smb2glob.h" #include "cifs_ioctl.h" #include "smbdirect.h" +#include "fscache.h" #include "fs_context.h" /* Change credits for different ops and return the total number of credits */ @@ -85,6 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ pr_warn_once("server overflowed SMB3 credits\n"); + trace_smb3_overflow_credits(server->CurrentMid, + server->conn_id, server->hostname, *val, + add, server->in_flight); } server->in_flight--; if (server->in_flight == 0 && @@ -250,7 +254,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_wait_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -299,7 +303,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_adj_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", @@ -896,8 +900,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, atomic_inc(&tcon->num_remote_opens); o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId); - oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId); + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; #ifdef CONFIG_CIFS_DEBUG2 oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ @@ -1191,17 +1195,12 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) { int rc; - __le16 *utf16_path; struct kvec rsp_iov = {NULL, 0}; int buftype = CIFS_NO_BUFFER; struct smb2_query_info_rsp *rsp; struct smb2_file_full_ea_info *info = NULL; - utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - rc = smb2_query_info_compound(xid, tcon, utf16_path, + rc = smb2_query_info_compound(xid, tcon, path, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, @@ -1234,7 +1233,6 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, le32_to_cpu(rsp->OutputBufferLength), ea_name); qeas_exit: - kfree(utf16_path); free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -1294,7 +1292,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, * the new EA. If not we should not add it since we * would not be able to even read the EAs back. */ - rc = smb2_query_info_compound(xid, tcon, utf16_path, + rc = smb2_query_info_compound(xid, tcon, path, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, @@ -1642,6 +1640,7 @@ smb2_ioctl_query_info(const unsigned int xid, unsigned int size[2]; void *data[2]; int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR; + void (*free_req1_func)(struct smb_rqst *r); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1651,27 +1650,29 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) - goto e_fault; - + if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) { + rc = -EFAULT; + goto free_vars; + } if (qi.output_buffer_length > 1024) { - kfree(vars); - return -EINVAL; + rc = -EINVAL; + goto free_vars; } if (!ses || !server) { - kfree(vars); - return -EIO; + rc = -EIO; + goto free_vars; } if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - buffer = memdup_user(arg + sizeof(struct smb_query_info), - qi.output_buffer_length); - if (IS_ERR(buffer)) { - kfree(vars); - return PTR_ERR(buffer); + if (qi.output_buffer_length) { + buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); + if (IS_ERR(buffer)) { + rc = PTR_ERR(buffer); + goto free_vars; + } } /* Open */ @@ -1709,45 +1710,45 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, path); if (rc) - goto iqinf_exit; + goto free_output_buffer; smb2_set_next_command(tcon, &rqst[0]); /* Query */ if (qi.flags & PASSTHRU_FSCTL) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else { - rqst[1].rq_iov = &vars->io_iov[0]; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - qi.info_type, true, buffer, - qi.output_buffer_length, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); + goto free_open_req; } + rqst[1].rq_iov = &vars->io_iov[0]; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, buffer, qi.output_buffer_length, + CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + free_req1_func = SMB2_ioctl_free; } else if (qi.flags == PASSTHRU_SET_INFO) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else { - rqst[1].rq_iov = &vars->si_iov[0]; - rqst[1].rq_nvec = 1; - - size[0] = 8; - data[0] = buffer; - - rc = SMB2_set_info_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); + goto free_open_req; + } + if (qi.output_buffer_length < 8) { + rc = -EINVAL; + goto free_open_req; } + rqst[1].rq_iov = &vars->si_iov[0]; + rqst[1].rq_nvec = 1; + + /* MS-FSCC 2.4.13 FileEndOfFileInformation */ + size[0] = 8; + data[0] = buffer; + + rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + free_req1_func = SMB2_set_info_free; } else if (qi.flags == PASSTHRU_QUERY_INFO) { rqst[1].rq_iov = &vars->qi_iov[0]; rqst[1].rq_nvec = 1; @@ -1758,6 +1759,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + free_req1_func = SMB2_query_info_free; } else { /* unknown flags */ cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n", qi.flags); @@ -1765,7 +1767,7 @@ smb2_ioctl_query_info(const unsigned int xid, } if (rc) - goto iqinf_exit; + goto free_open_req; smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); @@ -1776,14 +1778,14 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) - goto iqinf_exit; + goto free_req_1; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); if (rc) - goto iqinf_exit; + goto out; /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { @@ -1793,18 +1795,22 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); if (qi.input_buffer_length > 0 && le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length - > rsp_iov[1].iov_len) - goto e_fault; + > rsp_iov[1].iov_len) { + rc = -EFAULT; + goto out; + } if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info), (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset), qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } else { pqi = (struct smb_query_info __user *)arg; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; @@ -1812,28 +1818,30 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } - iqinf_exit: - cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); +out: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - kfree(vars); + SMB2_close_free(&rqst[2]); +free_req_1: + free_req1_func(&rqst[1]); +free_open_req: + SMB2_open_free(&rqst[0]); +free_output_buffer: kfree(buffer); +free_vars: + kfree(vars); return rc; - -e_fault: - rc = -EFAULT; - goto iqinf_exit; } static ssize_t @@ -1850,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; + struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + /* + * We need to flush all unwritten data before we can send the + * copychunk ioctl to the server. + */ + inode = d_inode(trgtfile->dentry); + filemap_write_and_wait(inode->i_mapping); + if (pcchunk == NULL) return -ENOMEM; @@ -2406,8 +2422,8 @@ again: cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } - fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId); - fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId); + fid->persistent_fid = op_rsp->PersistentFileId; + fid->volatile_fid = op_rsp->VolatileFileId; /* Anything else than ENODATA means a genuine error */ if (rc && rc != -ENODATA) { @@ -2487,7 +2503,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_pend_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", @@ -2645,7 +2661,7 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) */ int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, - __le16 *utf16_path, u32 desired_access, + const char *path, u32 desired_access, u32 class, u32 type, u32 output_len, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb) @@ -2663,6 +2679,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; int rc; + __le16 *utf16_path; + struct cached_fid *cfid = NULL; + + if (!path) + path = ""; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -2671,6 +2695,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); + rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid); + memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; @@ -2692,15 +2718,29 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = qi_iov; rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, server, - &rqst[1], COMPOUND_FID, COMPOUND_FID, - class, type, 0, - output_len, 0, - NULL); + if (cfid) { + rc = SMB2_query_info_init(tcon, server, + &rqst[1], + cfid->fid->persistent_fid, + cfid->fid->volatile_fid, + class, type, 0, + output_len, 0, + NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[1], + COMPOUND_FID, + COMPOUND_FID, + class, type, 0, + output_len, 0, + NULL); + } if (rc) goto qic_exit; - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); + if (!cfid) { + smb2_set_next_command(tcon, &rqst[1]); + smb2_set_related(&rqst[1]); + } memset(&close_iov, 0, sizeof(close_iov)); rqst[2].rq_iov = close_iov; @@ -2712,9 +2752,15 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); + if (cfid) { + rc = compound_send_recv(xid, ses, server, + flags, 1, &rqst[1], + &resp_buftype[1], &rsp_iov[1]); + } else { + rc = compound_send_recv(xid, ses, server, + flags, 3, rqst, + resp_buftype, rsp_iov); + } if (rc) { free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); if (rc == -EREMCHG) { @@ -2728,11 +2774,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, *buftype = resp_buftype[1]; qic_exit: + kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + if (cfid) + close_cached_dir(cfid); return rc; } @@ -2742,13 +2791,12 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_query_info_rsp *rsp; struct smb2_fs_full_size_info *info = NULL; - __le16 utf16_path = 0; /* Null - open root of share */ struct kvec rsp_iov = {NULL, 0}; int buftype = CIFS_NO_BUFFER; int rc; - rc = smb2_query_info_compound(xid, tcon, &utf16_path, + rc = smb2_query_info_compound(xid, tcon, "", FILE_READ_ATTRIBUTES, FS_FULL_SIZE_INFORMATION, SMB2_O_INFO_FILESYSTEM, @@ -3887,29 +3935,38 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, { int rc; unsigned int xid; + struct inode *inode; struct cifsFileInfo *cfile = file->private_data; + struct cifsInodeInfo *cifsi; __le64 eof; xid = get_xid(); - if (off >= i_size_read(file->f_inode) || - off + len >= i_size_read(file->f_inode)) { + inode = d_inode(cfile->dentry); + cifsi = CIFS_I(inode); + + if (off >= i_size_read(inode) || + off + len >= i_size_read(inode)) { rc = -EINVAL; goto out; } rc = smb2_copychunk_range(xid, cfile, cfile, off + len, - i_size_read(file->f_inode) - off - len, off); + i_size_read(inode) - off - len, off); if (rc < 0) goto out; - eof = cpu_to_le64(i_size_read(file->f_inode) - len); + eof = cpu_to_le64(i_size_read(inode) - len); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof); if (rc < 0) goto out; rc = 0; + + cifsi->server_eof = i_size_read(inode) - len; + truncate_setsize(inode, cifsi->server_eof); + fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); out: free_xid(xid); return rc; @@ -4283,12 +4340,12 @@ static __le32 map_oplock_to_lease(u8 oplock) { if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) - return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + return SMB2_LEASE_WRITE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_II) - return SMB2_LEASE_READ_CACHING; + return SMB2_LEASE_READ_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) - return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING | - SMB2_LEASE_WRITE_CACHING; + return SMB2_LEASE_HANDLE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_WRITE_CACHING_LE; return 0; } @@ -4350,7 +4407,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) struct create_lease *lc = (struct create_lease *)buf; *epoch = 0; /* not used */ - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE) return SMB2_OPLOCK_LEVEL_NOCHANGE; return le32_to_cpu(lc->lcontext.LeaseState); } @@ -4361,7 +4418,7 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; *epoch = le16_to_cpu(lc->lcontext.Epoch); - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE) return SMB2_OPLOCK_LEVEL_NOCHANGE; if (lease_key) memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); @@ -5804,8 +5861,8 @@ struct smb_version_values smb20_values = { .protocol_id = SMB20_PROT_ID, .req_capabilities = 0, /* MBZ */ .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5825,8 +5882,8 @@ struct smb_version_values smb21_values = { .protocol_id = SMB21_PROT_ID, .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */ .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5846,8 +5903,8 @@ struct smb_version_values smb3any_values = { .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5867,8 +5924,8 @@ struct smb_version_values smbdefault_values = { .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5888,8 +5945,8 @@ struct smb_version_values smb30_values = { .protocol_id = SMB30_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5909,8 +5966,8 @@ struct smb_version_values smb302_values = { .protocol_id = SMB302_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5930,8 +5987,8 @@ struct smb_version_values smb311_values = { .protocol_id = SMB311_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7e7909b1ae11..1b7ad0c09566 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -163,7 +163,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, return 0; spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsExiting) { + if (tcon->status == TID_EXITING) { /* * only tree disconnect, open, and write, * (and ulogoff which does not have tcon) @@ -2734,13 +2734,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, goto err_free_req; } - trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId), - tcon->tid, - ses->Suid, CREATE_NOT_FILE, - FILE_WRITE_ATTRIBUTES); + trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid, + CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); - SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId), - le64_to_cpu(rsp->VolatileFileId)); + SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); /* Eventually save off posix specific response info and timestaps */ @@ -3009,14 +3006,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } else if (rsp == NULL) /* unlikely to happen, but safer to check */ goto creat_exit; else - trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId), - tcon->tid, - ses->Suid, oparms->create_options, - oparms->desired_access); + trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid, + oparms->create_options, oparms->desired_access); atomic_inc(&tcon->num_remote_opens); - oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId); - oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId); + oparms->fid->persistent_fid = rsp->PersistentFileId; + oparms->fid->volatile_fid = rsp->VolatileFileId; oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); @@ -3313,8 +3308,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; if (query_attrs) req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; else @@ -3677,8 +3672,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); @@ -3858,12 +3853,14 @@ void smb2_reconnect_server(struct work_struct *work) tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); if (!tcon) { resched = true; - list_del_init(&ses->rlist); - cifs_put_smb_ses(ses); + list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + } goto done; } - tcon->tidStatus = CifsGood; + tcon->status = TID_GOOD; tcon->retry = false; tcon->need_reconnect = false; @@ -3951,8 +3948,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -4033,8 +4030,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, shdr = &req->hdr; shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); - req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->ReadChannelInfoOffset = 0; /* reserved */ req->ReadChannelInfoLength = 0; /* reserved */ req->Channel = 0; /* reserved */ @@ -4094,8 +4091,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, */ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); - req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); - req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + req->PersistentFileId = (u64)-1; + req->VolatileFileId = (u64)-1; } } if (remaining_bytes > io_parms->length) @@ -4307,21 +4304,19 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); trace_smb3_read_err(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); } else - trace_smb3_read_done(xid, - le64_to_cpu(req->PersistentFileId), - io_parms->tcon->tid, ses->Suid, - io_parms->offset, 0); + trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, + ses->Suid, io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); @@ -4463,8 +4458,8 @@ smb2_async_writev(struct cifs_writedata *wdata, shdr = (struct smb2_hdr *)req; shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); - req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid); - req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid); + req->PersistentFileId = wdata->cfile->fid.persistent_fid; + req->VolatileFileId = wdata->cfile->fid.volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4562,7 +4557,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (rc) { trace_smb3_write_err(0 /* no xid */, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, rc); kref_put(&wdata->refcount, release); @@ -4615,8 +4610,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); - req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4645,7 +4640,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { trace_smb3_write_err(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, io_parms->length, rc); @@ -4654,7 +4649,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, } else { *nbytes = le32_to_cpu(rsp->DataLength); trace_smb3_write_done(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, *nbytes); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 33cfd0a1adf1..d8c4388b190d 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,16 +56,6 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL -#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) - -struct smb2_err_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; - __le16 Reserved; /* MBZ */ - __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ -} __packed; - #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { @@ -139,47 +129,6 @@ struct share_redirect_error_context_rsp { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -#define SMB2_LEASE_NONE cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) - -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002) -#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004) - -#define SMB2_LEASE_KEY_SIZE 16 - -struct lease_context { - u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; -} __packed; - -struct lease_context_v2 { - u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; - __le64 ParentLeaseKeyLow; - __le64 ParentLeaseKeyHigh; - __le16 Epoch; - __le16 Reserved; -} __packed; - -struct create_lease { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context lcontext; -} __packed; - -struct create_lease_v2 { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context_v2 lcontext; - __u8 Pad[4]; -} __packed; - struct create_durable { struct create_context ccontext; __u8 Name[8]; @@ -192,13 +141,6 @@ struct create_durable { } Data; } __packed; -struct create_posix { - struct create_context ccontext; - __u8 Name[16]; - __le32 Mode; - __u32 Reserved; -} __packed; - /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ #define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002 @@ -287,12 +229,6 @@ struct copychunk_ioctl { __u32 Reserved2; } __packed; -/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ -struct file_zero_data_information { - __le64 FileOffset; - __le64 BeyondFinalZero; -} __packed; - struct copychunk_ioctl_rsp { __le32 ChunksWritten; __le32 ChunkBytesWritten; @@ -338,11 +274,6 @@ struct fsctl_get_integrity_information_rsp { __le32 ClusterSizeInBytes; } __packed; -struct file_allocated_range_buffer { - __le64 file_offset; - __le64 length; -} __packed; - /* Integrity ChecksumAlgorithm choices for above */ #define CHECKSUM_TYPE_NONE 0x0000 #define CHECKSUM_TYPE_CRC64 0x0002 @@ -351,53 +282,6 @@ struct file_allocated_range_buffer { /* Integrity flags for above */ #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 -/* Reparse structures - see MS-FSCC 2.1.2 */ - -/* struct fsctl_reparse_info_req is empty, only response structs (see below) */ - -struct reparse_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -struct reparse_guid_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 ReparseGuid[16]; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -struct reparse_mount_point_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le16 SubstituteNameOffset; - __le16 SubstituteNameLength; - __le16 PrintNameOffset; - __le16 PrintNameLength; - __u8 PathBuffer[]; /* Variable Length */ -} __packed; - -#define SYMLINK_FLAG_RELATIVE 0x00000001 - -struct reparse_symlink_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le16 SubstituteNameOffset; - __le16 SubstituteNameLength; - __le16 PrintNameOffset; - __le16 PrintNameLength; - __le32 Flags; - __u8 PathBuffer[]; /* Variable Length */ -} __packed; - -/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ - - /* See MS-DFSC 2.2.2 */ struct fsctl_get_dfs_referral_req { __le16 MaxReferralLevel; @@ -413,22 +297,6 @@ struct network_resiliency_req { } __packed; /* There is no buffer for the response ie no struct network_resiliency_rsp */ - -struct validate_negotiate_info_req { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 DialectCount; - __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ -} __packed; - -struct validate_negotiate_info_rsp { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 Dialect; /* Dialect in use for the connection */ -} __packed; - #define RSS_CAPABLE cpu_to_le32(0x00000001) #define RDMA_CAPABLE cpu_to_le32(0x00000002) @@ -464,14 +332,6 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 ByteCount; /* Bytes to be copied */ -} __packed; - /* * Maximum number of iovs we need for an ioctl request. * [0] : struct smb2_ioctl_req @@ -479,370 +339,11 @@ struct duplicate_extents_to_file { */ #define SMB2_IOCTL_IOV_SIZE 2 -struct smb2_ioctl_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __u16 Reserved; - __le32 CtlCode; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 InputOffset; - __le32 InputCount; - __le32 MaxInputResponse; - __le32 OutputOffset; - __le32 OutputCount; - __le32 MaxOutputResponse; - __le32 Flags; - __u32 Reserved2; - __u8 Buffer[]; -} __packed; - -struct smb2_ioctl_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __u16 Reserved; - __le32 CtlCode; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 InputOffset; - __le32 InputCount; - __le32 OutputOffset; - __le32 OutputCount; - __le32 Flags; - __u32 Reserved2; - /* char * buffer[] */ -} __packed; - -#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 -#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 -#define SMB2_LOCKFLAG_UNLOCK 0x0004 -#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 - -struct smb2_lock_element { - __le64 Offset; - __le64 Length; - __le32 Flags; - __le32 Reserved; -} __packed; - -struct smb2_lock_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 48 */ - __le16 LockCount; - /* - * The least significant four bits are the index, the other 28 bits are - * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 - */ - __le32 LockSequenceNumber; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - /* Followed by at least one */ - struct smb2_lock_element locks[1]; -} __packed; - -struct smb2_lock_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_echo_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -struct smb2_echo_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -/* search (query_directory) Flags field */ -#define SMB2_RESTART_SCANS 0x01 -#define SMB2_RETURN_SINGLE_ENTRY 0x02 -#define SMB2_INDEX_SPECIFIED 0x04 -#define SMB2_REOPEN 0x10 - -#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 - -/* - * Valid FileInformation classes. - * - * Note that these are a subset of the (file) QUERY_INFO levels defined - * later in this file (but since QUERY_DIRECTORY uses equivalent numbers - * we do not redefine them here) - * - * FileDirectoryInfomation 0x01 - * FileFullDirectoryInformation 0x02 - * FileIdFullDirectoryInformation 0x26 - * FileBothDirectoryInformation 0x03 - * FileIdBothDirectoryInformation 0x25 - * FileNamesInformation 0x0C - * FileIdExtdDirectoryInformation 0x3C - */ - -struct smb2_query_directory_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 FileInformationClass; - __u8 Flags; - __le32 FileIndex; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le16 FileNameOffset; - __le16 FileNameLength; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_directory_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* Possible InfoType values */ -#define SMB2_O_INFO_FILE 0x01 -#define SMB2_O_INFO_FILESYSTEM 0x02 -#define SMB2_O_INFO_SECURITY 0x03 -#define SMB2_O_INFO_QUOTA 0x04 - -/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ -#define OWNER_SECINFO 0x00000001 -#define GROUP_SECINFO 0x00000002 -#define DACL_SECINFO 0x00000004 -#define SACL_SECINFO 0x00000008 -#define LABEL_SECINFO 0x00000010 -#define ATTRIBUTE_SECINFO 0x00000020 -#define SCOPE_SECINFO 0x00000040 -#define BACKUP_SECINFO 0x00010000 -#define UNPROTECTED_SACL_SECINFO 0x10000000 -#define UNPROTECTED_DACL_SECINFO 0x20000000 -#define PROTECTED_SACL_SECINFO 0x40000000 -#define PROTECTED_DACL_SECINFO 0x80000000 - -/* Flags used for FileFullEAinfo */ -#define SL_RESTART_SCAN 0x00000001 -#define SL_RETURN_SINGLE_ENTRY 0x00000002 -#define SL_INDEX_SPECIFIED 0x00000004 - -struct smb2_query_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 41 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 OutputBufferLength; - __le16 InputBufferOffset; - __u16 Reserved; - __le32 InputBufferLength; - __le32 AdditionalInformation; - __le32 Flags; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __u8 Buffer[1]; -} __packed; - -struct smb2_query_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* - * Maximum number of iovs we need for a set-info request. - * The largest one is rename/hardlink - * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info - * [1] : path - * [2] : compound padding - */ -#define SMB2_SET_INFO_IOV_SIZE 3 - -struct smb2_set_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 BufferLength; - __le16 BufferOffset; - __u16 Reserved; - __le32 AdditionalInformation; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 2 */ -} __packed; - -struct smb2_oplock_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __u64 PersistentFid; - __u64 VolatileFid; -} __packed; - -#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) - -struct smb2_lease_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 44 */ - __le16 Epoch; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 CurrentLeaseState; - __le32 NewLeaseState; - __le32 BreakReason; - __le32 AccessMaskHint; - __le32 ShareMaskHint; -} __packed; - -struct smb2_lease_ack { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 Reserved; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 LeaseState; - __le64 LeaseDuration; -} __packed; - /* - * PDU infolevel structure definitions + * PDU query infolevel structure definitions * BB consider moving to a different header */ -/* File System Information Classes */ -#define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Local only */ -#define FS_SIZE_INFORMATION 3 /* Query */ -#define FS_DEVICE_INFORMATION 4 /* Query */ -#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ -#define FS_CONTROL_INFORMATION 6 /* Query, Set */ -#define FS_FULL_SIZE_INFORMATION 7 /* Query */ -#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */ -#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */ -#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ -#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */ - -struct smb2_fs_full_size_info { - __le64 TotalAllocationUnits; - __le64 CallerAvailableAllocationUnits; - __le64 ActualAvailableAllocationUnits; - __le32 SectorsPerAllocationUnit; - __le32 BytesPerSector; -} __packed; - -#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 -#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 -#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 -#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 - -/* sector size info struct */ -struct smb3_fs_ss_info { - __le32 LogicalBytesPerSector; - __le32 PhysicalBytesPerSectorForAtomicity; - __le32 PhysicalBytesPerSectorForPerf; - __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity; - __le32 Flags; - __le32 ByteOffsetForSectorAlignment; - __le32 ByteOffsetForPartitionAlignment; -} __packed; - -/* volume info struct - see MS-FSCC 2.5.9 */ -#define MAX_VOL_LABEL_LEN 32 -struct smb3_fs_vol_info { - __le64 VolumeCreationTime; - __u32 VolumeSerialNumber; - __le32 VolumeLabelLength; /* includes trailing null */ - __u8 SupportsObjects; /* True if eg like NTFS, supports objects */ - __u8 Reserved; - __u8 VolumeLabel[]; /* variable len */ -} __packed; - -/* partial list of QUERY INFO levels */ -#define FILE_DIRECTORY_INFORMATION 1 -#define FILE_FULL_DIRECTORY_INFORMATION 2 -#define FILE_BOTH_DIRECTORY_INFORMATION 3 -#define FILE_BASIC_INFORMATION 4 -#define FILE_STANDARD_INFORMATION 5 -#define FILE_INTERNAL_INFORMATION 6 -#define FILE_EA_INFORMATION 7 -#define FILE_ACCESS_INFORMATION 8 -#define FILE_NAME_INFORMATION 9 -#define FILE_RENAME_INFORMATION 10 -#define FILE_LINK_INFORMATION 11 -#define FILE_NAMES_INFORMATION 12 -#define FILE_DISPOSITION_INFORMATION 13 -#define FILE_POSITION_INFORMATION 14 -#define FILE_FULL_EA_INFORMATION 15 -#define FILE_MODE_INFORMATION 16 -#define FILE_ALIGNMENT_INFORMATION 17 -#define FILE_ALL_INFORMATION 18 -#define FILE_ALLOCATION_INFORMATION 19 -#define FILE_END_OF_FILE_INFORMATION 20 -#define FILE_ALTERNATE_NAME_INFORMATION 21 -#define FILE_STREAM_INFORMATION 22 -#define FILE_PIPE_INFORMATION 23 -#define FILE_PIPE_LOCAL_INFORMATION 24 -#define FILE_PIPE_REMOTE_INFORMATION 25 -#define FILE_MAILSLOT_QUERY_INFORMATION 26 -#define FILE_MAILSLOT_SET_INFORMATION 27 -#define FILE_COMPRESSION_INFORMATION 28 -#define FILE_OBJECT_ID_INFORMATION 29 -/* Number 30 not defined in documents */ -#define FILE_MOVE_CLUSTER_INFORMATION 31 -#define FILE_QUOTA_INFORMATION 32 -#define FILE_REPARSE_POINT_INFORMATION 33 -#define FILE_NETWORK_OPEN_INFORMATION 34 -#define FILE_ATTRIBUTE_TAG_INFORMATION 35 -#define FILE_TRACKING_INFORMATION 36 -#define FILEID_BOTH_DIRECTORY_INFORMATION 37 -#define FILEID_FULL_DIRECTORY_INFORMATION 38 -#define FILE_VALID_DATA_LENGTH_INFORMATION 39 -#define FILE_SHORT_NAME_INFORMATION 40 -#define FILE_SFIO_RESERVE_INFORMATION 44 -#define FILE_SFIO_VOLUME_INFORMATION 45 -#define FILE_HARD_LINK_INFORMATION 46 -#define FILE_NORMALIZED_NAME_INFORMATION 48 -#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 -#define FILE_STANDARD_LINK_INFORMATION 54 -#define FILE_ID_INFORMATION 59 -#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 - -struct smb2_file_internal_info { - __le64 IndexNumber; -} __packed; /* level 6 Query */ - -struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[]; /* New name to be assigned */ - /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ -} __packed; /* level 10 Set */ - -struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[]; /* Name to be assigned to new link */ -} __packed; /* level 11 Set */ - struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; __u8 flags; @@ -851,38 +352,6 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */ char ea_data[]; /* \0 terminated name plus value */ } __packed; /* level 15 Set */ -/* - * This level 18, although with struct with same name is different from cifs - * level 0x107. Level 0x107 has an extra u64 between AccessFlags and - * CurrentByteOffset. - */ -struct smb2_file_all_info { /* data block encoding of response to level 18 */ - __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 Attributes; - __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; /* size ie offset to first free byte in file */ - __le32 NumberOfLinks; /* hard links */ - __u8 DeletePending; - __u8 Directory; - __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ - __le64 IndexNumber; - __le32 EASize; - __le32 AccessFlags; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; - __le32 FileNameLength; - char FileName[1]; -} __packed; /* level 18 Query */ - -struct smb2_file_eof_info { /* encoding of request for level 10 */ - __le64 EndOfFile; /* new end of file value */ -} __packed; /* level 20 Set */ - struct smb2_file_reparse_point_info { __le64 IndexNumber; __le32 Tag; @@ -935,6 +404,8 @@ struct create_posix_rsp { struct cifs_sid group; /* var-sized on the wire */ } __packed; +#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 + /* * SMB2-only POSIX info level for query dir * @@ -966,31 +437,6 @@ struct smb2_posix_info { */ } __packed; -/* Level 100 query info */ -struct smb311_posix_qinfo { - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 EndOfFile; - __le64 AllocationSize; - __le32 DosAttributes; - __le64 Inode; - __le32 DeviceId; - __le32 Zero; - /* beginning of POSIX Create Context Response */ - __le32 HardLinks; - __le32 ReparseTag; - __le32 Mode; - u8 Sids[]; - /* - * var sized owner SID - * var sized group SID - * le32 filenamelength - * u8 filename[] - */ -} __packed; - /* * Parsed version of the above struct. Allows direct access to the * variable length fields diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 4a7062fd1c26..a69f1eed1cfe 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -283,7 +283,7 @@ extern int smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, - __le16 *utf16_path, u32 desired_access, + const char *path, u32 desired_access, u32 class, u32 type, u32 output_len, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6cecf302dcfd..bc279616c513 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -1006,6 +1006,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout); DEFINE_SMB3_CREDIT_EVENT(insufficient_credits); DEFINE_SMB3_CREDIT_EVENT(too_many_credits); DEFINE_SMB3_CREDIT_EVENT(add_credits); +DEFINE_SMB3_CREDIT_EVENT(adj_credits); +DEFINE_SMB3_CREDIT_EVENT(hdr_credits); +DEFINE_SMB3_CREDIT_EVENT(nblk_credits); +DEFINE_SMB3_CREDIT_EVENT(pend_credits); +DEFINE_SMB3_CREDIT_EVENT(wait_credits); +DEFINE_SMB3_CREDIT_EVENT(waitff_credits); +DEFINE_SMB3_CREDIT_EVENT(overflow_credits); DEFINE_SMB3_CREDIT_EVENT(set_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index a4c3e027cca2..c667e6ddfe2f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -430,7 +430,7 @@ unmask: * be taken as the remainder of this one. We need to kill the * socket so the server throws away the partial SMB */ - cifs_mark_tcp_ses_conns_for_reconnect(server, false); + cifs_signal_cifsd_for_reconnect(server, false); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); } @@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS); + tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); if (!tr_hdr) return -ENOMEM; memset(&cur_rqst[0], 0, sizeof(cur_rqst)); memset(&iov, 0, sizeof(iov)); - memset(tr_hdr, 0, sizeof(*tr_hdr)); iov.iov_base = tr_hdr; iov.iov_len = sizeof(*tr_hdr); @@ -542,7 +541,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_nblk_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -1, in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, 1, scredits); @@ -648,7 +647,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_waitff_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", diff --git a/fs/coda/file.c b/fs/coda/file.c index 29dd87be2fb8..3f3c81e6b1ab 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -14,6 +14,7 @@ #include <linux/time.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/stat.h> #include <linux/cred.h> #include <linux/errno.h> diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d9f1bd7153df..2185328b65c7 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep; static struct inode *coda_alloc_inode(struct super_block *sb) { struct coda_inode_info *ei; - ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 95e72d271b95..8f0af4f62631 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -135,6 +135,8 @@ #define elf_format compat_elf_format #define init_elf_binfmt init_compat_elf_binfmt #define exit_elf_binfmt exit_compat_elf_binfmt +#define binfmt_elf_test_cases compat_binfmt_elf_test_cases +#define binfmt_elf_test_suite compat_binfmt_elf_test_suite /* * We share all the actual code with the native (64-bit) version. diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d3cd2a94d1e8..d1f9d2632202 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -34,6 +34,14 @@ */ DEFINE_SPINLOCK(configfs_dirent_lock); +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + * But parent configfs_subsystem is NULL when config_item is root. + * Use this mutex when config_item is root. + */ +static DEFINE_MUTEX(configfs_subsystem_mutex); + static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { @@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) group->cg_item.ci_name = group->cg_item.ci_namebuf; sd = root->d_fsdata; + mutex_lock(&configfs_subsystem_mutex); link_group(to_config_group(sd->s_element), group); + mutex_unlock(&configfs_subsystem_mutex); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); @@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) inode_unlock(d_inode(root)); if (err) { + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } put_fragment(frag); @@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) dput(dentry); + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } diff --git a/fs/coredump.c b/fs/coredump.c index 1c060c0a2d72..ebc43f960b64 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -31,7 +31,6 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> -#include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> @@ -42,6 +41,7 @@ #include <linux/path.h> #include <linux/timekeeping.h> #include <linux/sysctl.h> +#include <linux/elf.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -53,6 +53,9 @@ #include <trace/events/sched.h> +static bool dump_vma_snapshot(struct coredump_params *cprm); +static void free_vma_snapshot(struct coredump_params *cprm); + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -531,6 +534,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) * by any locks. */ .mm_flags = mm->flags, + .vma_meta = NULL, }; audit_core_dumps(siginfo->si_signo); @@ -745,6 +749,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) pr_info("Core dump to |%s disabled\n", cn.corename); goto close_fail; } + if (!dump_vma_snapshot(&cprm)) + goto close_fail; + file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); /* @@ -758,6 +765,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); + free_vma_snapshot(&cprm); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -980,6 +988,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1039,9 +1049,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. */ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1078,18 +1099,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void free_vma_snapshot(struct coredump_params *cprm) +{ + if (cprm->vma_meta) { + int i; + for (i = 0; i < cprm->vma_count; i++) { + struct file *file = cprm->vma_meta[i].file; + if (file) + fput(file); + } + kvfree(cprm->vma_meta); + cprm->vma_meta = NULL; + } +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. */ -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr) +static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *vma, *gate_vma; struct mm_struct *mm = current->mm; int i; - size_t vma_data_size = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1097,36 +1129,51 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, * mmap_lock in read mode. */ if (mmap_write_lock_killable(mm)) - return -EINTR; + return false; + cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - *vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); - *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); - if (!*vma_meta) { + cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); + if (!cprm->vma_meta) { mmap_write_unlock(mm); - return -ENOMEM; + return false; } for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma), i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); + m->pgoff = vma->vm_pgoff; - vma_data_size += m->dump_size; + m->file = vma->vm_file; + if (m->file) + get_file(m->file); } mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) { - kvfree(*vma_meta); - return -EFAULT; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = cprm->vma_meta + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + cprm->vma_data_size += m->dump_size; } - *vma_data_size_ptr = vma_data_size; - return 0; + return true; } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index bfc2a5b74ed3..2217fe5ece6f 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, + GFP_NOFS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (num_pages == 0) { fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - SECTOR_SHIFT); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); if (WARN_ON(ret != bytes_this_page)) { @@ -81,7 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, err = submit_bio_wait(bio); if (err) goto out; - bio_reset(bio); + bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); num_pages = 0; } } @@ -150,12 +149,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EINVAL; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, nr_pages); + bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - 9); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); i = 0; offset = 0; @@ -182,7 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = submit_bio_wait(bio); if (err) goto out; - bio_reset(bio); + bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); } while (len != 0); err = 0; out: diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 4ef3f714046a..526a4c1bed99 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page) } EXPORT_SYMBOL(fscrypt_free_bounce_page); +/* + * Generate the IV for the given logical block number within the given file. + * For filenames encryption, lblk_num == 0. + * + * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() + * needs to know about any IV generation methods where the low bits of IV don't + * simply contain the lblk_num (e.g., IV_INO_LBLK_32). + */ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci) { @@ -240,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); * which must still be locked and not uptodate. Normally, blocksize == * PAGE_SIZE and the whole page is decrypted at once. * - * This is for use by the filesystem's ->readpages() method. + * This is for use by the filesystem's ->readahead() method. * * Return: 0 on success; -errno on failure */ diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index c57bebfa48fe..93c2ca858092 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -17,6 +17,7 @@ #include <linux/buffer_head.h> #include <linux/sched/mm.h> #include <linux/slab.h> +#include <linux/uio.h> #include "fscrypt_private.h" @@ -315,6 +316,10 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh); * * fscrypt_set_bio_crypt_ctx() must have already been called on the bio. * + * This function isn't required in cases where crypto-mergeability is ensured in + * another way, such as I/O targeting only a single file (and thus a single key) + * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity. + * * Return: true iff the I/O is mergeable */ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, @@ -363,3 +368,91 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, return fscrypt_mergeable_bio(bio, inode, next_lblk); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); + +/** + * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is + * supported as far as encryption is concerned + * @iocb: the file and position the I/O is targeting + * @iter: the I/O data segment(s) + * + * Return: %true if there are no encryption constraints that prevent DIO from + * being supported; %false if DIO is unsupported. (Note that in the + * %true case, the filesystem might have other, non-encryption-related + * constraints that prevent DIO from actually being supported.) + */ +bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +{ + const struct inode *inode = file_inode(iocb->ki_filp); + const unsigned int blocksize = i_blocksize(inode); + + /* If the file is unencrypted, no veto from us. */ + if (!fscrypt_needs_contents_encryption(inode)) + return true; + + /* We only support DIO with inline crypto, not fs-layer crypto. */ + if (!fscrypt_inode_uses_inline_crypto(inode)) + return false; + + /* + * Since the granularity of encryption is filesystem blocks, the file + * position and total I/O length must be aligned to the filesystem block + * size -- not just to the block device's logical block size as is + * traditionally the case for DIO on many filesystems. + * + * We require that the user-provided memory buffers be filesystem block + * aligned too. It is simpler to have a single alignment value required + * for all properties of the I/O, as is normally the case for DIO. + * Also, allowing less aligned buffers would imply that data units could + * cross bvecs, which would greatly complicate the I/O stack, which + * assumes that bios can be split at any bvec boundary. + */ + if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(fscrypt_dio_supported); + +/** + * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs + * @inode: the file on which I/O is being done + * @lblk: the block at which the I/O is being started from + * @nr_blocks: the number of blocks we want to submit starting at @lblk + * + * Determine the limit to the number of blocks that can be submitted in a bio + * targeting @lblk without causing a data unit number (DUN) discontiguity. + * + * This is normally just @nr_blocks, as normally the DUNs just increment along + * with the logical blocks. (Or the file is not encrypted.) + * + * In rare cases, fscrypt can be using an IV generation method that allows the + * DUN to wrap around within logically contiguous blocks, and that wraparound + * will occur. If this happens, a value less than @nr_blocks will be returned + * so that the wraparound doesn't occur in the middle of a bio, which would + * cause encryption/decryption to produce wrong results. + * + * Return: the actual number of blocks that can be submitted + */ +u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) +{ + const struct fscrypt_info *ci; + u32 dun; + + if (!fscrypt_inode_uses_inline_crypto(inode)) + return nr_blocks; + + if (nr_blocks <= 1) + return nr_blocks; + + ci = inode->i_crypt_info; + if (!(fscrypt_policy_flags(&ci->ci_policy) & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) + return nr_blocks; + + /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */ + + dun = ci->ci_hashed_ino + lblk; + + return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun); +} +EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks); @@ -11,7 +11,6 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/fs.h> -#include <linux/genhd.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> @@ -390,7 +389,7 @@ static struct page *dax_busy_page(void *entry) } /* - * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * dax_lock_page - Lock the DAX entry corresponding to a page * @page: The page whose entry we want to lock * * Context: Process context. diff --git a/fs/dcache.c b/fs/dcache.c index c84269c6e8bf..93f4f5ee07bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1766,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) char *dname; int err; - dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, + GFP_KERNEL); if (!dentry) return NULL; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2f117c57160d..3dcf0b8b4e93 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -450,6 +450,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. + * + * NOTE: it's expected that most callers should _ignore_ the errors returned + * by this function. Other debugfs functions handle the fact that the "dentry" + * passed to them could be an error and they don't crash in that case. + * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -551,6 +556,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. + * + * NOTE: it's expected that most callers should _ignore_ the errors returned + * by this function. Other debugfs functions handle the fact that the "dentry" + * passed to them could be an error and they don't crash in that case. + * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 654443558047..aef06e607b40 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -396,18 +396,12 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * bio_alloc() is guaranteed to return a bio when allowed to sleep and * we request a valid number of vectors. */ - bio = bio_alloc(GFP_KERNEL, nr_vecs); - - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL); bio->bi_iter.bi_sector = first_sector; - bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; - - bio->bi_write_hint = dio->iocb->ki_hint; - sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 7d85e64ea62f..9ad61b582f07 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -540,12 +540,13 @@ const struct address_space_operations ecryptfs_aops = { * XXX: This is pretty broken for multiple reasons: ecryptfs does not * actually use buffer_heads, and ecryptfs will crash without * CONFIG_BLOCK. But it matches the behavior before the default for - * address_space_operations without the ->set_page_dirty method was + * address_space_operations without the ->dirty_folio method was * cleaned up, so this is the best we can do without maintainer * feedback. */ #ifdef CONFIG_BLOCK - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, #endif .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 39116af0390f..0b1c878317ab 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) struct ecryptfs_inode_info *inode_info; struct inode *inode = NULL; - inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); + inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { diff --git a/fs/efs/super.c b/fs/efs/super.c index 62b155b9366b..b287f47c165b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 226a57c57ee6..780db1e5f4b7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -28,10 +28,10 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, struct inode *inode, + erofs_blk_t blkaddr, enum erofs_kmap_type type) { - struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *const mapping = inode->i_mapping; erofs_off_t offset = blknr_to_addr(blkaddr); pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; @@ -60,6 +60,12 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, return buf->base + (offset & ~PAGE_MASK); } +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) +{ + return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); +} + static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map, int flags) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index eee9b0b31b63..18e59821c597 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022, Alibaba Cloud */ #include "internal.h" @@ -67,7 +68,7 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, static int erofs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); - struct address_space *mapping = dir->i_mapping; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; const size_t dirsize = i_size_read(dir); unsigned int i = ctx->pos / EROFS_BLKSIZ; unsigned int ofs = ctx->pos % EROFS_BLKSIZ; @@ -75,26 +76,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) bool initial = true; while (ctx->pos < dirsize) { - struct page *dentry_page; struct erofs_dirent *de; unsigned int nameoff, maxsize; - dentry_page = read_mapping_page(mapping, i, NULL); - if (dentry_page == ERR_PTR(-ENOMEM)) { - err = -ENOMEM; - break; - } else if (IS_ERR(dentry_page)) { + de = erofs_bread(&buf, dir, i, EROFS_KMAP); + if (IS_ERR(de)) { erofs_err(dir->i_sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); - err = -EFSCORRUPTED; + err = PTR_ERR(de); break; } - de = (struct erofs_dirent *)kmap(dentry_page); - nameoff = le16_to_cpu(de->nameoff); - if (nameoff < sizeof(struct erofs_dirent) || nameoff >= PAGE_SIZE) { erofs_err(dir->i_sb, @@ -119,10 +113,6 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) err = erofs_fill_dentries(dir, ctx, de, &ofs, nameoff, maxsize); skip_this: - kunmap(dentry_page); - - put_page(dentry_page); - ctx->pos = blknr_to_addr(i) + ofs; if (err) @@ -130,6 +120,7 @@ skip_this: ++i; ofs = 0; } + erofs_put_metabuf(&buf); return err < 0 ? err : 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 3ea62c6fb00a..1238ca104f09 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,6 +12,7 @@ #define EROFS_SUPER_OFFSET 1024 #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -186,8 +187,8 @@ struct erofs_inode_extended { __le32 i_uid; __le32 i_gid; - __le64 i_ctime; - __le32 i_ctime_nsec; + __le64 i_mtime; + __le32 i_mtime_nsec; __le32 i_nlink; __u8 i_reserved2[16]; }; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ff62f84f47d3..e8b37ba5e9ad 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -113,8 +113,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, set_nlink(inode, le32_to_cpu(die->i_nlink)); /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime); + inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec); inode->i_size = le64_to_cpu(die->i_size); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b8272fb95fd6..5298c4ee277d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -325,7 +325,7 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - unsigned int z_idataoff; + erofs_off_t z_idataoff; unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ @@ -479,6 +479,8 @@ struct erofs_map_dev { extern const struct file_operations erofs_file_fops; void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); +void *erofs_bread(struct erofs_buf *buf, struct inode *inode, + erofs_blk_t blkaddr, enum erofs_kmap_type type); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 8629e616028c..554efa363317 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" @@ -86,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static struct page *find_target_block_classic(struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *find_target_block_classic(struct erofs_buf *target, + struct inode *dir, + struct erofs_qstr *name, + int *_ndirents) { unsigned int startprfx, endprfx; int head, back; - struct address_space *const mapping = dir->i_mapping; - struct page *candidate = ERR_PTR(-ENOENT); + void *candidate = ERR_PTR(-ENOENT); startprfx = endprfx = 0; head = 0; @@ -101,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir, while (head <= back) { const int mid = head + (back - head) / 2; - struct page *page = read_mapping_page(mapping, mid, NULL); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_dirent *de; - if (!IS_ERR(page)) { - struct erofs_dirent *de = kmap_atomic(page); + de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, EROFS_BLKSIZ); const int ndirents = nameoff / sizeof(*de); @@ -113,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir, struct erofs_qstr dname; if (!ndirents) { - kunmap_atomic(de); - put_page(page); + erofs_put_metabuf(&buf); erofs_err(dir->i_sb, "corrupted dir block %d @ nid %llu", mid, EROFS_I(dir)->nid); DBG_BUGON(1); - page = ERR_PTR(-EFSCORRUPTED); + de = ERR_PTR(-EFSCORRUPTED); goto out; } @@ -135,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir, /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); - kunmap_atomic(de); if (!diff) { *_ndirents = 0; @@ -145,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir, startprfx = matched; if (!IS_ERR(candidate)) - put_page(candidate); - candidate = page; + erofs_put_metabuf(target); + *target = buf; + candidate = de; *_ndirents = ndirents; } else { - put_page(page); + erofs_put_metabuf(&buf); back = mid - 1; endprfx = matched; @@ -158,8 +159,8 @@ static struct page *find_target_block_classic(struct inode *dir, } out: /* free if the candidate is valid */ if (!IS_ERR(candidate)) - put_page(candidate); - return page; + erofs_put_metabuf(target); + return de; } return candidate; } @@ -169,8 +170,7 @@ int erofs_namei(struct inode *dir, erofs_nid_t *nid, unsigned int *d_type) { int ndirents; - struct page *page; - void *data; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; struct erofs_qstr qn; @@ -181,26 +181,20 @@ int erofs_namei(struct inode *dir, qn.end = name->name + name->len; ndirents = 0; - page = find_target_block_classic(dir, &qn, &ndirents); - if (IS_ERR(page)) - return PTR_ERR(page); + de = find_target_block_classic(&buf, dir, &qn, &ndirents); + if (IS_ERR(de)) + return PTR_ERR(de); - data = kmap_atomic(page); /* the target page has been mapped */ if (ndirents) - de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents); - else - de = (struct erofs_dirent *)data; + de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); *d_type = de->file_type; } - - kunmap_atomic(data); - put_page(page); - + erofs_put_metabuf(&buf); return PTR_ERR_OR_ZERO(de); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 915eefe0d7e2..0c4b41130c2f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -84,7 +84,7 @@ static void erofs_inode_init_once(void *ptr) static struct inode *erofs_alloc_inode(struct super_block *sb) { struct erofs_inode *vi = - kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; @@ -281,21 +281,19 @@ static int erofs_init_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; unsigned int blkszbits; void *data; int ret; - page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (IS_ERR(page)) { + data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); - return PTR_ERR(page); + return PTR_ERR(data); } sbi = EROFS_SB(sb); - - data = kmap(page); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; @@ -365,8 +363,7 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_sb_has_ztailpacking(sbi)) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); out: - kunmap(page); - put_page(page); + erofs_put_metabuf(&buf); return ret; } @@ -535,25 +532,29 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) return ret; } -static void erofs_managed_cache_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * We could introduce an extra locking instead but it seems unnecessary. + */ +static void erofs_managed_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) { - const unsigned int stop = length + offset; + const size_t stop = length + offset; - DBG_BUGON(!PageLocked(page)); + DBG_BUGON(!folio_test_locked(folio)); /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > PAGE_SIZE || stop < length); + DBG_BUGON(stop > folio_size(folio) || stop < length); - if (offset == 0 && stop == PAGE_SIZE) - while (!erofs_managed_cache_releasepage(page, GFP_NOFS)) + if (offset == 0 && stop == folio_size(folio)) + while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS)) cond_resched(); } static const struct address_space_operations managed_cache_aops = { .releasepage = erofs_managed_cache_releasepage, - .invalidatepage = erofs_managed_cache_invalidatepage, + .invalidate_folio = erofs_managed_cache_invalidate_folio, }; static int erofs_init_managed_cache(struct super_block *sb) @@ -568,8 +569,7 @@ static int erofs_init_managed_cache(struct super_block *sb) inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, - GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->managed_cache = inode; return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index dac252bc9228..f3babf1e6608 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -221,9 +221,11 @@ void erofs_unregister_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_kobj.state_in_sysfs) { + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } } int __init erofs_init_sysfs(void) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 423bc1a61da5..e6dea6dfca16 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -192,7 +192,10 @@ enum z_erofs_collectmode { COLLECT_PRIMARY_FOLLOWED, }; -struct z_erofs_collector { +struct z_erofs_decompress_frontend { + struct inode *const inode; + struct erofs_map_blocks map; + struct z_erofs_pagevec_ctor vector; struct z_erofs_pcluster *pcl, *tailpcl; @@ -202,13 +205,6 @@ struct z_erofs_collector { z_erofs_next_pcluster_t owned_head; enum z_erofs_collectmode mode; -}; - -struct z_erofs_decompress_frontend { - struct inode *const inode; - - struct z_erofs_collector clt; - struct erofs_map_blocks map; bool readahead; /* used for applying cache strategy on the fly */ @@ -216,30 +212,30 @@ struct z_erofs_decompress_frontend { erofs_off_t headoffset; }; -#define COLLECTOR_INIT() { \ - .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } - #define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .clt = COLLECTOR_INIT(), \ - .backmost = true, } + .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = COLLECT_PRIMARY_FOLLOWED } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); -static void preload_compressed_pages(struct z_erofs_collector *clt, - struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, + enum z_erofs_cache_alloctype type, + struct page **pagepool) { - struct z_erofs_pcluster *pcl = clt->pcl; + struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); + struct z_erofs_pcluster *pcl = fe->pcl; bool standalone = true; + /* + * optimistic allocation without direct reclaim since inplace I/O + * can be used if low memory otherwise. + */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; struct page **pages; pgoff_t index; - if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < COLLECT_PRIMARY_FOLLOWED) return; pages = pcl->compressed_pages; @@ -288,7 +284,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -350,47 +346,47 @@ int erofs_try_to_free_cached_page(struct page *page) } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, +static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct page *page) { - struct z_erofs_pcluster *const pcl = clt->pcl; + struct z_erofs_pcluster *const pcl = fe->pcl; - while (clt->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--clt->icpage_ptr, NULL, page)) + while (fe->icpage_ptr > pcl->compressed_pages) + if (!cmpxchg(--fe->icpage_ptr, NULL, page)) return true; return false; } /* callers must be with collection lock held */ -static int z_erofs_attach_page(struct z_erofs_collector *clt, +static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct page *page, enum z_erofs_page_type type, bool pvec_safereuse) { int ret; /* give priority for inplaceio */ - if (clt->mode >= COLLECT_PRIMARY && + if (fe->mode >= COLLECT_PRIMARY && type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(clt, page)) + z_erofs_try_inplace_io(fe, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, page, type, + ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, pvec_safereuse); - clt->cl->vcnt += (unsigned int)ret; + fe->cl->vcnt += (unsigned int)ret; return ret ? 0 : -EAGAIN; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt) +static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) { - struct z_erofs_pcluster *pcl = clt->pcl; - z_erofs_next_pcluster_t *owned_head = &clt->owned_head; + struct z_erofs_pcluster *pcl = f->pcl; + z_erofs_next_pcluster_t *owned_head = &f->owned_head; /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ - clt->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = COLLECT_PRIMARY_FOLLOWED; return; } @@ -401,24 +397,24 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt) if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - clt->mode = COLLECT_PRIMARY_HOOKED; - clt->tailpcl = NULL; + f->mode = COLLECT_PRIMARY_HOOKED; + f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - clt->mode = COLLECT_PRIMARY; + f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_collection(struct z_erofs_collector *clt, +static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, struct inode *inode, struct erofs_map_blocks *map) { - struct z_erofs_pcluster *pcl = clt->pcl; + struct z_erofs_pcluster *pcl = fe->pcl; struct z_erofs_collection *cl; unsigned int length; /* to avoid unexpected loop formed by corrupted images */ - if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { + if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -449,15 +445,15 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, } mutex_lock(&cl->lock); /* used to check tail merging loop due to corrupted images */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = pcl; + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = pcl; - z_erofs_try_to_claim_pcluster(clt); - clt->cl = cl; + z_erofs_try_to_claim_pcluster(fe); + fe->cl = cl; return 0; } -static int z_erofs_register_collection(struct z_erofs_collector *clt, +static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, struct inode *inode, struct erofs_map_blocks *map) { @@ -485,8 +481,8 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, Z_EROFS_PCLUSTER_FULL_LENGTH : 0); /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = clt->owned_head; - clt->mode = COLLECT_PRIMARY_FOLLOWED; + pcl->next = fe->owned_head; + fe->mode = COLLECT_PRIMARY_FOLLOWED; cl = z_erofs_primarycollection(pcl); cl->pageofs = map->m_la & ~PAGE_MASK; @@ -512,18 +508,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, } if (grp != &pcl->obj) { - clt->pcl = container_of(grp, + fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); err = -EEXIST; goto err_out; } } /* used to check tail merging loop due to corrupted images */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = pcl; - clt->owned_head = &pcl->next; - clt->pcl = pcl; - clt->cl = cl; + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = pcl; + fe->owned_head = &pcl->next; + fe->pcl = pcl; + fe->cl = cl; return 0; err_out: @@ -532,18 +528,18 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_collector *clt, +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, struct inode *inode, struct erofs_map_blocks *map) { struct erofs_workgroup *grp; int ret; - DBG_BUGON(clt->cl); + DBG_BUGON(fe->cl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ - DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); if (map->m_flags & EROFS_MAP_META) { if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { @@ -555,28 +551,28 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { - clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); + fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_collection(clt, inode, map); + ret = z_erofs_register_collection(fe, inode, map); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_collection(clt, inode, map); + ret = z_erofs_lookup_collection(fe, inode, map); if (ret) { - erofs_workgroup_put(&clt->pcl->obj); + erofs_workgroup_put(&fe->pcl->obj); return ret; } out: - z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, - clt->cl->pagevec, clt->cl->vcnt); + z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, + fe->cl->pagevec, fe->cl->vcnt); /* since file-backed online pages are traversed in reverse order */ - clt->icpage_ptr = clt->pcl->compressed_pages + - z_erofs_pclusterpages(clt->pcl); + fe->icpage_ptr = fe->pcl->compressed_pages + + z_erofs_pclusterpages(fe->pcl); return 0; } @@ -610,24 +606,24 @@ static void z_erofs_collection_put(struct z_erofs_collection *cl) erofs_workgroup_put(&pcl->obj); } -static bool z_erofs_collector_end(struct z_erofs_collector *clt) +static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = clt->cl; + struct z_erofs_collection *cl = fe->cl; if (!cl) return false; - z_erofs_pagevec_ctor_exit(&clt->vector, false); + z_erofs_pagevec_ctor_exit(&fe->vector, false); mutex_unlock(&cl->lock); /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ - if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) z_erofs_collection_put(cl); - clt->cl = NULL; + fe->cl = NULL; return true; } @@ -651,7 +647,6 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; - struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); bool tight = true; @@ -672,7 +667,7 @@ repeat: if (offset + cur >= map->m_la && offset + cur < map->m_la + map->m_llen) { /* didn't get a valid collection previously (very rare) */ - if (!clt->cl) + if (!fe->cl) goto restart_now; goto hitted; } @@ -680,7 +675,7 @@ repeat: /* go ahead the next map_blocks */ erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); - if (z_erofs_collector_end(clt)) + if (z_erofs_collector_end(fe)) fe->backmost = false; map->m_la = offset + cur; @@ -693,11 +688,11 @@ restart_now: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(clt, inode, map); + err = z_erofs_collector_begin(fe, inode, map); if (err) goto err_out; - if (z_erofs_is_inline_pcluster(clt->pcl)) { + if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, @@ -709,20 +704,18 @@ restart_now: goto err_out; } get_page(fe->map.buf.page); - WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page); - clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); + fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } else { - /* preload all compressed pages (can change mode if needed) */ + /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + z_erofs_bind_cache(fe, cache_strategy, pagepool); } - hitted: /* * Ensure the current partial page belongs to this submit chain rather @@ -730,8 +723,8 @@ hitted: * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or pagevec (should be processed in strict order.) */ - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && - clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && + fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -746,18 +739,18 @@ hitted: Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); if (cur) - tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(clt, page, page_type, - clt->mode >= COLLECT_PRIMARY_FOLLOWED); + err = z_erofs_attach_page(fe, page, page_type, + fe->mode >= COLLECT_PRIMARY_FOLLOWED); /* should allocate an additional short-lived page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(clt, newpage, + err = z_erofs_attach_page(fe, newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); if (!err) goto retry; @@ -773,7 +766,7 @@ retry: /* bump up the number of spiltted parts of a page */ ++spiltted; /* also update nr_pages */ - clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); + fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1); next_part: /* can be used for verification */ map->m_llen = offset + cur - map->m_la; @@ -1073,12 +1066,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* wake up the caller thread for sync decompression */ if (sync) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); + complete(&io->u.done); + return; } @@ -1098,10 +1088,10 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, struct page **pagepool, - struct address_space *mc, - gfp_t gfp) + struct address_space *mc) { const pgoff_t index = pcl->obj.index; + gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; struct address_space *mapping; @@ -1224,7 +1214,7 @@ jobqueue_init(struct super_block *sb, } else { fg_out: q = fgq; - init_waitqueue_head(&fgq->u.wait); + init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); } q->sb = sb; @@ -1309,7 +1299,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; - z_erofs_next_pcluster_t owned_head = f->clt.owned_head; + z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; struct block_device *last_bdev; @@ -1357,8 +1347,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi), - GFP_NOFS); + MNGD_MAPPING(sbi)); if (!page) continue; @@ -1370,15 +1359,14 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, mdev.m_bdev); last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1417,7 +1405,7 @@ static void z_erofs_runqueue(struct super_block *sb, { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); @@ -1428,8 +1416,7 @@ static void z_erofs_runqueue(struct super_block *sb, return; /* wait until all bios are completed */ - io_wait_event(io[JQ_SUBMIT].u.wait, - !atomic_read(&io[JQ_SUBMIT].pending_bios)); + wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); @@ -1517,7 +1504,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) err = z_erofs_do_read_page(&f, page, &pagepool); z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - (void)z_erofs_collector_end(&f.clt); + (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(inode->i_sb, &f, &pagepool, @@ -1567,7 +1554,7 @@ static void z_erofs_readahead(struct readahead_control *rac) put_page(page); } z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); - (void)z_erofs_collector_end(&f.clt); + (void)z_erofs_collector_end(&f); z_erofs_runqueue(inode->i_sb, &f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e043216b545f..800b11c53f57 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -97,7 +97,7 @@ struct z_erofs_decompressqueue { z_erofs_next_pcluster_t head; union { - wait_queue_head_t wait; + struct completion done; struct work_struct work; } u; }; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 361b1d6e4bf9..572f0b8151ba 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -431,48 +431,47 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { struct erofs_inode *const vi = EROFS_I(m->inode); - struct erofs_map_blocks *const map = m->map; const unsigned int lclusterbits = vi->z_logical_clusterbits; - unsigned long lcn = m->lcn; - int err; - if (lcn < lookback_distance) { - erofs_err(m->inode->i_sb, - "bogus lookback distance @ nid %llu", vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } + while (m->lcn >= lookback_distance) { + unsigned long lcn = m->lcn - lookback_distance; + int err; - /* load extent head logical cluster if needed */ - lcn -= lookback_distance; - err = z_erofs_load_cluster_from_disk(m, lcn, false); - if (err) - return err; + /* load extent head logical cluster if needed */ + err = z_erofs_load_cluster_from_disk(m, lcn, false); + if (err) + return err; - switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - if (!m->delta[0]) { + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (!m->delta[0]) { + erofs_err(m->inode->i_sb, + "invalid lookback distance 0 @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + lookback_distance = m->delta[0]; + continue; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + m->headtype = m->type; + m->map->m_la = (lcn << lclusterbits) | m->clusterofs; + return 0; + default: erofs_err(m->inode->i_sb, - "invalid lookback distance 0 @ nid %llu", - vi->nid); + "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); DBG_BUGON(1); - return -EFSCORRUPTED; + return -EOPNOTSUPP; } - return z_erofs_extent_lookback(m, m->delta[0]); - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - m->headtype = m->type; - map->m_la = (lcn << lclusterbits) | m->clusterofs; - break; - default: - erofs_err(m->inode->i_sb, - "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } - return 0; + + erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; } static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, @@ -494,7 +493,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { - map->m_plen = 1 << lclusterbits; + map->m_plen = 1ULL << lclusterbits; return 0; } lcn = m->lcn + 1; @@ -540,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return -EFSCORRUPTED; } out: - map->m_plen = m->compressedlcs << lclusterbits; + map->m_plen = (u64)m->compressedlcs << lclusterbits; return 0; err_bonus_cblkcnt: erofs_err(m->inode->i_sb, diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302..e3e55d5e0be1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -56,7 +56,6 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> -#include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> @@ -118,7 +117,7 @@ bool path_noexec(const struct path *path) * Note that a shared library must be both readable and executable due to * security reasons. * - * Also note that we take the address to load from from the file itself. + * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { @@ -495,8 +494,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; @@ -536,7 +541,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, if (!valid_arg_len(bprm, len)) goto out; - /* We're going to work our way backwords. */ + /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; @@ -1269,7 +1274,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Must be called _before_ exec_mmap() as bprm->mm is - * not visibile until then. This also enables the update + * not visible until then. This also enables the update * to be lockless. */ retval = set_mm_exe_file(bprm->mm, bprm->file); @@ -1303,12 +1308,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - /* - * Ensure that the uaccess routines can actually operate on userspace - * pointers: - */ - force_uaccess_begin(); - if (me->flags & PF_KTHREAD) free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | @@ -1897,6 +1896,9 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1923,6 +1925,19 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). + */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); @@ -1951,6 +1966,8 @@ int kernel_execve(const char *kernel_filename, } retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 619e5b4bed10..c6800b880920 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -203,7 +203,8 @@ struct exfat_mount_options { /* on error: continue, panic, remount-ro */ enum exfat_error_mode errors; unsigned utf8:1, /* Use of UTF-8 character set */ - discard:1; /* Issue discard requests on deletions */ + discard:1, /* Issue discard requests on deletions */ + keep_last_dots:1; /* Keep trailing periods in paths */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ }; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index d890fd34bb2d..2f5130059236 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -218,8 +218,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) if (exfat_free_cluster(inode, &clu)) return -EIO; - exfat_clear_volume_dirty(sb); - return 0; } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index df805bd05508..fc0ea1684880 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -490,7 +490,8 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations exfat_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = exfat_readpage, .readahead = exfat_readahead, .writepage = exfat_writepage, diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index af4eb39cc0c3..a02a04a993bf 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -65,11 +65,14 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) return ret; } -/* returns the length of a struct qstr, ignoring trailing dots */ -static unsigned int exfat_striptail_len(unsigned int len, const char *name) +/* returns the length of a struct qstr, ignoring trailing dots if necessary */ +static unsigned int exfat_striptail_len(unsigned int len, const char *name, + bool keep_last_dots) { - while (len && name[len - 1] == '.') - len--; + if (!keep_last_dots) { + while (len && name[len - 1] == '.') + len--; + } return len; } @@ -83,7 +86,8 @@ static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr) struct super_block *sb = dentry->d_sb; struct nls_table *t = EXFAT_SB(sb)->nls_io; const unsigned char *name = qstr->name; - unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned int len = exfat_striptail_len(qstr->len, qstr->name, + EXFAT_SB(sb)->options.keep_last_dots); unsigned long hash = init_name_hash(dentry); int i, charlen; wchar_t c; @@ -104,8 +108,10 @@ static int exfat_d_cmp(const struct dentry *dentry, unsigned int len, { struct super_block *sb = dentry->d_sb; struct nls_table *t = EXFAT_SB(sb)->nls_io; - unsigned int alen = exfat_striptail_len(name->len, name->name); - unsigned int blen = exfat_striptail_len(len, str); + unsigned int alen = exfat_striptail_len(name->len, name->name, + EXFAT_SB(sb)->options.keep_last_dots); + unsigned int blen = exfat_striptail_len(len, str, + EXFAT_SB(sb)->options.keep_last_dots); wchar_t c1, c2; int charlen, i; @@ -136,7 +142,8 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr) { struct super_block *sb = dentry->d_sb; const unsigned char *name = qstr->name; - unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned int len = exfat_striptail_len(qstr->len, qstr->name, + EXFAT_SB(sb)->options.keep_last_dots); unsigned long hash = init_name_hash(dentry); int i, charlen; unicode_t u; @@ -161,8 +168,11 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct super_block *sb = dentry->d_sb; - unsigned int alen = exfat_striptail_len(name->len, name->name); - unsigned int blen = exfat_striptail_len(len, str); + unsigned int alen = exfat_striptail_len(name->len, name->name, + EXFAT_SB(sb)->options.keep_last_dots); + unsigned int blen = exfat_striptail_len(len, str, + EXFAT_SB(sb)->options.keep_last_dots); + unicode_t u_a, u_b; int charlen, i; @@ -416,13 +426,25 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); + int pathlen = strlen(path); - /* strip all trailing periods */ - namelen = exfat_striptail_len(strlen(path), path); + /* + * get the length of the pathname excluding + * trailing periods, if any. + */ + namelen = exfat_striptail_len(pathlen, path, false); + if (EXFAT_SB(sb)->options.keep_last_dots) { + /* + * Do not allow the creation of files with names + * ending with period(s). + */ + if (!lookup && (namelen < pathlen)) + return -EINVAL; + namelen = pathlen; + } if (!namelen) return -ENOENT; - - if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) + if (pathlen > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) return -ENAMETOOLONG; /* @@ -554,7 +576,6 @@ static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, exfat_set_volume_dirty(sb); err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE, &info); - exfat_clear_volume_dirty(sb); if (err) goto unlock; @@ -812,7 +833,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) /* This doesn't modify ei */ ei->dir.dir = DIR_DELETED; - exfat_clear_volume_dirty(sb); inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); @@ -846,7 +866,6 @@ static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, exfat_set_volume_dirty(sb); err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR, &info); - exfat_clear_volume_dirty(sb); if (err) goto unlock; @@ -976,7 +995,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } ei->dir.dir = DIR_DELETED; - exfat_clear_volume_dirty(sb); inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); @@ -1311,7 +1329,6 @@ del_out: */ new_ei->dir.dir = DIR_DELETED; } - exfat_clear_volume_dirty(sb); out: return ret; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8c9fb7dcec16..8ca21e7917d1 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -100,7 +100,6 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data; - bool sync; /* retain persistent-flags */ new_flags |= sbi->vol_flags_persistent; @@ -119,16 +118,11 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags) p_boot->vol_flags = cpu_to_le16(new_flags); - if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh)) - sync = true; - else - sync = false; - set_buffer_uptodate(sbi->boot_bh); mark_buffer_dirty(sbi->boot_bh); - if (sync) - sync_dirty_buffer(sbi->boot_bh); + __sync_dirty_buffer(sbi->boot_bh, REQ_SYNC | REQ_FUA | REQ_PREFLUSH); + return 0; } @@ -174,6 +168,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",errors=remount-ro"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->keep_last_dots) + seq_puts(m, ",keep_last_dots"); if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); return 0; @@ -183,7 +179,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb) { struct exfat_inode_info *ei; - ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS); if (!ei) return NULL; @@ -217,6 +213,7 @@ enum { Opt_charset, Opt_errors, Opt_discard, + Opt_keep_last_dots, Opt_time_offset, /* Deprecated options */ @@ -243,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_string("iocharset", Opt_charset), fsparam_enum("errors", Opt_errors, exfat_param_enums), fsparam_flag("discard", Opt_discard), + fsparam_flag("keep_last_dots", Opt_keep_last_dots), fsparam_s32("time_offset", Opt_time_offset), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), @@ -297,6 +295,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_discard: opts->discard = 1; break; + case Opt_keep_last_dots: + opts->keep_last_dots = 1; + break; case Opt_time_offset: /* * Make the limit 24 just in case someone invents something diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index df14e750e9fe..998dd2ac8008 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode) unsigned long offset; unsigned long block; struct ext2_group_desc * gdp; - struct backing_dev_info *bdi; - - bdi = inode_to_bdi(inode); - if (bdi_rw_congested(bdi)) - return; block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 602578b72d8c..52377a0ee735 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -967,7 +967,8 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc } const struct address_space_operations ext2_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_writepage, @@ -982,7 +983,8 @@ const struct address_space_operations ext2_aops = { }; const struct address_space_operations ext2_nobh_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, @@ -998,8 +1000,7 @@ const struct address_space_operations ext2_nobh_aops = { static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, + .dirty_folio = noop_dirty_folio, }; /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 94f1fbd7d3ac..f6a19f6d9f6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep; static struct inode *ext2_alloc_inode(struct super_block *sb) { struct ext2_inode_info *ei; - ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->i_block_alloc_info = NULL; @@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits) res += 1LL << (bits-2); res += 1LL << (2*(bits-2)); res += 1LL << (3*(bits-2)); + /* Compute how many metadata blocks are needed */ + meta_blocks = 1; + meta_blocks += 1 + ppb; + meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ - if (res < upper_limit) + if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 80414dcba6e1..1db12847a83b 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -55,13 +55,13 @@ struct ext4_fc_del_range { struct ext4_fc_dentry_info { __le32 fc_parent_ino; __le32 fc_ino; - __u8 fc_dname[0]; + __u8 fc_dname[]; }; /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ struct ext4_fc_inode { __le32 fc_ino; - __u8 fc_raw_inode[0]; + __u8 fc_raw_inode[]; }; /* Value structure for tag EXT4_FC_TAG_TAIL. */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8cc11715518a..6feb07e3e1eb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,9 +36,11 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct inode *inode) +static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) { - if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + struct inode *inode = file_inode(iocb->ki_filp); + + if (!fscrypt_dio_supported(iocb, iter)) return false; if (fsverity_active(inode)) return false; @@ -61,7 +63,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(inode)) { + if (!ext4_dio_supported(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -265,7 +267,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, goto out; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; out: @@ -509,7 +511,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O. */ - if (!ext4_dio_supported(inode)) { + if (!ext4_dio_supported(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d815502cc97c..646ece9b3455 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -137,8 +137,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static void ext4_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -186,7 +184,7 @@ void ext4_evict_inode(struct inode *inode) * journal. So although mm thinks everything is clean and * ready for reaping the inode might still have some pages to * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidatepage() + * checkpointed. Thus calling jbd2_journal_invalidate_folio() * (via truncate_inode_pages()) to discard these buffers can * cause data loss. Also even if we did not discard these * buffers, we would have no way to find them after the inode @@ -1571,16 +1569,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + struct folio *folio = page_folio(page); - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); if (invalidate) { - if (page_mapped(page)) - clear_page_dirty_for_io(page); - block_invalidatepage(page, 0, PAGE_SIZE); - ClearPageUptodate(page); + if (folio_mapped(folio)) + folio_clear_dirty_for_io(folio); + block_invalidate_folio(folio, 0, + folio_size(folio)); + folio_clear_uptodate(folio); } - unlock_page(page); + folio_unlock(folio); } pagevec_release(&pvec); } @@ -1971,6 +1971,7 @@ out_no_pagelock: static int ext4_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret = 0; loff_t size; unsigned int len; @@ -1980,8 +1981,8 @@ static int ext4_writepage(struct page *page, bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return -EIO; } @@ -3207,40 +3208,39 @@ static void ext4_readahead(struct readahead_control *rac) ext4_mpage_readpages(inode, rac, NULL); } -static void ext4_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void ext4_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - trace_ext4_invalidatepage(page, offset, length); + trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ - WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); - block_invalidatepage(page, offset, length); + block_invalidate_folio(folio, offset, length); } -static int __ext4_journalled_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +static int __ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, size_t length) { - journal_t *journal = EXT4_JOURNAL(page->mapping->host); + journal_t *journal = EXT4_JOURNAL(folio->mapping->host); - trace_ext4_journalled_invalidatepage(page, offset, length); + trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0 && length == PAGE_SIZE) - ClearPageChecked(page); + if (offset == 0 && length == folio_size(folio)) + folio_clear_checked(folio); - return jbd2_journal_invalidatepage(journal, page, offset, length); + return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ -static void ext4_journalled_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +static void ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, + size_t length) { - WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); + WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3434,6 +3434,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; out: + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3566,31 +3573,32 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the page is being dirtied, corresponding buffers should already be - * attached to the transaction (we take care of this in ext4_page_mkwrite() and - * ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->set_page_dirty is called under VFS locks and the page + * Whenever the folio is being dirtied, corresponding buffers should already + * be attached to the transaction (we take care of this in ext4_page_mkwrite() + * and ext4_write_begin()). However we cannot move buffers to dirty transaction + * lists here because ->dirty_folio is called under VFS locks and the folio * is not necessarily locked. * - * We cannot just dirty the page and leave attached buffers clean, because the + * We cannot just dirty the folio and leave attached buffers clean, because the * buffers' dirty state is "definitive". We cannot just set the buffers dirty * or jbddirty because all the journalling code will explode. * - * So what we do is to mark the page "pending dirty" and next time writepage + * So what we do is to mark the folio "pending dirty" and next time writepage * is called, propagate that into the buffers appropriately. */ -static int ext4_journalled_set_page_dirty(struct page *page) +static bool ext4_journalled_dirty_folio(struct address_space *mapping, + struct folio *folio) { - WARN_ON_ONCE(!page_has_buffers(page)); - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + WARN_ON_ONCE(!folio_buffers(folio)); + folio_set_checked(folio); + return filemap_dirty_folio(mapping, folio); } -static int ext4_set_page_dirty(struct page *page) +static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { - WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); - WARN_ON_ONCE(!page_has_buffers(page)); - return __set_page_dirty_buffers(page); + WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); + WARN_ON_ONCE(!folio_buffers(folio)); + return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, @@ -3607,9 +3615,9 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, - .set_page_dirty = ext4_set_page_dirty, + .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, + .invalidate_folio = ext4_invalidate_folio, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, @@ -3625,9 +3633,9 @@ static const struct address_space_operations ext4_journalled_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, + .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, - .invalidatepage = ext4_journalled_invalidatepage, + .invalidate_folio = ext4_journalled_invalidate_folio, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, @@ -3642,9 +3650,9 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, - .set_page_dirty = ext4_set_page_dirty, + .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, + .invalidate_folio = ext4_invalidate_folio, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, @@ -3656,9 +3664,8 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, - .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, }; @@ -5245,13 +5252,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } /* - * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate - * buffers that are attached to a page stradding i_size and are undergoing + * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate + * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { - struct page *page; unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = 0; @@ -5259,25 +5265,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * If the page is fully truncated, we don't need to wait for any commit - * (and we even should not as __ext4_journalled_invalidatepage() may - * strip all buffers from the page but keep the page dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * If the folio is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidate_folio() may + * strip all buffers from the folio but keep the folio dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in - * the page remain valid. This is most beneficial for the common case of + * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { - page = find_lock_page(inode->i_mapping, + struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!page) + if (!folio) return; - ret = __ext4_journalled_invalidatepage(page, offset, - PAGE_SIZE - offset); - unlock_page(page); - put_page(page); + ret = __ext4_journalled_invalidate_folio(folio, offset, + folio_size(folio) - offset); + folio_unlock(folio); + folio_put(folio); if (ret != -EBUSY) return; commit_tid = 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 40b7d8485b44..14695e2b5042 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -325,10 +325,9 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; - char b[BDEVNAME_SIZE]; - if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bio_devname(bio, b), + if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n", + bio->bi_bdev, (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -374,10 +373,8 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { - int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - REQ_SYNC : 0; - io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; - bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); + if (io->io_wbc->sync_mode == WB_SYNC_ALL) + io->io_bio->bi_opf |= REQ_SYNC; submit_bio(io->io_bio); } io->io_bio = NULL; @@ -400,10 +397,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; @@ -423,10 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) { + if (io->io_bio == NULL) io_submit_init_bio(io, bh); - io->io_bio->bi_write_hint = inode->i_write_hint; - } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 4cd62f1d848c..af491e170c4a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -109,7 +109,7 @@ static void verity_work(struct work_struct *work) struct bio *bio = ctx->bio; /* - * fsverity_verify_bio() may call readpages() again, and although verity + * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. @@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); + bio = bio_alloc(bdev, bio_max_segs(nr_pages), + REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, - rac ? REQ_RAHEAD : 0); + if (rac) + bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1847b46af808..1466fbdbc8e3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1321,7 +1321,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; - ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index f46a7339d6cf..03ef087537c7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -143,3 +143,10 @@ config F2FS_IOSTAT Support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events. You have to turn on "iostat_enable" sysfs node to enable this feature. + +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 16e826e01f09..eaa240b21f07 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) return __f2fs_get_acl(inode, type, NULL); } -static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, - struct posix_acl **acl) +static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; @@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; } -static int __f2fs_set_acl(struct inode *inode, int type, +static int __f2fs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { int name_index; @@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = f2fs_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(mnt_userns, inode, + &mode, &acl); if (error) return error; set_acl_inode(inode, mode); @@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(inode, type, acl, NULL); + return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); } /* @@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { - error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); } else { @@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, } if (acl) { if (!error) - error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); } else { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 982f0170639f..f5366feea82d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,6 +98,13 @@ repeat: } if (unlikely(!PageUptodate(page))) { + if (page->index == sbi->metapage_eio_ofs && + sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->metapage_eio_ofs = page->index; + sbi->metapage_eio_cnt = 0; + } f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -282,18 +289,22 @@ out: return blkno - start; } -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) { struct page *page; bool readahead = false; + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + page = find_get_page(META_MAPPING(sbi), index); if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true); + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!down_write_trylock(&sbi->cp_global_sem)) + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -436,26 +447,27 @@ stop: return nwritten; } -static int f2fs_set_meta_page_dirty(struct page *page) +static bool f2fs_dirty_meta_folio(struct address_space *mapping, + struct folio *folio) { - trace_f2fs_set_page_dirty(page, META); - - if (!PageUptodate(page)) - SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - set_page_private_reference(page); - return 1; + trace_f2fs_set_page_dirty(&folio->page, META); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + if (!folio_test_dirty(folio)) { + filemap_dirty_folio(mapping, folio); + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); + set_page_private_reference(&folio->page); + return true; } - return 0; + return false; } const struct address_space_operations f2fs_meta_aops = { .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, - .set_page_dirty = f2fs_set_meta_page_dirty, - .invalidatepage = f2fs_invalidate_page, + .dirty_folio = f2fs_dirty_meta_folio, + .invalidate_folio = f2fs_invalidate_folio, .releasepage = f2fs_release_page, #ifdef CONFIG_MIGRATION .migratepage = f2fs_migrate_page, @@ -864,6 +876,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, @@ -871,15 +884,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (err) return NULL; - if (le32_to_cpu(cp_block->cp_pack_total_block_count) > - sbi->blocks_per_seg) { + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) @@ -1014,7 +1028,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type) stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void f2fs_update_dirty_page(struct inode *inode, struct page *page) +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; @@ -1029,7 +1043,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - set_page_private_reference(page); + set_page_private_reference(&folio->page); } void f2fs_remove_dirty_inode(struct inode *inode) @@ -1159,7 +1173,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - if (!down_write_trylock(&sbi->quota_sem)) + if (!f2fs_down_write_trylock(&sbi->quota_sem)) return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; @@ -1171,7 +1185,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { ret = true; } - up_write(&sbi->quota_sem); + f2fs_up_write(&sbi->quota_sem); return ret; } @@ -1228,10 +1242,10 @@ retry_flush_dents: * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. inode->i_blocks can be updated. */ - down_write(&sbi->node_change); + f2fs_down_write(&sbi->node_change); if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) @@ -1241,15 +1255,15 @@ retry_flush_dents: } retry_flush_nodes: - down_write(&sbi->node_write); + f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); return err; } @@ -1262,13 +1276,13 @@ retry_flush_nodes: * dirty node blocks and some checkpoint values by block allocation. */ __prepare_cp_block(sbi); - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -1543,6 +1557,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); @@ -1612,7 +1627,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1693,7 +1708,7 @@ stop: trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); return err; } @@ -1741,9 +1756,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) struct cp_control cpc = { .reason = CP_SYNC, }; int err; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return err; } @@ -1831,9 +1846,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { int ret; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); ret = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return ret; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d0c3aeba5945..12a56f9e1572 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) } if (ret != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, - dic->rlen, + F2FS_I_SB(dic->inode)->sb->s_id, ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } @@ -1267,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { goto out_free; } @@ -1384,7 +1383,7 @@ unlock_continue: f2fs_put_dnode(&dn); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); @@ -1410,7 +1409,7 @@ out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); out_free: @@ -1505,9 +1504,7 @@ continue_unlock: if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; @@ -1750,7 +1747,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) const struct address_space_operations f2fs_compress_aops = { .releasepage = f2fs_release_page, - .invalidatepage = f2fs_invalidate_page, + .invalidate_folio = f2fs_invalidate_folio, }; struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c417864c66a..8e0c2e773c8d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work) bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* - * fsverity_verify_bio() may call readpages() again, and while verity + * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the @@ -354,7 +354,7 @@ static void f2fs_write_end_io(struct bio *bio) } struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) + block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; @@ -369,10 +369,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } } - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); - } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } @@ -389,22 +388,46 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } +static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +{ + unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int fua_flag = io_flag & temp_mask; + unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if ((1 << fio->temp) & meta_flag) + fio->op_flags |= REQ_META; + if ((1 << fio->temp) & fua_flag) + fio->op_flags |= REQ_FUA; +} + static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); + if (fio->type == DATA) + __attach_io_flag(fio, sbi->data_io_flag); + else if (fio->type == NODE) + __attach_io_flag(fio, sbi->node_io_flag); - f2fs_target_device(sbi, fio->new_blkaddr, bio); + bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); + bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO, + &f2fs_bioset); + bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); @@ -500,34 +523,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi, __submit_bio(sbi, bio, type); } -static void __attach_io_flag(struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int io_flag, fua_flag, meta_flag; - - if (fio->type == DATA) - io_flag = sbi->data_io_flag; - else if (fio->type == NODE) - io_flag = sbi->node_io_flag; - else - return; - - fua_flag = io_flag & temp_mask; - meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; - - /* - * data/node io flag bits per temp: - * REQ_META | REQ_FUA | - * 5 | 4 | 3 | 2 | 1 | 0 | - * Cold | Warm | Hot | Cold | Warm | Hot | - */ - if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -535,9 +530,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - __attach_io_flag(fio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - if (is_read_io(fio->op)) trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else @@ -590,18 +582,17 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -616,9 +607,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_read(&io->io_rwsem); + f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); - up_read(&io->io_rwsem); + f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); @@ -679,9 +670,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); - __attach_io_flag(fio); - bio_set_op_attrs(bio, fio->op, fio->op_flags); - inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page): WB_DATA_TYPE(fio->page)); @@ -742,9 +730,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) @@ -766,7 +754,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct list_head *head = &io->bio_list; struct bio_entry *be; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; @@ -790,7 +778,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, __submit_bio(sbi, *bio, DATA); break; } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (ret) { @@ -816,7 +804,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (list_empty(head)) continue; - down_read(&io->bio_list_lock); + f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -826,14 +814,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (found) break; } - up_read(&io->bio_list_lock); + f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -846,7 +834,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, break; } } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (found) @@ -875,10 +863,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); - bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -906,7 +892,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -973,7 +959,7 @@ out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, @@ -984,17 +970,17 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); - bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - bio_max_segs(nr_pages), &f2fs_bioset); + bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), + REQ_OP_READ | op_flag, + for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); - + bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); - - f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; @@ -1383,9 +1369,9 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) - down_read(&sbi->node_change); + f2fs_down_read(&sbi->node_change); else - up_read(&sbi->node_change); + f2fs_up_read(&sbi->node_change); } else { if (lock) f2fs_lock_op(sbi); @@ -2406,7 +2392,7 @@ static void f2fs_readahead(struct readahead_control *rac) if (!f2fs_is_compress_backend_ready(inode)) return; - /* If the file has inline data, skip readpages */ + /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; @@ -2460,6 +2446,9 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; + if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) @@ -2530,6 +2519,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + if (fio) { if (page_private_gcing(fio->page)) return true; @@ -2749,13 +2741,13 @@ write: * the below discard race condition. */ if (IS_NOQUOTA(inode)) - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto done; } @@ -3047,8 +3039,7 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, + f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } @@ -3154,8 +3145,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; - /* skip writing during file defragment */ - if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); @@ -3163,8 +3154,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); - else if (atomic_read(&sbi->wb_sync_req[DATA])) + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); @@ -3213,14 +3208,14 @@ void f2fs_write_failed(struct inode *inode, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3353,7 +3348,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; - if (len == PAGE_SIZE) + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, pagep, @@ -3492,17 +3487,16 @@ unlock_out: return copied; } -void f2fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_SIZE || length != PAGE_SIZE)) + (offset || length != folio_size(folio))) return; - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { @@ -3513,17 +3507,16 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } - clear_page_private_gcing(page); + clear_page_private_gcing(&folio->page); if (test_opt(sbi, COMPRESS_CACHE) && inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); + clear_page_private_data(&folio->page); - if (page_private_atomic(page)) - return f2fs_drop_inmem_page(inode, page); + if (page_private_atomic(&folio->page)) + return f2fs_drop_inmem_page(inode, &folio->page); - detach_page_private(page); - set_page_private(page, 0); + folio_detach_private(folio); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -3550,35 +3543,35 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 1; } -static int f2fs_set_data_page_dirty(struct page *page) +static bool f2fs_dirty_data_folio(struct address_space *mapping, + struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = mapping->host; - trace_f2fs_set_page_dirty(page, DATA); + trace_f2fs_set_page_dirty(&folio->page, DATA); - if (!PageUptodate(page)) - SetPageUptodate(page); - if (PageSwapCache(page)) - return __set_page_dirty_nobuffers(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + BUG_ON(folio_test_swapcache(folio)); if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(page)) { - f2fs_register_inmem_page(inode, page); - return 1; + if (!page_private_atomic(&folio->page)) { + f2fs_register_inmem_page(inode, &folio->page); + return true; } /* * Previously, this page has been registered, we just * return here. */ - return 0; + return false; } - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - f2fs_update_dirty_page(inode, page); - return 1; + if (!folio_test_dirty(folio)) { + filemap_dirty_folio(mapping, folio); + f2fs_update_dirty_folio(inode, folio); + return true; } - return 0; + return false; } @@ -3721,19 +3714,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); for (; secidx < end_sec; secidx++) { - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { struct page *page; @@ -3741,7 +3735,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } @@ -3750,22 +3744,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, f2fs_put_page(page, 1); } - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); if (ret) break; } done: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -3938,8 +3933,8 @@ const struct address_space_operations f2fs_dblock_aops = { .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, - .set_page_dirty = f2fs_set_data_page_dirty, - .invalidatepage = f2fs_invalidate_page, + .dirty_folio = f2fs_dirty_data_folio, + .invalidate_folio = f2fs_invalidate_folio, .releasepage = f2fs_release_page, .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, @@ -4044,6 +4039,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = blks_to_bytes(inode, map.m_lblk); + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { iomap->length = blks_to_bytes(inode, map.m_len); if (map.m_flags & F2FS_MAP_MAPPED) { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8c50518475a9..fcdf253cd211 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_MUTEX(f2fs_stat_mutex); +static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -338,14 +338,16 @@ static char *s_flag[] = { [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", [SBI_IS_RESIZEFS] = " resizefs", + [SBI_IS_FREEZING] = " freezefs", }; static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); @@ -474,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v) si->node_segs, si->bg_node_segs); seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Low (%d)\n", + "Urgent High (%d), Urgent Mid (%d), " + "Urgent Low (%d)\n", si->sbi->gc_reclaimed_segs[GC_NORMAL], si->sbi->gc_reclaimed_segs[GC_IDLE_CB], si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], si->sbi->gc_reclaimed_segs[GC_IDLE_AT], si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_MID], si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); @@ -532,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &si->sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", @@ -573,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -584,6 +591,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -619,9 +627,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -629,10 +637,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_del(&si->stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); kfree(si); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 166f08623362..a0e51937d92e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -766,7 +766,7 @@ add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -793,7 +793,7 @@ add_dentry: f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_put_page(dentry_page, 1); @@ -858,7 +858,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) struct page *page; int err = 0; - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -869,7 +869,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } @@ -877,7 +877,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); @@ -888,7 +888,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68b44015514f..cd1e65bcf0b0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -123,6 +123,20 @@ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. + */ + +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_queue_head_t read_waiters; +#endif +}; + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -386,6 +400,10 @@ struct discard_cmd_control { struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ @@ -561,6 +579,9 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 +/* maximum retry of EIO'ed meta page */ +#define MAX_RETRY_META_PAGE_EIO 100 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -574,6 +595,9 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ union { @@ -721,7 +745,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ + FI_SKIP_WRITES, /* should skip data page writeback */ + FI_OPU_WRITE, /* used for opu per file */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ @@ -752,7 +777,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ - struct rw_semaphore i_sem; /* protect fi info */ + struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -777,8 +802,8 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ - struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ @@ -897,6 +922,7 @@ struct f2fs_nm_info { nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ @@ -904,7 +930,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ @@ -1017,7 +1043,7 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct rw_semaphore curseg_lock; /* for preventing curseg change */ + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ @@ -1201,11 +1227,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct rw_semaphore io_rwsem; /* blocking op for bio */ + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ - struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1267,6 +1293,7 @@ enum { SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ }; enum { @@ -1286,6 +1313,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + GC_URGENT_MID, MAX_GC_MODE, }; @@ -1571,7 +1599,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct rw_semaphore sb_lock; /* lock for raw super block */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ @@ -1591,18 +1619,20 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ - struct rw_semaphore io_order_lock; + struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ + pgoff_t metapage_eio_ofs; /* EIO page offset */ + int metapage_eio_cnt; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct rw_semaphore node_write; /* locking node writes */ - struct rw_semaphore node_change; /* locking node change */ + struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -1662,12 +1692,14 @@ struct f2fs_sb_info { block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ - struct rw_semaphore quota_sem; /* blocking cp for flags */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ @@ -1678,7 +1710,7 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct rw_semaphore gc_lock; /* + struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ @@ -1698,7 +1730,7 @@ struct f2fs_sb_info { /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; - struct rw_semaphore pin_sem; + struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -2092,9 +2124,81 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) +{ + __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + init_waitqueue_head(&sem->read_waiters); +#endif +} + +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) +{ + return rwsem_is_locked(&sem->internal_rwsem); +} + +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} + +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif +} + +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) +{ + return down_read_trylock(&sem->internal_rwsem); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#endif + +static inline void f2fs_up_read(struct f2fs_rwsem *sem) +{ + up_read(&sem->internal_rwsem); +} + +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wake_up_all(&sem->read_waiters); +#endif +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + f2fs_down_read(&sbi->cp_rwsem); } static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) @@ -2103,22 +2207,22 @@ static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) f2fs_show_injection_info(sbi, FAULT_LOCK_OP); return 0; } - return down_read_trylock(&sbi->cp_rwsem); + return f2fs_down_read_trylock(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + f2fs_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + f2fs_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + f2fs_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) @@ -2681,6 +2785,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (is_inflight_io(sbi, type)) return false; + if (sbi->gc_mode == GC_URGENT_MID) + return true; + if (sbi->gc_mode == GC_URGENT_LOW && (type == DISCARD_TIME || type == GC_TIME)) return true; @@ -3579,7 +3686,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); @@ -3597,7 +3705,7 @@ void f2fs_add_orphan_inode(struct inode *inode); void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); -void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); @@ -3631,7 +3739,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio); + block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); @@ -3661,8 +3769,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, enum iostat_type io_type, int compr_blocks, bool allow_balance); void f2fs_write_failed(struct inode *inode, loff_t to); -void f2fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length); +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, @@ -4371,7 +4478,11 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int rw = iov_iter_rw(iter); - if (f2fs_post_read_required(inode)) + if (!fscrypt_dio_supported(iocb, iter)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) return true; /* disallow direct IO if any of devices has unaligned blksize */ @@ -4426,6 +4537,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c98ef6af97d..5b89af0f27f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -237,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); nid_t pino; - down_write(&fi->i_sem); + f2fs_down_write(&fi->i_sem); if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); file_got_pino(inode); } - up_write(&fi->i_sem); + f2fs_up_write(&fi->i_sem); } static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, @@ -318,9 +318,9 @@ go_write: * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&F2FS_I(inode)->i_sem); + f2fs_down_read(&F2FS_I(inode)->i_sem); cp_reason = need_do_checkpoint(inode); - up_read(&F2FS_I(inode)->i_sem); + f2fs_up_read(&F2FS_I(inode)->i_sem); if (cp_reason) { /* all the dirty node pages should be flushed for POR */ @@ -812,7 +812,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri; + struct f2fs_inode *ri = NULL; unsigned int flags; if (f2fs_has_extra_attr(inode) && @@ -844,7 +844,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -904,7 +904,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -958,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -970,7 +970,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * larger than i_size. */ filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -980,10 +980,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(&init_user_ns, inode, attr); + __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1112,7 +1112,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache_range(inode, blk_start, blk_end - 1); @@ -1122,7 +1122,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1355,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); @@ -1365,7 +1365,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1500,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, @@ -1514,7 +1514,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) { f2fs_unlock_op(sbi); filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1526,7 +1526,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1600,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); @@ -1618,7 +1618,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); } filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_invalidate_lock(mapping); @@ -1674,13 +1674,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); @@ -1690,7 +1690,7 @@ next_alloc: err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); file_dont_truncate(inode); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); expanded += map.m_len; sec_len -= map.m_len; @@ -1989,11 +1989,12 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2008,7 +2009,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - f2fs_disable_compressed_file(inode); + if (!f2fs_disable_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) @@ -2020,7 +2024,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by @@ -2031,7 +2035,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -2044,7 +2048,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; @@ -2058,9 +2062,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2100,9 +2105,10 @@ err_out: static int f2fs_ioc_start_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2135,9 +2141,10 @@ out: static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2164,9 +2171,10 @@ out: static int f2fs_ioc_abort_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2351,7 +2359,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -2370,7 +2378,7 @@ got_it: 16)) err = -EFAULT; out_err: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } @@ -2447,12 +2455,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); @@ -2483,12 +2491,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) do_more: if (!range->sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, range->sync, true, false, @@ -2559,10 +2567,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, bool fragmented = false; int err; - /* if in-place-update policy is enabled, don't waste time here */ - if (f2fs_should_update_inplace(inode, NULL)) - return -EINVAL; - pg_start = range->start >> PAGE_SHIFT; pg_end = (range->start + range->len) >> PAGE_SHIFT; @@ -2570,6 +2574,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, range->start + range->len - 1); @@ -2651,7 +2662,7 @@ do_map: goto check; } - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -2676,15 +2687,16 @@ check: if (map.m_lblk < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); out: + clear_inode_flag(inode, FI_OPU_WRITE); inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; @@ -2820,10 +2832,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) goto out_src; } @@ -2841,9 +2853,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); if (src != dst) - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); out_unlock: if (src != dst) inode_unlock(dst); @@ -2938,7 +2950,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } @@ -2990,7 +3002,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *ipage; + struct f2fs_inode *ri = NULL; kprojid_t kprojid; int err; @@ -3014,17 +3026,8 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (IS_NOQUOTA(inode)) return err; - ipage = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - - if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, - i_projid)) { - err = -EOVERFLOW; - f2fs_put_page(ipage, 1); - return err; - } - f2fs_put_page(ipage, 1); + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; err = f2fs_dquot_initialize(inode); if (err) @@ -3215,9 +3218,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -3294,11 +3297,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) if (!vbuf) return -ENOMEM; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); count = utf16s_to_utf8s(sbi->raw_super->volume_name, ARRAY_SIZE(sbi->raw_super->volume_name), UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (copy_to_user((char __user *)arg, vbuf, min(FSLABEL_MAX, count))) @@ -3326,7 +3329,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) if (err) goto out; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); memset(sbi->raw_super->volume_name, 0, sizeof(sbi->raw_super->volume_name)); @@ -3336,7 +3339,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) err = f2fs_commit_super(sbi, false); - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); out: @@ -3462,7 +3465,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3499,7 +3502,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3615,7 +3618,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) goto unlock_inode; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3652,7 +3655,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3770,7 +3773,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) if (ret) goto err; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, @@ -3859,7 +3862,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) prev_block, len, range.flags); out: filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); file_end_write(filp); @@ -4291,12 +4294,12 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) trace_f2fs_direct_IO_enter(inode, iocb, count, READ); if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { ret = -EAGAIN; goto out; } } else { - down_read(&fi->i_gc_rwsem[READ]); + f2fs_down_read(&fi->i_gc_rwsem[READ]); } /* @@ -4315,7 +4318,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = iomap_dio_complete(dio); } - up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[READ]); file_accessed(file); out: @@ -4445,7 +4448,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; if (ret > 0) { @@ -4479,10 +4482,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); const bool do_opu = f2fs_lfs_mode(sbi); - const int whint_mode = F2FS_OPTION(sbi).whint_mode; const loff_t pos = iocb->ki_pos; const ssize_t count = iov_iter_count(from); - const enum rw_hint hint = iocb->ki_hint; unsigned int dio_flags; struct iomap_dio *dio; ssize_t ret; @@ -4497,12 +4498,12 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, goto out; } - if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) { ret = -EAGAIN; goto out; } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); ret = -EAGAIN; goto out; } @@ -4511,12 +4512,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, if (ret) goto out; - down_read(&fi->i_gc_rwsem[WRITE]); + f2fs_down_read(&fi->i_gc_rwsem[WRITE]); if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); + f2fs_down_read(&fi->i_gc_rwsem[READ]); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; /* * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of @@ -4539,11 +4538,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = iomap_dio_complete(dio); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - up_read(&fi->i_gc_rwsem[WRITE]); + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); if (ret < 0) goto out; @@ -4644,12 +4641,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); if (!f2fs_truncate(inode)) file_dont_truncate(inode); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } else { file_dont_truncate(inode); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ee308a8de432..ea5b93b689cd 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -103,23 +103,26 @@ static int gc_thread_func(void *data) sbi->gc_urgent_high_remaining--; } spin_unlock(&sbi->gc_urgent_high_lock); + } + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; } if (foreground) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; - } else if (!down_write_trylock(&sbi->gc_lock)) { + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -1038,8 +1041,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } - if (f2fs_check_nid_range(sbi, dni->ino)) + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); return false; + } *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); @@ -1230,7 +1235,7 @@ static int move_data_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) - down_write(&fio.sbi->io_order_lock); + f2fs_down_write(&fio.sbi->io_order_lock); mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); @@ -1316,7 +1321,7 @@ recover_block: true, true, true); up_out: if (lfs_mode) - up_write(&fio.sbi->io_order_lock); + f2fs_up_write(&fio.sbi->io_order_lock); put_out: f2fs_put_dnode(&dn); out: @@ -1475,7 +1480,7 @@ next_step: special_file(inode->i_mode)) continue; - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); sbi->skipped_gc_rwsem++; @@ -1488,7 +1493,7 @@ next_step: if (f2fs_post_read_required(inode)) { int err = ra_data_block(inode, start_bidx); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) { iput(inode); continue; @@ -1499,7 +1504,7 @@ next_step: data_page = f2fs_get_read_data_page(inode, start_bidx, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -1518,14 +1523,14 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; continue; } - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -1548,8 +1553,8 @@ next_step: submitted++; if (locked) { - up_write(&fi->i_gc_rwsem[WRITE]); - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); @@ -1807,7 +1812,7 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); put_gc_inode(&gc_list); @@ -1936,7 +1941,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) long long block_count; int segs = secs * sbi->segs_per_sec; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); section_count = le32_to_cpu(raw_sb->section_count); segment_count = le32_to_cpu(raw_sb->segment_count); @@ -1957,7 +1962,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) cpu_to_le32(dev_segs + segs); } - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -2031,7 +2036,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!down_write_trylock(&sbi->gc_lock)) + if (!f2fs_down_write_trylock(&sbi->gc_lock)) return -EAGAIN; /* stop CP to protect MAIN_SEC in free_segment_range */ @@ -2051,15 +2056,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) out_unlock: f2fs_unlock_op(sbi); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) return err; set_sbi_flag(sbi, SBI_IS_RESIZEFS); freeze_super(sbi->sb); - down_write(&sbi->gc_lock); - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2104,8 +2109,8 @@ recover_out: spin_unlock(&sbi->stat_lock); } out_err: - up_write(&sbi->cp_global_sem); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4b5cefa3f90c..a578bf83b803 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -629,7 +629,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, } if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -658,7 +658,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); out: f2fs_put_page(ipage, 1); return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0ec8e32a00b4..71f232dcf3c2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -778,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); - sb_start_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: @@ -809,7 +810,8 @@ retry: if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } - sb_end_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); @@ -885,6 +887,7 @@ void f2fs_handle_failed_inode(struct inode *inode) err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); goto out; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5f213f05556d..5ed79b29999f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,7 +22,8 @@ #include "acl.h" #include <trace/events/f2fs.h> -static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, + struct inode *dir, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -46,7 +47,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -67,7 +68,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else - F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, F2FS_DEF_PROJID); err = fscrypt_prepare_new_inode(dir, inode, &encrypt); @@ -196,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; int i, cold_count, hot_count; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; @@ -206,7 +207,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * break; } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (i == cold_count + hot_count) return; @@ -299,19 +300,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, (!ext_cnt && !noext_cnt)) return; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { if (is_extension_exist(name, extlist[i], false)) { - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); return; } } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); for (i = 0; i < noext_cnt; i++) { if (is_extension_exist(name, noext[i], false)) { @@ -349,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -679,7 +680,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -750,7 +751,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -807,7 +808,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -834,8 +835,9 @@ out: return err; } -static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, + struct inode **whiteout) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -845,7 +847,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -909,20 +911,22 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); } -static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +static int f2fs_create_whiteout(struct user_namespace *mnt_userns, + struct inode *dir, struct inode **whiteout) { if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); + return __f2fs_tmpfile(mnt_userns, dir, NULL, + S_IFCHR | WHITEOUT_MODE, whiteout); } -static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = d_inode(old_dentry); @@ -960,7 +964,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); + err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); if (err) return err; } @@ -1023,11 +1027,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_page = NULL; new_inode->i_ctime = current_time(new_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) f2fs_add_orphan_inode(new_inode); @@ -1048,13 +1052,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, true); } - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); @@ -1107,8 +1111,7 @@ out_dir: out_old: f2fs_put_page(old_page, 0); out: - if (whiteout) - iput(whiteout); + iput(whiteout); return err; } @@ -1214,38 +1217,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, /* update directory entry info of old dir inode */ f2fs_set_link(old_dir, old_entry, old_page, new_inode); - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); if (old_nlink) { - down_write(&F2FS_I(old_dir)->i_sem); + f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); - up_write(&F2FS_I(old_dir)->i_sem); + f2fs_up_write(&F2FS_I(old_dir)->i_sem); } f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (!new_dir_entry) file_lost_pino(new_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(new_inode, old_dir->i_ino); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); new_dir->i_ctime = current_time(new_dir); if (new_nlink) { - down_write(&F2FS_I(new_dir)->i_sem); + f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); - up_write(&F2FS_I(new_dir)->i_sem); + f2fs_up_write(&F2FS_I(new_dir)->i_sem); } f2fs_mark_inode_dirty_sync(new_dir, false); @@ -1300,7 +1303,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return f2fs_rename(mnt_userns, old_dir, old_dentry, + new_dir, new_dentry, flags); } static const char *f2fs_encrypted_get_link(struct dentry *dentry, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 50b2874e758c..c45d341dcf6e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need; } @@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -431,14 +431,14 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct nat_entry *new, *e; /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ - if (rwsem_is_locked(&sbi->cp_global_sem)) + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) return; new = __alloc_nat_entry(sbi, nid, false); if (!new) return; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) e = __init_nat_entry(nm_i, new, ne, false); @@ -447,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); if (e != new) __free_nat_entry(new); } @@ -459,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = __init_nat_entry(nm_i, new, NULL, true); @@ -508,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -516,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) return 0; spin_lock(&nm_i->nat_list_lock); @@ -538,7 +538,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&nm_i->nat_list_lock); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -560,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->nid = nid; retry: /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return 0; } @@ -576,11 +576,11 @@ retry: * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. */ - if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || !down_read_trylock(&curseg->journal_rwsem)) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto retry; } @@ -589,15 +589,15 @@ retry: ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto cache; } /* Fill node_info from nat page */ index = current_nat_addr(sbi, nid); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); if (IS_ERR(page)) @@ -1609,17 +1609,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) + if (!f2fs_down_read_trylock(&sbi->node_write)) goto redirty_out; } else { - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); unlock_page(page); return 0; } @@ -1627,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto redirty_out; } @@ -1648,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); @@ -1782,6 +1782,7 @@ continue_unlock: if (!atomic || page == last_page) { set_fsync_mark(page, 1); + percpu_counter_inc(&sbi->rf_node_block_count); if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) @@ -2111,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[NODE]); - else if (atomic_read(&sbi->wb_sync_req[NODE])) + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } trace_f2fs_writepages(mapping->host, wbc, NODE); @@ -2132,23 +2137,24 @@ skip_write: return 0; } -static int f2fs_set_node_page_dirty(struct page *page) +static bool f2fs_dirty_node_folio(struct address_space *mapping, + struct folio *folio) { - trace_f2fs_set_page_dirty(page, NODE); + trace_f2fs_set_page_dirty(&folio->page, NODE); - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(page)) - f2fs_inode_chksum_set(F2FS_P_SB(page), page); + if (IS_INODE(&folio->page)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - set_page_private_reference(page); - return 1; + if (!folio_test_dirty(folio)) { + filemap_dirty_folio(mapping, folio); + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + set_page_private_reference(&folio->page); + return true; } - return 0; + return false; } /* @@ -2157,8 +2163,8 @@ static int f2fs_set_node_page_dirty(struct page *page) const struct address_space_operations f2fs_node_aops = { .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, - .set_page_dirty = f2fs_set_node_page_dirty, - .invalidatepage = f2fs_invalidate_page, + .dirty_folio = f2fs_dirty_node_folio, + .invalidate_folio = f2fs_invalidate_folio, .releasepage = f2fs_release_page, #ifdef CONFIG_MIGRATION .migratepage = f2fs_migrate_page, @@ -2225,14 +2231,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) unsigned int i; bool ret = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) { ret = false; break; } } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return ret; } @@ -2415,7 +2421,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) unsigned int i, idx; nid_t nid; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) @@ -2438,7 +2444,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) out: scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, @@ -2473,7 +2479,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); while (1) { if (!test_bit_le(NAT_BLOCK_OFFSET(nid), @@ -2488,7 +2494,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, } if (ret) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2508,7 +2514,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* find free nids from current sum_pages */ scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2953,7 +2959,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { unsigned int valid = 0, nid_ofs = 0; @@ -2973,7 +2979,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) __update_nat_bits(nm_i, nat_ofs, valid); } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3071,15 +3077,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * nat_cnt[DIRTY_NAT]. */ if (cpc->reason & CP_UMOUNT) { - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -3108,7 +3114,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ return err; @@ -3218,6 +3224,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); @@ -3228,7 +3235,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); - init_rwsem(&nm_i->nat_tree_lock); + init_f2fs_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -3334,7 +3341,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -3364,7 +3371,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); if (nm_i->free_nid_bitmap) { diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 18b98cf0465b..4c1d34bfea78 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -31,6 +31,9 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 +/* control total # of node writes used for roll-fowrad recovery */ +#define DEF_RF_NODE_BLOCKS 0 + /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 #define SETVEC_SIZE 32 diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 79773d322c47..3cb7f8a43b4d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -56,6 +56,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; + if (NM_I(sbi)->max_rf_node_blocks && + percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) + return false; return true; } @@ -343,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page) return 0; } +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % sbi->blocks_per_seg) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { @@ -350,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - valid_user_blocks(sbi); int err = 0; @@ -424,11 +442,14 @@ next: break; } + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } return err; } @@ -704,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct page *page = NULL; int err = 0; block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -715,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; - f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = f2fs_get_tmp_page(sbi, blkaddr); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -759,9 +779,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } if (!err) f2fs_allocate_new_segments(sbi); @@ -796,7 +821,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -845,7 +870,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1dabc8244083..22dfeb991529 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,8 +313,7 @@ next: skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -471,7 +470,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); @@ -483,7 +482,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } @@ -521,7 +520,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); f2fs_gc(sbi, false, false, false, NULL_SEGNO); } } @@ -529,7 +528,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { - int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); @@ -570,7 +569,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || - (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem))) + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ @@ -803,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { @@ -1156,14 +1154,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = false; dpolicy->granularity = granularity; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; @@ -1171,12 +1169,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->granularity = 1; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = - DEF_MIN_DISCARD_ISSUE_TIME; + dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; @@ -1781,7 +1779,7 @@ static int issue_discard_thread(void *data) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; - unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); @@ -2180,6 +2178,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; @@ -2821,7 +2823,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) if (!sbi->am.atgc_enabled) return; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -2831,7 +2833,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) @@ -2982,7 +2984,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -3006,7 +3008,7 @@ unlock: type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, @@ -3038,23 +3040,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { @@ -3133,7 +3135,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: @@ -3192,9 +3194,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) goto out; @@ -3431,7 +3433,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -3514,7 +3516,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, @@ -3550,7 +3552,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) - down_read(&fio->sbi->io_order_lock); + f2fs_down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); @@ -3570,7 +3572,7 @@ reallocate: f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) - up_read(&fio->sbi->io_order_lock); + f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, @@ -3705,7 +3707,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; - down_write(&SM_I(sbi)->curseg_lock); + f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ @@ -3774,7 +3776,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_write(&SM_I(sbi)->curseg_lock); + f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -4789,6 +4791,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) sanity_check_seg_type(sbi, curseg->seg_type); + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + return -EFSCORRUPTED; + } + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; @@ -5258,7 +5267,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - init_rwsem(&sm_info->curseg_lock); + init_f2fs_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { err = f2fs_create_flush_cmd_control(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0291cd55cf09..5c94caf0c0a1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -651,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. - * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode) + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -667,6 +669,7 @@ enum { F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index baefd398ec1a..ea939db18f88 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1345,8 +1345,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep, - GFP_F2FS_ZERO, false, F2FS_SB(sb)); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + return NULL; + } + + fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; @@ -1355,16 +1359,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); - init_rwsem(&fi->i_sem); + init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - init_rwsem(&fi->i_gc_rwsem[READ]); - init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_xattr_sem); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -1501,8 +1505,9 @@ static void f2fs_free_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void destroy_device_list(struct f2fs_sb_info *sbi) @@ -1662,11 +1667,15 @@ static int f2fs_freeze(struct super_block *sb) /* ensure no checkpoint required */ if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) return -EINVAL; + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } static int f2fs_unfreeze(struct super_block *sb) { + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } @@ -2075,6 +2084,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + unsigned int gc_mode; int err = 0; int ret; block_t unusable; @@ -2087,8 +2097,11 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); + gc_mode = sbi->gc_mode; + sbi->gc_mode = GC_URGENT_HIGH; + while (!f2fs_time_over(sbi, DISABLE_TIME)) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; @@ -2110,7 +2123,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -2122,8 +2135,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); restore_flag: + sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -2135,19 +2149,18 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); } @@ -2504,8 +2517,7 @@ retry: &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -2688,7 +2700,7 @@ int f2fs_quota_sync(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; - int ret; + int ret = 0; /* * Now when everything is written we can discard the pagecache so @@ -2699,26 +2711,26 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, type)) - return 0; + if (!sb_has_quota_active(sb, cnt)) + continue; inode_lock(dqopt->files[cnt]); /* * do_quotactl * f2fs_quota_sync - * down_read(quota_sem) + * f2fs_down_read(quota_sem) * dquot_writeback_dquots() * f2fs_dquot_commit * block_operation - * down_read(quota_sem) + * f2fs_down_read(quota_sem) */ f2fs_lock_op(sbi); - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); inode_unlock(dqopt->files[cnt]); @@ -2843,11 +2855,11 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -2856,11 +2868,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = dquot_acquire(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -3574,6 +3586,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; @@ -3601,14 +3614,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - init_rwsem(&sbi->io_order_lock); + init_f2fs_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - init_rwsem(&sbi->sb_lock); - init_rwsem(&sbi->pin_sem); + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -3619,11 +3632,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); if (err) - percpu_counter_destroy(&sbi->alloc_valid_block_count); + goto err_node_block; + return 0; +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); return err; } @@ -3957,7 +3979,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + 1 << F2FS_IPU_HONOR_OPU_WRITE; } sbi->readdir_ra = 1; @@ -4067,11 +4090,11 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_rwsem(&sbi->gc_lock); + init_f2fs_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); - init_rwsem(&sbi->cp_global_sem); - init_rwsem(&sbi->node_write); - init_rwsem(&sbi->node_change); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); @@ -4092,18 +4115,18 @@ try_onemore: } for (j = HOT; j < n; j++) { - init_rwsem(&sbi->write_io[i][j].io_rwsem); + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); sbi->write_io[i][j].sbi = sbi; sbi->write_io[i][j].bio = NULL; spin_lock_init(&sbi->write_io[i][j].io_lock); INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_rwsem(&sbi->write_io[i][j].bio_list_lock); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); } } - init_rwsem(&sbi->cp_rwsem); - init_rwsem(&sbi->quota_sem); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); @@ -4528,7 +4551,7 @@ static struct file_system_type f2fs_fs_type = { .name = "f2fs", .mount = f2fs_mount, .kill_sb = kill_f2fs_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("f2fs"); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8ac506671245..4c50aedd5144 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -41,6 +41,16 @@ enum { ATGC_INFO, /* struct atgc_management */ }; +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" +}; + struct f2fs_attr { struct attribute attr; ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); @@ -316,8 +326,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif + if (!strcmp(a->attr.name, "gc_urgent")) + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_mode]); + if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_segment_mode]); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -363,7 +378,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) @@ -373,7 +388,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (ret) f2fs_update_extension_list(sbi, name, hot, !set); out: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); return ret ? ret : count; } @@ -468,6 +483,13 @@ out: } } else if (t == 2) { sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } } else { return -EINVAL; } @@ -481,7 +503,7 @@ out: } else if (t == GC_IDLE_AT) { if (!sbi->am.atgc_enabled) return -EINVAL; - sbi->gc_mode = GC_AT; + sbi->gc_mode = GC_IDLE_AT; } else { sbi->gc_mode = GC_NORMAL; } @@ -716,6 +738,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); @@ -728,6 +754,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -832,6 +859,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reclaim_segments), ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), + ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), @@ -847,6 +878,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index fe5acdccaae1..3d793202cc9f 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -208,7 +208,7 @@ cleanup: * from re-instantiating cached pages we are truncating (since unlike * normal file accesses, garbage collection isn't limited by i_size). */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); truncate_inode_pages(inode->i_mapping, inode->i_size); err2 = f2fs_truncate(inode); if (err2) { @@ -216,7 +216,7 @@ cleanup: err2); set_sbi_flag(sbi, SBI_NEED_FSCK); } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); return err ?: err2; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8e5cd9c916ff..c76c15086e5f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -525,10 +525,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr, &base_size, &is_inline); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -562,9 +562,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error; size_t rest = buffer_size; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -786,9 +786,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - down_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - up_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c4a274285858..249825017da7 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ if (name_len >= sizeof(d1->d_name)) \ name_len = sizeof(d1->d_name) - 1; \ \ - if (put_user(0, d2->d_name) || \ + if (put_user(0, &d2->d_name[0]) || \ put_user(0, &d2->d_reclen) || \ copy_to_user(d1->d_name, name, name_len) || \ put_user(0, d1->d_name + name_len) || \ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a6f1c6d426d1..bf6051bdf1d1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -342,7 +342,8 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations fat_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = fat_readpage, .readahead = fat_readahead, .writepage = fat_writepage, @@ -745,7 +746,7 @@ static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/fcntl.c b/fs/fcntl.c index 9c6c6a3e2de5..f15d885b9796 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd, u64 h; switch (cmd) { - case F_GET_FILE_RW_HINT: - h = file_write_hint(file); - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_FILE_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; - - spin_lock(&file->f_lock); - file->f_write_hint = hint; - spin_unlock(&file->f_lock); - return 0; case F_GET_RW_HINT: h = inode->i_write_hint; if (copy_to_user(argp, &h, sizeof(*argp))) @@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_GET_RW_HINT: case F_SET_RW_HINT: - case F_GET_FILE_RW_HINT: - case F_SET_FILE_RW_HINT: err = fcntl_rw_hint(filp, cmd, arg); break; default: diff --git a/fs/file.c b/fs/file.c index 97d212a9b814..ee9317346702 100644 --- a/fs/file.c +++ b/fs/file.c @@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } +/* + * Note how the fdtable bitmap allocations very much have to be a multiple of + * BITS_PER_LONG. This is not only because we walk those things in chunks of + * 'unsigned long' in some places, but simply because that is how the Linux + * kernel bitmaps are defined to work: they are not "bits in an array of bytes", + * they are very much "bits in an array of unsigned long". + * + * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied + * by that "1024/sizeof(ptr)" before, we already know there are sufficient + * clear low bits. Clang seems to realize that, gcc ends up being confused. + * + * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, + * let's consider it documentation (and maybe a test-case for gcc to improve + * its code generation ;) + */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; @@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); + nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal @@ -269,6 +285,19 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +/* + * Note that a sane fdtable size always has to be a multiple of + * BITS_PER_LONG, since we have bitmaps that are sized by this. + * + * 'max_fds' will normally already be properly aligned, but it + * turns out that in the close_range() -> __close_range() -> + * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end + * up having a 'max_fds' value that isn't already aligned. + * + * Rather than make close_range() have to worry about this, + * just make that BITS_PER_LONG alignment be part of a sane + * fdtable size. Becuase that's really what it is. + */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; @@ -276,7 +305,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) count = count_open_files(fdt); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; - return min(count, max_fds); + return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* diff --git a/fs/file_table.c b/fs/file_table.c index 7d2e692b66a9..ada8fe814db9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -412,6 +412,7 @@ void __fput_sync(struct file *file) } EXPORT_SYMBOL(fput); +EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 578a5062706e..22eed5a73ac2 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb) { struct vxfs_inode_info *vi; - vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL); + vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; inode_init_once(&vi->vfs_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8d7fe6db989..591fe9cf1659 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -894,43 +894,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** - * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion (may be NULL) - * @cong_bits: mask of WB_[a]sync_congested bits to test - * - * Tests whether @inode is congested. @cong_bits is the mask of congestion - * bits to test and the return value is the mask of set bits. - * - * If cgroup writeback is enabled for @inode, the congestion state is - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg - * associated with @inode is congested; otherwise, the root wb's congestion - * state is used. - * - * @inode is allowed to be NULL as this function is often called on - * mapping->host which is NULL for the swapper space. - */ -int inode_congested(struct inode *inode, int cong_bits) -{ - /* - * Once set, ->i_wb never becomes NULL while the inode is alive. - * Start transaction iff ->i_wb is visible. - */ - if (inode && inode_to_wb_is_valid(inode)) { - struct bdi_writeback *wb; - struct wb_lock_cookie lock_cookie = {}; - bool congested; - - wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); - congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, &lock_cookie); - return congested; - } - - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} -EXPORT_SYMBOL_GPL(inode_congested); - -/** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi @@ -1903,8 +1866,7 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2234,7 +2196,6 @@ void wb_workfn(struct work_struct *work) long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); - current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { @@ -2263,8 +2224,6 @@ void wb_workfn(struct work_struct *work) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); - - current->flags &= ~PF_SWAPWRITE; } /* @@ -2301,8 +2260,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (blk_needs_flush_plug(current)) - blk_flush_plug(current->plug, true); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 76316c4a3fb7..b313a978ae0a 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -38,6 +38,3 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_OLD_API - bool diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 2749933852a9..d645f8b302a2 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -214,7 +214,7 @@ void fscache_relinquish_cache(struct fscache_cache *cache) cache->ops = NULL; cache->cache_priv = NULL; - smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT); + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT); fscache_put_cache(cache, where); } EXPORT_SYMBOL(fscache_relinquish_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9bb1ab5fe5ed..9d3cf0111709 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; -unsigned int fscache_lru_cookie_timeout = 10 * HZ; +static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { @@ -1069,6 +1069,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, } EXPORT_SYMBOL(__fscache_invalidate); +#ifdef CONFIG_PROC_FS /* * Generate a list of extant cookies in /proc/fs/fscache/cookies */ @@ -1145,3 +1146,4 @@ const struct seq_operations fscache_cookies_seq_ops = { .stop = fscache_cookies_seq_stop, .show = fscache_cookies_seq_show, }; +#endif diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f121c21590dc..1336f517e9b1 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -56,7 +56,9 @@ static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_cookies_seq_ops; +#endif extern struct timer_list fscache_cookie_lru_timer; extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); @@ -71,17 +73,6 @@ static inline void fscache_see_cookie(struct fscache_cookie *cookie, } /* - * io.c - */ -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - -/* * main.c */ extern unsigned fscache_debug; @@ -148,7 +139,9 @@ int fscache_stats_show(struct seq_file *m, void *v); /* * volume.c */ +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_volumes_seq_ops; +#endif struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, enum fscache_volume_trace where); diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 7a769ea57720..3af3b08a9bb3 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -159,27 +159,29 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres, EXPORT_SYMBOL(__fscache_begin_write_operation); /** - * fscache_set_page_dirty - Mark page dirty and pin a cache object for writeback - * @page: The page being dirtied + * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. + * @folio: The folio being dirtied. * @cookie: The cookie referring to the cache object * - * Set the dirty flag on a page and pin an in-use cache object in memory when - * dirtying a page so that writeback can later write to it. This is intended - * to be called from the filesystem's ->set_page_dirty() method. + * Set the dirty flag on a folio and pin an in-use cache object in memory + * so that writeback can later write to it. This is intended + * to be called from the filesystem's ->dirty_folio() method. * - * Returns 1 if PG_dirty was set on the page, 0 otherwise. + * Return: true if the dirty flag was set on the folio, false otherwise. */ -int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie) +bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, + struct fscache_cookie *cookie) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; bool need_use = false; _enter(""); - if (!__set_page_dirty_nobuffers(page)) - return 0; + if (!filemap_dirty_folio(mapping, folio)) + return false; if (!fscache_cookie_valid(cookie)) - return 1; + return true; if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { spin_lock(&inode->i_lock); @@ -192,9 +194,9 @@ int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie) if (need_use) fscache_use_cookie(cookie, true); } - return 1; + return true; } -EXPORT_SYMBOL(fscache_set_page_dirty); +EXPORT_SYMBOL(fscache_dirty_folio); struct fscache_write_request { struct netfs_cache_resources cache_resources; @@ -233,8 +235,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv; - fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources), - wreq->mapping, wreq->start, wreq->len, + fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, wreq->set_bits); if (wreq->term_func) @@ -294,7 +295,7 @@ abandon_end: abandon_free: kfree(wreq); abandon: - fscache_clear_page_bits(cookie, mapping, start, len, cond); + fscache_clear_page_bits(mapping, start, len, cond); if (term_func) term_func(term_func_priv, ret, false); } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 000d2e5627e9..7cede9a3bc96 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; - struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - - /* - * Get any fuse_mount belonging to this fuse_conn; s_bdi is - * shared between all of them - */ - - if (!list_empty(&fc->mounts)) { - fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); up_read(&fc->killsb); fuse_conn_put(fc); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 182b24a14804..d7d3a7f06862 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1326,8 +1326,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) static const struct address_space_operations fuse_dax_file_aops = { .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, + .dirty_folio = noop_dirty_folio, }; static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460d..0e537e580dc1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fm->sb) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fm->sb) { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; @@ -941,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 656e921f3506..9ff27b8a9782 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1773,7 +1773,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* * Only call invalidate_inode_pages2() after removing - * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. + * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock. */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 829094451774..f18d14d5fea1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -966,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac) struct fuse_io_args *ia; struct fuse_args_pages *ap; + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + break; + nr_pages = readahead_count(rac) - nr_pages; if (nr_pages > max_pages) nr_pages = max_pages; @@ -1413,6 +1421,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else @@ -1958,6 +1967,7 @@ err: static int fuse_writepage(struct page *page, struct writeback_control *wbc) { + struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; if (fuse_page_is_writeback(page->mapping->host, page->index)) { @@ -1973,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) return 0; } + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return AOP_WRITEPAGE_ACTIVATE; + err = fuse_writepage_locked(page); unlock_page(page); @@ -2226,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping, if (fuse_is_bad(inode)) goto out; + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; + data.inode = inode; data.wpa = NULL; data.ff = NULL; @@ -2330,17 +2348,17 @@ unlock: return copied; } -static int fuse_launder_page(struct page *page) +static int fuse_launder_folio(struct folio *folio) { int err = 0; - if (clear_page_dirty_for_io(page)) { - struct inode *inode = page->mapping->host; + if (folio_clear_dirty_for_io(folio)) { + struct inode *inode = folio->mapping->host; /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, page->index); - err = fuse_writepage_locked(page); + fuse_wait_on_page_writeback(inode, folio->index); + err = fuse_writepage_locked(&folio->page); if (!err) - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_page_writeback(inode, folio->index); } return err; } @@ -3161,8 +3179,8 @@ static const struct address_space_operations fuse_file_aops = { .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, - .launder_page = fuse_launder_page, - .set_page_dirty = __set_page_dirty_nobuffers, + .launder_folio = fuse_launder_folio, + .dirty_folio = filemap_dirty_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e59fbdefeb..488b460e046f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,6 +256,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; @@ -626,7 +627,7 @@ struct fuse_conn { /** Connection successful. Only set in INIT */ unsigned conn_init:1; - /** Do readpages asynchronously? Only set in INIT */ + /** Do readahead asynchronously? Only set in INIT */ unsigned async_read:1; /** Return an unique read error after abort. Only set in INIT */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ee846ce371d8..8c0665c5dff8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -23,6 +23,7 @@ #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/pid_namespace.h> +#include <uapi/linux/magic.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ @@ -73,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; - fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); if (!fi) return NULL; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fbc09dab1f85..33cde4bbccdc 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -170,7 +170,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, #else if (flags & FUSE_IOCTL_COMPAT) { inarg.flags |= FUSE_IOCTL_32BIT; -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) inarg.flags |= FUSE_IOCTL_COMPAT_X32; #endif @@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].value = ptr; err = fuse_simple_request(fm, &args); - if (!err && outarg.flags & FUSE_IOCTL_RETRY) - err = -EIO; - + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 9d737904d07c..86b7dbb6a0d4 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -8,6 +8,7 @@ #include <linux/dax.h> #include <linux/pci.h> #include <linux/pfn_t.h> +#include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 005e920f5d4a..72c9f31ce724 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -606,18 +606,12 @@ out: gfs2_trans_end(sdp); } -/** - * jdata_set_page_dirty - Page dirtying function - * @page: The page to dirty - * - * Returns: 1 if it dirtyed the page, or 0 otherwise - */ - -static int jdata_set_page_dirty(struct page *page) +static bool jdata_dirty_folio(struct address_space *mapping, + struct folio *folio) { if (current->journal_info) - SetPageChecked(page); - return __set_page_dirty_buffers(page); + folio_set_checked(folio); + return block_dirty_folio(mapping, folio); } /** @@ -672,22 +666,23 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) unlock_buffer(bh); } -static void gfs2_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void gfs2_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - unsigned int stop = offset + length; - int partial_page = (offset || length < PAGE_SIZE); + struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host); + size_t stop = offset + length; + int partial_page = (offset || length < folio_size(folio)); struct buffer_head *bh, *head; unsigned long pos = 0; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); if (!partial_page) - ClearPageChecked(page); - if (!page_has_buffers(page)) + folio_clear_checked(folio); + head = folio_buffers(folio); + if (!head) goto out; - bh = head = page_buffers(page); + bh = head; do { if (pos + bh->b_size > stop) return; @@ -699,7 +694,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset, } while (bh != head); out: if (!partial_page) - try_to_release_page(page, 0); + filemap_release_folio(folio, 0); } /** @@ -779,9 +774,9 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .releasepage = iomap_releasepage, - .invalidatepage = iomap_invalidatepage, + .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, @@ -794,9 +789,9 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, - .set_page_dirty = jdata_set_page_dirty, + .dirty_folio = jdata_dirty_folio, .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, + .invalidate_folio = gfs2_invalidate_folio, .releasepage = gfs2_releasepage, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d67108489148..39080b2d6cf8 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -606,9 +606,9 @@ out: return ret; } -static inline __be64 *gfs2_indirect_init(struct metapath *mp, - struct gfs2_glock *gl, unsigned int i, - unsigned offset, u64 bn) +static inline void gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) { __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + ((i > 1) ? sizeof(struct gfs2_meta_header) : @@ -621,7 +621,6 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; *ptr = cpu_to_be64(bn); - return ptr; } enum alloc_state { @@ -2146,7 +2145,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rs_delete(ip, NULL); + gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 8c39a8571b1f..22b41acfbbc3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -706,7 +706,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; @@ -775,8 +775,7 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, size_t *window_size) { size_t count = iov_iter_count(i); - char __user *p; - int pages = 1; + size_t size, offs; if (likely(!count)) return false; @@ -785,18 +784,20 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (!iter_is_iovec(i)) return false; + size = PAGE_SIZE; + offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); if (*prev_count != count || !*window_size) { - int pages, nr_dirtied; + size_t nr_dirtied; - pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE)); + size = ALIGN(offs + count, PAGE_SIZE); + size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - - current->nr_dirtied, 1); - pages = min(pages, nr_dirtied); + current->nr_dirtied, 8); + size = min(size, nr_dirtied << PAGE_SHIFT); } *prev_count = count; - p = i->iov[0].iov_base + i->iov_offset; - *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); + *window_size = size - offs; return true; } @@ -851,9 +852,9 @@ retry_under_glock: leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } if (gfs2_holder_queued(gh)) @@ -920,9 +921,9 @@ retry_under_glock: leftover = fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } out: @@ -950,20 +951,19 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and retry. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = gfs2_file_direct_read(iocb, to, &gh); - if (likely(ret != -ENOTBLK)) - return ret; - iocb->ki_flags &= ~IOCB_DIRECT; - } + if (iocb->ki_flags & IOCB_DIRECT) + return gfs2_file_direct_read(iocb, to, &gh); + + pagefault_disable(); iocb->ki_flags |= IOCB_NOIO; ret = generic_file_read_iter(iocb, to); iocb->ki_flags &= ~IOCB_NOIO; + pagefault_enable(); if (ret >= 0) { if (!iov_iter_count(to)) return ret; written = ret; - } else { + } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; if (iocb->ki_flags & IOCB_NOWAIT) @@ -989,12 +989,11 @@ retry_under_glock: leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); if (leftover != window_size) { - if (!gfs2_holder_queued(&gh)) { - if (written) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(&gh)) + goto retry_under_glock; + if (written) + goto out_uninit; + goto retry; } } if (gfs2_holder_queued(&gh)) @@ -1068,12 +1067,11 @@ retry_under_glock: gfs2_holder_disallow_demote(gh); if (leftover != window_size) { from->count = min(from->count, window_size - leftover); - if (!gfs2_holder_queued(gh)) { - if (read) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + if (read && !(iocb->ki_flags & IOCB_DIRECT)) + goto out_uninit; + goto retry; } } out_unlock: @@ -1083,6 +1081,7 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); + from->count = orig_count - read; return read ? read : ret; } @@ -1497,7 +1496,6 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (error != GLR_TRYFAILED) break; fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; - fl_gh->gh_error = 0; msleep(sleeptime); } if (error) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6b23399eaee0..630c6550eacf 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -542,7 +542,7 @@ restart: * some reason. If this holder is the head of the list, it * means we have a blocked holder at the head, so return 1. */ - if (gh->gh_list.prev == &gl->gl_holders) + if (list_is_first(&gh->gh_list, &gl->gl_holders)) return 1; do_error(gl, 0); break; @@ -669,6 +669,8 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) /* Check for state != intended state */ if (unlikely(state != gl->gl_target)) { + if (gh && (ret & LM_OUT_CANCELED)) + gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { @@ -1259,7 +1261,6 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; - gh->gh_error = 0; gh->gh_iflags = 0; gfs2_glock_hold(gl); } @@ -1565,6 +1566,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); + gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && @@ -1691,6 +1693,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; spin_lock(&gl->gl_lockref.lock); + if (list_is_first(&gh->gh_list, &gl->gl_holders) && + !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + spin_unlock(&gl->gl_lockref.lock); + gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + __gfs2_glock_dq(gh); spin_unlock(&gl->gl_lockref.lock); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 89905f4f29bb..c8ec876f33ea 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -131,7 +131,21 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, + &ip->i_gl); + if (unlikely(error)) + goto fail; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, + &io_gl); + if (unlikely(error)) + goto fail; + + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + &ip->i_iopen_gh); + gfs2_glock_put(io_gl); if (unlikely(error)) goto fail; @@ -161,16 +175,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail; - if (blktype != GFS2_BLKST_UNLINKED) - gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - gfs2_glock_put(io_gl); - if (unlikely(error)) - goto fail; - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; @@ -716,13 +720,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock3; + error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_gunlock3; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -731,10 +739,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, init_dinode(dip, ip, symname); gfs2_trans_end(sdp); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - if (error) - goto fail_gunlock2; - glock_set_object(ip->i_gl, ip); glock_set_object(io_gl, ip); gfs2_set_iop(inode); @@ -745,14 +749,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(default_acl); default_acl = NULL; } if (acl) { error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(acl); acl = NULL; } @@ -760,11 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) - goto fail_gunlock3; + goto fail_gunlock4; error = link_dinode(dip, name, ip, &da); if (error) - goto fail_gunlock3; + goto fail_gunlock4; mark_inode_dirty(inode); d_instantiate(dentry, inode); @@ -782,9 +786,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); return error; -fail_gunlock3: +fail_gunlock4: glock_clear_object(ip->i_gl, ip); glock_clear_object(io_gl, ip); +fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: gfs2_glock_put(io_gl); @@ -793,7 +798,7 @@ fail_free_inode: if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 50578f881e6d..2559a79cf14b 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -261,6 +261,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, int req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; + int error; req = make_mode(gl->gl_name.ln_sbd, req_state); lkf = make_flags(gl, flags, req); @@ -279,8 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, * Submit the actual lock request. */ - return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, +again: + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EBUSY) { + msleep(20); + goto again; + } + return error; } static void gdlm_put_lock(struct gfs2_glock *gl) @@ -312,8 +319,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } +again: error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); + if (error == -EBUSY) { + msleep(20); + goto again; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ca0bb3a73912..6ba51cbb94cf 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; - bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -489,11 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) { struct bio *new; - new = bio_alloc(GFP_NOIO, nr_iovecs); - bio_copy_dev(new, prev); + new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; bio_chain(new, prev); submit_bio(prev); return new; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 72d30a682ece..d8bd1d48bd78 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -89,13 +89,15 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } const struct address_space_operations gfs2_meta_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; const struct address_space_operations gfs2_rgrp_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; @@ -222,9 +224,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], struct buffer_head *bh = *bhs; struct bio *bio; - bio = bio_alloc(GFP_NOIO, num); + bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { @@ -235,7 +236,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], num--; } bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); } } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7f8410d8fdc1..c9b423c874a3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) ClearPageDirty(page); lock_page(page); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0fb3c01bc557..801ad9f4f2be 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation - * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) + if (atomic_read(&inode->i_writecount) <= 1) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); } @@ -922,15 +923,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock_init(&rgd->rd_rsspin); mutex_init(&rgd->rd_mutex); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_PREFERRED; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -944,6 +945,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: @@ -1415,7 +1417,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); - minlen = max_t(u64, r.minlen, + minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); + minlen = max_t(u64, minlen, q->limits.discard_granularity) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3e2ca1fb4305..46dd94e9e085 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 64c67090f503..bdb773e5c88f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1396,7 +1396,7 @@ out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); @@ -1425,7 +1425,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) { struct gfs2_inode *ip; - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL); if (!ip) return NULL; ip->i_flags = 0; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a6002b2d146d..d87ea98cf535 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -15,7 +15,7 @@ #include <linux/kobject.h> #include <linux/uaccess.h> #include <linux/gfs2_ondisk.h> -#include <linux/genhd.h> +#include <linux/blkdev.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 2a5143246282..55f45e9b4930 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -159,7 +159,8 @@ static int hfs_writepages(struct address_space *mapping, } const struct address_space_operations hfs_btree_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, @@ -169,7 +170,8 @@ const struct address_space_operations hfs_btree_aops = { }; const struct address_space_operations hfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 5beb82652435..8082eb01127c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -9,7 +9,7 @@ */ #include <linux/cdrom.h> -#include <linux/genhd.h> +#include <linux/blkdev.h> #include <linux/nls.h> #include <linux/slab.h> diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 12d9bae39363..6764afa98a6f 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) { struct hfs_inode_info *i; - i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d08a8d1d40a4..446a816aa8e1 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -156,7 +156,8 @@ static int hfsplus_writepages(struct address_space *mapping, } const struct address_space_operations hfsplus_btree_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, @@ -166,7 +167,8 @@ const struct address_space_operations hfsplus_btree_aops = { }; const struct address_space_operations hfsplus_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b9e3db3f855f..8479add998b5 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) { struct hfsplus_inode_info *i; - i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 51ae6f1eb4a5..0b8ad6586df5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/cdrom.h> -#include <linux/genhd.h> #include <asm/unaligned.h> #include "hfsplus_fs.h" @@ -64,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, sb->s_bdev); - bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) *data = (u8 *)buf + offset; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ef481c3d9019..14f9ac973a2e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -14,6 +14,7 @@ #include <linux/statfs.h> #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/writeback.h> #include <linux/mount.h> #include <linux/namei.h> #include "hostfs.h" @@ -222,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT); + hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; @@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, static const struct address_space_operations hostfs_aops = { .writepage = hostfs_writepage, .readpage = hostfs_readpage, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .write_begin = hostfs_write_begin, .write_end = hostfs_write_end, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index fb37f57130aa..99493a23c5d0 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -245,7 +245,8 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } const struct address_space_operations hpfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = hpfs_readpage, .writepage = hpfs_writepage, .readahead = hpfs_readahead, diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a7dbfc892022..1cb89595b875 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7c6c7498be0..dd3a088db11d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -1110,7 +1111,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; - p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); + p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; @@ -1144,7 +1145,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, .migratepage = hugetlbfs_migrate_page, .error_remove_page = hugetlbfs_error_remove_page, }; diff --git a/fs/inode.c b/fs/inode.c index 63324df6fa27..9d9b422504d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb) if (ops->alloc_inode) inode = ops->alloc_inode(sb); else - inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; diff --git a/fs/internal.h b/fs/internal.h index 8590c973c2f4..08503dc68d2b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -74,7 +74,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); -extern int finish_automount(struct vfsmount *, struct path *); +extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); @@ -158,11 +158,6 @@ extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); /* - * read_write.c - */ -extern int rw_verify_area(int, struct file *, const loff_t *, size_t); - -/* * pipe.c */ extern const struct file_operations pipefifo_fops; @@ -184,7 +179,9 @@ int sb_init_dio_done_wq(struct super_block *sb); /* * fs/stat.c: */ -int do_statx(int dfd, const char __user *filename, unsigned flags, + +int getname_statx_lookup_flags(int flags); +int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* diff --git a/fs/io-wq.c b/fs/io-wq.c index bb7f161bb19c..32aeb2c581c5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,7 +13,7 @@ #include <linux/slab.h> #include <linux/rculist_nulls.h> #include <linux/cpu.h> -#include <linux/tracehook.h> +#include <linux/task_work.h> #include <linux/audit.h> #include <uapi/linux/io_uring.h> @@ -76,6 +76,7 @@ struct io_wqe_acct { unsigned max_workers; int index; atomic_t nr_running; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; }; @@ -91,7 +92,7 @@ enum { */ struct io_wqe { raw_spinlock_t lock; - struct io_wqe_acct acct[2]; + struct io_wqe_acct acct[IO_WQ_ACCT_NR]; int node; @@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker) if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - preempt_disable(); + raw_spin_unlock(&wqe->lock); io_wqe_dec_running(worker); worker->flags = 0; + preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker) static inline bool io_acct_run_queue(struct io_wqe_acct *acct) { + bool ret = false; + + raw_spin_lock(&acct->lock); if (!wq_list_empty(&acct->work_list) && !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - return true; - return false; + ret = true; + raw_spin_unlock(&acct->lock); + + return ret; } /* @@ -385,7 +391,6 @@ fail: } static void io_wqe_dec_running(struct io_worker *worker) - __must_hold(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker) if (!(worker->flags & IO_WORKER_F_UP)) return; - if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - raw_spin_unlock(&wqe->lock); - io_queue_worker_create(worker, acct, create_worker_cb); - raw_spin_lock(&wqe->lock); - } + if (!atomic_dec_and_test(&acct->nr_running)) + return; + if (!io_acct_run_queue(acct)) + return; + + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + io_queue_worker_create(worker, acct, create_worker_cb); } /* @@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker) * it's currently on the freelist */ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; + raw_spin_lock(&wqe->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); + raw_spin_unlock(&wqe->lock); } } @@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, struct io_worker *worker) - __must_hold(wqe->lock) + __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; @@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, * work being added and clearing the stalled bit. */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); unstalled = io_wait_on_hash(wqe, stall_hash); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); if (wq_has_sleeper(&wqe->wq->hash->wait)) @@ -515,7 +522,9 @@ static bool io_flush_signals(void) { if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { __set_current_state(TASK_RUNNING); - tracehook_notify_signal(); + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); return true; } return false; @@ -538,7 +547,6 @@ static void io_assign_current_work(struct io_worker *worker, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) - __releases(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -555,7 +563,9 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ + raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); + raw_spin_unlock(&acct->lock); if (work) { __io_worker_busy(wqe, worker); @@ -569,10 +579,9 @@ static void io_worker_handle_work(struct io_worker *worker) raw_spin_lock(&worker->lock); worker->next_work = work; raw_spin_unlock(&worker->lock); - } - raw_spin_unlock(&wqe->lock); - if (!work) + } else { break; + } io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -608,8 +617,6 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); - - raw_spin_lock(&wqe->lock); } while (1); } @@ -633,12 +640,10 @@ static int io_wqe_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); -loop: - raw_spin_lock(&wqe->lock); - if (io_acct_run_queue(acct)) { + while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - goto loop; - } + + raw_spin_lock(&wqe->lock); /* timed out, exit unless we're the last worker */ if (last_timeout && acct->nr_workers > 1) { acct->nr_workers--; @@ -662,10 +667,8 @@ loop: last_timeout = !ret; } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock(&wqe->lock); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) io_worker_handle_work(worker); - } audit_free(current); io_worker_exit(worker); @@ -705,10 +708,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - - raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock(&worker->wqe->lock); } static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, @@ -778,10 +778,12 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; + raw_spin_unlock(&wqe->lock); while (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + ; + } else { + raw_spin_unlock(&wqe->lock); } - raw_spin_unlock(&wqe->lock); io_worker_ref_put(wqe->wq); kfree(worker); return; @@ -914,6 +916,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -927,10 +930,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); io_wqe_insert_work(wqe, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); + raw_spin_unlock(&acct->lock); + raw_spin_lock(&wqe->lock); rcu_read_lock(); do_create = !io_wqe_activate_free_worker(wqe, acct); rcu_read_unlock(); @@ -946,18 +951,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; raw_spin_lock(&wqe->lock); - /* fatal condition, failed to create the first worker */ - if (!acct->nr_workers) { - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_item, - .data = work, - .cancel_all = false, - }; - - if (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + if (acct->nr_workers) { + raw_spin_unlock(&wqe->lock); + return; } raw_spin_unlock(&wqe->lock); + + /* fatal condition, failed to create the first worker */ + match.fn = io_wq_work_match_item, + match.data = work, + match.cancel_all = false, + + io_acct_cancel_pending_work(wqe, acct, &match); } } @@ -1032,22 +1037,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match) - __releases(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; + raw_spin_lock(&acct->lock); wq_list_for_each(node, prev, &acct->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); io_run_cancel(work, wqe); match->nr_pending++; /* not safe to continue after unlock */ return true; } + raw_spin_unlock(&acct->lock); return false; } @@ -1061,7 +1067,6 @@ retry: struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { - raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; break; @@ -1103,13 +1108,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) { - raw_spin_unlock(&wqe->lock); + if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - } + raw_spin_lock(&wqe->lock); io_wqe_cancel_running_work(wqe, &match); raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) @@ -1190,6 +1193,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) acct->index = i; atomic_set(&acct->nr_running, 0); INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } wqe->wq = wq; raw_spin_lock_init(&wqe->lock); @@ -1282,9 +1286,7 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } @@ -1376,7 +1378,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); - for (i = 0; i < 2; i++) { + for (i = 0; i < IO_WQ_ACCT_NR; i++) { if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) new_count[i] = task_rlimit(current, RLIMIT_NPROC); } diff --git a/fs/io_uring.c b/fs/io_uring.c index 77b9c7e4793b..4479013854d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,7 +78,6 @@ #include <linux/task_work.h> #include <linux/pagemap.h> #include <linux/io_uring.h> -#include <linux/tracehook.h> #include <linux/audit.h> #include <linux/security.h> @@ -112,8 +111,7 @@ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -263,11 +261,18 @@ struct io_rsrc_data { bool quiesce; }; +struct io_buffer_list { + struct list_head list; + struct list_head buf_list; + __u16 bgid; +}; + struct io_buffer { struct list_head list; __u64 addr; __u32 len; __u16 bid; + __u16 bgid; }; struct io_restriction { @@ -326,6 +331,14 @@ struct io_submit_state { struct blk_plug plug; }; +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +#define IO_BUFFERS_HASH_BITS 5 + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -335,11 +348,11 @@ struct io_ring_ctx { unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; - unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -378,7 +391,9 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct xarray io_buffers; + struct list_head *io_buffers; + struct list_head io_buffers_cache; + struct list_head apoll_cache; struct xarray personalities; u32 pers_next; unsigned sq_thread_idle; @@ -399,7 +414,7 @@ struct io_ring_ctx { struct { unsigned cached_cq_tail; unsigned cq_entries; - struct eventfd_ctx *cq_ev_fd; + struct io_ev_fd __rcu *io_ev_fd; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -421,6 +436,8 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; + + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; struct io_restriction restrictions; @@ -436,6 +453,8 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; }; /* Keep this last, we don't need it for the fast path */ @@ -461,6 +480,11 @@ struct io_ring_ctx { }; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -469,13 +493,13 @@ struct io_uring_task { const struct io_ring_ctx *last; struct io_wq *io_wq; struct percpu_counter inflight; - atomic_t inflight_tracked; atomic_t in_idle; spinlock_t task_lock; struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_rings; bool task_running; }; @@ -560,7 +584,8 @@ struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; u64 addr; - u64 len; + u32 len; + u32 flags; }; struct io_connect { @@ -579,6 +604,7 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; + size_t done_io; }; struct io_open { @@ -621,10 +647,10 @@ struct io_epoll { struct io_splice { struct file *file_out; - struct file *file_in; loff_t off_out; loff_t off_in; u64 len; + int splice_fd_in; unsigned int flags; }; @@ -642,7 +668,7 @@ struct io_statx { int dfd; unsigned int mask; unsigned int flags; - const char __user *filename; + struct filename *filename; struct statx __user *buffer; }; @@ -690,6 +716,12 @@ struct io_hardlink { int flags; }; +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -741,6 +773,9 @@ enum { REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -799,6 +834,12 @@ enum { REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), }; struct async_poll { @@ -825,7 +866,7 @@ enum { * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, - * or directly as just 'ki_filp' in this struct. + * or directly as just 'file' in this struct. */ struct io_kiocb { union { @@ -855,6 +896,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_msg msg; }; u8 opcode; @@ -865,7 +907,11 @@ struct io_kiocb { u64 user_data; u32 result; - u32 cflags; + /* fd initially, then cflags for completion */ + union { + u32 cflags; + int fd; + }; struct io_ring_ctx *ctx; struct task_struct *task; @@ -874,10 +920,14 @@ struct io_kiocb { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + int apoll_events; + }; atomic_t refs; - struct io_kiocb *link; + atomic_t poll_refs; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -885,12 +935,13 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - struct io_wq_work work; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - atomic_t poll_refs; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ + const struct cred *creds; + struct io_wq_work work; }; struct io_tctx_node { @@ -917,6 +968,7 @@ struct io_op_def { /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; + unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; /* do prep async if is going to be punted */ @@ -1011,6 +1063,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .poll_exclusive = 1, }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1105,6 +1158,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1127,8 +1183,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed); +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); +static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static void io_drop_inflight_file(struct io_kiocb *req); +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1141,6 +1200,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); +static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -1257,75 +1317,127 @@ static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) } static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx) + struct io_ring_ctx *ctx, + unsigned int issue_flags) { if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + lockdep_assert_held(&ctx->uring_lock); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); + } else { + percpu_ref_get(req->fixed_rsrc_refs); + } } } -static unsigned int __io_put_kbuf(struct io_kiocb *req) +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { struct io_buffer *kbuf = req->kbuf; unsigned int cflags; - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; + cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(kbuf); + list_add(&kbuf->list, list); req->kbuf = NULL; return cflags; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req) +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { + lockdep_assert_held(&req->ctx->completion_lock); + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf(req); + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } -static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) { - bool got = percpu_ref_tryget(ref); + unsigned int cflags; + + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; - /* already at zero, wait for ->release() */ - if (!got) - wait_for_completion(compl); - percpu_ref_resurrect(ref); - if (got) - percpu_ref_put(ref); + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. + */ + if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) +static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - struct io_kiocb *req; + struct list_head *hash_list; + struct io_buffer_list *bl; - if (task && head->task != task) - return false; - if (cancel_all) - return true; + hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + list_for_each_entry(bl, hash_list, list) + if (bl->bgid == bgid || bgid == -1U) + return bl; - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; + return NULL; } -static bool io_match_linked(struct io_kiocb *head) +static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { - struct io_kiocb *req; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + struct io_buffer *buf; - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + /* don't recycle if we already did IO to this buffer */ + if (req->flags & REQ_F_PARTIAL_IO) + return; + + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + + lockdep_assert_held(&ctx->uring_lock); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; + + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) + __must_hold(&req->ctx->timeout_lock) +{ + if (task && head->task != task) + return false; + return cancel_all; } /* @@ -1335,24 +1447,9 @@ static bool io_match_linked(struct io_kiocb *head) static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { - bool matched; - if (task && head->task != task) return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; + return cancel_all; } static inline bool req_has_async_data(struct io_kiocb *req) @@ -1409,7 +1506,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int hash_bits; + int i, hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -1436,6 +1533,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; + ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, + sizeof(struct list_head), GFP_KERNEL); + if (!ctx->io_buffers) + goto err; + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) + INIT_LIST_HEAD(&ctx->io_buffers[i]); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1444,14 +1548,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); + INIT_LIST_HEAD(&ctx->apoll_cache); init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->io_buffers_pages); + INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1468,6 +1575,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); + kfree(ctx->io_buffers); kfree(ctx); return NULL; } @@ -1500,14 +1608,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } -static inline void io_req_track_inflight(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_INFLIGHT)) { - req->flags |= REQ_F_INFLIGHT; - atomic_inc(¤t->io_uring->inflight_tracked); - } -} - static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -1551,14 +1651,6 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - switch (req->opcode) { - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } } static void io_prep_async_link(struct io_kiocb *req) @@ -1610,8 +1702,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); + trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, + &req->work, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1652,12 +1744,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->timeout_lock); - while (!list_empty(&ctx->timeout_list)) { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1674,29 +1765,33 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; spin_unlock_irq(&ctx->timeout_lock); } -static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); -} - static inline void io_commit_cqring(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) - __io_commit_cqring_flush(ctx); /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + } + if (ctx->has_evfd) + io_eventfd_signal(ctx); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -1726,23 +1821,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[tail & mask]; } -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +static void io_eventfd_signal(struct io_ring_ctx *ctx) { - if (likely(!ctx->cq_ev_fd)) - return false; + struct io_ev_fd *ev_fd; + + rcu_read_lock(); + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return false; - return !ctx->eventfd_async || io_wq_current_is_worker(); + goto out; + + if (!ev_fd->eventfd_async || io_wq_current_is_worker()) + eventfd_signal(ev_fd->cq_ev_fd, 1); +out: + rcu_read_unlock(); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * wake_up_all() may seem excessive, but io_wake_function() and @@ -1751,21 +1857,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); +} + +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + io_cqring_wake(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); - } - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1905,8 +2022,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, { struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, user_data, res, cflags); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -1922,16 +2037,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } +static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +{ + trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); + return __io_fill_cqe(req->ctx, req->user_data, res, cflags); +} + static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req->ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; + trace_io_uring_complete(ctx, NULL, user_data, res, cflags); return __io_fill_cqe(ctx, user_data, res, cflags); } @@ -1941,7 +2063,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -1956,6 +2078,12 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } io_req_put_rsrc(req, ctx); + /* + * Selected buffer deallocation in io_clean_op() assumes that + * we don't hold ->completion_lock. Clean them here to avoid + * deadlocks. + */ + io_put_kbuf_comp(req); io_dismantle_req(req); io_put_task(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); @@ -2000,7 +2128,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, 0); + io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } static void io_req_complete_fail_submit(struct io_kiocb *req) @@ -2183,7 +2311,9 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req, link); + trace_io_uring_fail_link(req->ctx, req, req->user_data, + req->opcode, link); + if (!ignore_cqes) { link->flags &= ~REQ_F_CQE_SKIP; io_fill_cqe_req(link, res, 0); @@ -2287,6 +2417,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { if (unlikely(!*uring_locked && *ctx)) ctx_commit_and_unlock(*ctx); @@ -2302,7 +2434,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, io_put_kbuf(req)); + __io_req_complete_post(req, req->result, + io_put_kbuf_comp(req)); node = next; } while (node); @@ -2318,6 +2451,8 @@ static void handle_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { ctx_flush_and_put(*ctx, locked); *ctx = req->ctx; @@ -2381,6 +2516,8 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) WARN_ON_ONCE(!tctx); + io_drop_inflight_file(req); + spin_lock_irqsave(&tctx->task_lock, flags); if (priority) wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); @@ -2530,8 +2667,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, req->result, - req->cflags); + __io_fill_cqe_req(req, req->result, req->cflags); + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } } io_commit_cqring(ctx); @@ -2590,9 +2735,11 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_run_task_work(void) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { __set_current_state(TASK_RUNNING); - tracehook_notify_signal(); + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); return true; } @@ -2653,7 +2800,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); + __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); nr_events++; } @@ -2813,8 +2960,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) + if (req->rw.kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { @@ -2829,14 +2980,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_kbuf(req); int res = req->result; if (*locked) { - io_req_complete_state(req, res, cflags); + io_req_complete_state(req, res, io_put_kbuf(req, 0)); io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, cflags); + io_req_complete_post(req, res, + io_put_kbuf(req, IO_URING_F_UNLOCKED)); } } @@ -2845,7 +2996,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, + io_put_kbuf(req, issue_flags)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2990,50 +3142,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; unsigned ioprio; int ret; - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1) { - if (!(file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = file->f_pos; - } else { - kiocb->ki_pos = 0; - } - } - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -3049,6 +3162,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); + req->rw.flags = READ_ONCE(sqe->rw_flags); req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3074,6 +3188,23 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + + if (kiocb->ki_pos != -1) + return &kiocb->ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + return &kiocb->ki_pos; + } + + kiocb->ki_pos = 0; + return NULL; +} + static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { @@ -3096,14 +3227,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) io_req_task_queue_reissue(req); - } else { - req_set_fail(req); - req->result = ret; - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); - } + else + io_req_task_queue_fail(req, ret); } } @@ -3165,7 +3292,8 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter return 0; } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + unsigned int issue_flags) { struct io_mapped_ubuf *imu = req->imu; u16 index, buf_index = req->buf_index; @@ -3175,7 +3303,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; - io_req_set_rsrc_node(req, ctx); + io_req_set_rsrc_node(req, ctx, issue_flags); index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = READ_ONCE(ctx->user_bufs[index]); req->imu = imu; @@ -3201,30 +3329,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) mutex_lock(&ctx->uring_lock); } +static void io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + struct list_head *list; + + list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + INIT_LIST_HEAD(&bl->buf_list); + bl->bgid = bgid; + list_add(&bl->list, list); +} + static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, int bgid, unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; - struct io_buffer *head; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; - io_ring_submit_lock(req->ctx, needs_lock); + io_ring_submit_lock(ctx, needs_lock); - lockdep_assert_held(&req->ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); - head = xa_load(&req->ctx->io_buffers, bgid); - if (head) { - if (!list_empty(&head->list)) { - kbuf = list_last_entry(&head->list, struct io_buffer, - list); - list_del(&kbuf->list); - } else { - kbuf = head; - xa_erase(&req->ctx->io_buffers, bgid); - } + bl = io_buffer_get_list(ctx, bgid); + if (bl && !list_empty(&bl->buf_list)) { + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; @@ -3331,7 +3465,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter); + ret = io_import_fixed(req, rw, iter, issue_flags); if (ret) return ERR_PTR(ret); return NULL; @@ -3400,6 +3534,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) struct kiocb *kiocb = &req->rw.kiocb; struct file *file = req->file; ssize_t ret = 0; + loff_t *ppos; /* * Don't support polled IO through this interface, and we can't @@ -3412,6 +3547,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + ppos = io_kiocb_ppos(kiocb); + while (iov_iter_count(iter)) { struct iovec iovec; ssize_t nr; @@ -3425,10 +3562,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } if (nr < 0) { @@ -3436,13 +3573,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.len -= nr; req->rw.addr += nr; + req->rw.len -= nr; + if (!req->rw.len) + break; } - ret += nr; if (nr != iovec.iov_len) break; } @@ -3527,13 +3666,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; - return io_prep_rw(req, sqe); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -3621,6 +3753,49 @@ static bool need_read_all(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct kiocb *kiocb = &req->rw.kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. + */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + static int io_read(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3629,12 +3804,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_async_rw *rw; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(READ, req, &iovec, s, issue_flags); if (unlikely(ret < 0)) return ret; } else { + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. + */ + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + rw = req->async_data; s = &rw->s; /* @@ -3645,6 +3831,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -3659,7 +3848,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(READ, req->file, ppos, req->result); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3669,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3743,14 +3937,6 @@ out_free: return 0; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; - req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); - return io_prep_rw(req, sqe); -} - static int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3758,6 +3944,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) struct kiocb *kiocb = &req->rw.kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); @@ -3770,6 +3957,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -3788,7 +3978,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(WRITE, req->file, ppos, req->result); if (unlikely(ret)) goto out_free; @@ -4138,18 +4330,11 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sp->file_in = NULL; sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - - sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if (!sp->file_in) - return -EBADF; - req->flags |= REQ_F_NEED_CLEANUP; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); return 0; } @@ -4164,20 +4349,29 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + if (sp->len) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -4196,15 +4390,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; @@ -4213,8 +4416,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -4235,13 +4437,53 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_msg_ring_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || + sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + return -EINVAL; + + req->msg.user_data = READ_ONCE(sqe->off); + req->msg.len = READ_ONCE(sqe->len); + return 0; +} + +static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx; + struct io_msg *msg = &req->msg; + bool filled; + int ret; + + ret = -EBADFD; + if (req->file->f_op != &io_uring_fops) + goto done; + + ret = -EOVERFLOW; + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + +done: + if (ret < 0) + req_set_fail(req); + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || @@ -4301,6 +4543,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) req->sync.len); if (ret < 0) req_set_fail(req); + else + fsnotify_modify(req->file); io_req_complete(req, ret); return 0; } @@ -4458,8 +4702,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, return 0; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, - int bgid, unsigned nbufs) +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) { unsigned i = 0; @@ -4468,19 +4712,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return 0; /* the head kbuf is the list itself */ - while (!list_empty(&buf->list)) { + while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; - nxt = list_first_entry(&buf->list, struct io_buffer, list); + nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); - kfree(nxt); if (++i == nbufs) return i; cond_resched(); } i++; - kfree(buf); - xa_erase(&ctx->io_buffers, bgid); return i; } @@ -4489,7 +4730,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4498,9 +4739,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = xa_load(&ctx->io_buffers, p->bgid); - if (head) - ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) + ret = __io_remove_buffers(ctx, bl, p->nbufs); if (ret < 0) req_set_fail(req); @@ -4545,38 +4786,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req, return 0; } -static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. + */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. + */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); - if (!buf) + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) break; - + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_move_tail(&buf->list, &bl->buf_list); buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; + buf->bgid = pbuf->bgid; addr += pbuf->len; bid++; - if (!*head) { - INIT_LIST_HEAD(&buf->list); - *head = buf; - } else { - list_add_tail(&buf->list, &(*head)->list); - } + cond_resched(); } - return i ? i : -ENOMEM; + return i ? 0 : -ENOMEM; } static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head, *list; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4584,14 +4867,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = xa_load(&ctx->io_buffers, p->bgid); - - ret = io_add_buffers(p, &head); - if (ret >= 0 && !list) { - ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); - if (ret < 0) - __io_remove_buffers(ctx, head, p->bgid, -1U); + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kmalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + io_buffer_add_list(ctx, bl, p->bgid); } + + ret = io_add_buffers(ctx, p, bl); +err: if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ @@ -4721,6 +5008,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + const char __user *path; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) @@ -4730,10 +5019,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->statx.dfd = READ_ONCE(sqe->fd); req->statx.mask = READ_ONCE(sqe->len); - req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); + path = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); req->statx.flags = READ_ONCE(sqe->statx_flags); + req->statx.filename = getname_flags(path, + getname_statx_lookup_flags(req->statx.flags), + NULL); + + if (IS_ERR(req->statx.filename)) { + int ret = PTR_ERR(req->statx.filename); + + req->statx.filename = NULL; + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -5126,12 +5427,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; @@ -5174,6 +5484,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { req_set_fail(req); @@ -5183,7 +5498,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5233,12 +5552,24 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5257,8 +5588,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->nofile = rlimit(RLIMIT_NOFILE); accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || - (accept->flags & SOCK_CLOEXEC))) + if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) return -EINVAL; if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; @@ -5276,9 +5606,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; - if (req->file->f_flags & O_NONBLOCK) - req->flags |= REQ_F_NOWAIT; - if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5406,7 +5733,7 @@ struct io_poll_table { }; #define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK ((1u << 20)-1) +#define IO_POLL_REF_MASK GENMASK(30, 0) /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can @@ -5473,8 +5800,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll) static void io_poll_remove_entries(struct io_kiocb *req) { - struct io_poll_iocb *poll = io_poll_get_single(req); - struct io_poll_iocb *poll_double = io_poll_get_double(req); + /* + * Nothing to do if neither of those flags are set. Avoid dipping + * into the poll/apoll/double cachelines if we can. + */ + if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) + return; /* * While we hold the waitqueue lock and the waitqueue is nonempty, @@ -5492,9 +5823,10 @@ static void io_poll_remove_entries(struct io_kiocb *req) * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); - io_poll_remove_entry(poll); - if (poll_double) - io_poll_remove_entry(poll_double); + if (req->flags & REQ_F_SINGLE_POLL) + io_poll_remove_entry(io_poll_get_single(req)); + if (req->flags & REQ_F_DOUBLE_POLL) + io_poll_remove_entry(io_poll_get_double(req)); rcu_read_unlock(); } @@ -5506,10 +5838,9 @@ static void io_poll_remove_entries(struct io_kiocb *req) * either spurious wakeup or multishot CQE is served. 0 when it's done with * the request, then the mask is stored in req->result. */ -static int io_poll_check_events(struct io_kiocb *req) +static int io_poll_check_events(struct io_kiocb *req, bool locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_poll_iocb *poll = io_poll_get_single(req); int v; /* req->task == current here, checking PF_EXITING is safe */ @@ -5526,14 +5857,17 @@ static int io_poll_check_events(struct io_kiocb *req) return -ECANCELED; if (!req->result) { - struct poll_table_struct pt = { ._key = poll->events }; + struct poll_table_struct pt = { ._key = req->apoll_events }; + unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; - req->result = vfs_poll(req->file, &pt) & poll->events; + if (unlikely(!io_assign_file(req, flags))) + return -EBADF; + req->result = vfs_poll(req->file, &pt) & req->apoll_events; } /* multishot, just fill an CQE and proceed */ - if (req->result && !(poll->events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & poll->events); + if (req->result && !(req->apoll_events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->result & req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); @@ -5562,7 +5896,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req); + ret = io_poll_check_events(req, *locked); if (ret > 0) return; @@ -5587,7 +5921,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req); + ret = io_poll_check_events(req, *locked); if (ret > 0) return; @@ -5602,35 +5936,45 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask) +static void __io_poll_execute(struct io_kiocb *req, int mask, int events) { req->result = mask; + /* + * This is useful for poll that is armed on behalf of another + * request, and where the wakeup path could be on a different + * CPU. We want to avoid pulling in req->apoll->events for that + * case. + */ + req->apoll_events = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); io_req_task_work_add(req, false); } -static inline void io_poll_execute(struct io_kiocb *req, int res) +static inline void io_poll_execute(struct io_kiocb *req, int res, int events) { if (io_poll_get_ownership(req)) - __io_poll_execute(req, res); + __io_poll_execute(req, res, events); } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ - io_poll_execute(req, 0); + io_poll_execute(req, 0, 0); } +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_kiocb *req = wait->private; + struct io_kiocb *req = wqe_to_req(wait); struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, wait); __poll_t mask = key_to_poll(key); @@ -5638,7 +5982,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (unlikely(mask & POLLFREE)) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0); + io_poll_execute(req, 0, poll->events); /* * If the waitqueue is being freed early but someone is already @@ -5668,8 +6012,12 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; } - __io_poll_execute(req, mask); + __io_poll_execute(req, mask, poll->events); } return 1; } @@ -5679,6 +6027,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct io_poll_iocb **poll_ptr) { struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling @@ -5704,15 +6053,19 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + /* mark as double wq entry */ + wqe_private |= 1; + req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; } + req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; - poll->wait.private = req; + poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -5739,7 +6092,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - poll->wait.private = req; ipt->pt._key = mask; ipt->req = req; @@ -5773,7 +6125,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) poll->events |= EPOLLONESHOT; - __io_poll_execute(req, mask); + __io_poll_execute(req, mask, poll->events); return 0; } @@ -5783,7 +6135,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, */ v = atomic_dec_return(&req->poll_refs); if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0); + __io_poll_execute(req, 0, poll->events); return 0; } @@ -5802,7 +6154,7 @@ enum { IO_APOLL_READY }; -static int io_arm_poll_handler(struct io_kiocb *req) +static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; @@ -5826,20 +6178,30 @@ static int io_arm_poll_handler(struct io_kiocb *req) } else { mask |= POLLOUT | POLLWRNORM; } - - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + } apoll->double_poll = NULL; req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_kbuf_recycle(req, issue_flags); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, + trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -5862,6 +6224,7 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } @@ -5974,7 +6337,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); + req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6091,10 +6454,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (IS_ERR(req)) return PTR_ERR(req); - - req_set_fail(req); - io_fill_cqe_req(req, -ECANCELED, 0); - io_put_req_deferred(req); + io_req_task_queue_fail(req, -ECANCELED); return 0; } @@ -6272,6 +6632,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -6478,6 +6839,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.nr = 0; up.tags = 0; up.resv = 0; + up.resv2 = 0; io_ring_submit_lock(ctx, needs_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, @@ -6498,11 +6860,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - return io_write_prep(req, sqe); + return io_prep_rw(req, sqe); case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: @@ -6567,6 +6928,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_symlinkat_prep(req, sqe); case IORING_OP_LINKAT: return io_linkat_prep(req, sqe); + case IORING_OP_MSG_RING: + return io_msg_ring_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6648,7 +7011,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data); + trace_io_uring_defer(ctx, req, req->user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -6657,8 +7020,11 @@ fail: static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) - io_put_kbuf(req); + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); + io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { @@ -6680,11 +7046,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(io->free_iov); break; } - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(req->splice.file_in); - break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: if (req->open.filename) @@ -6708,6 +7069,10 @@ static void io_clean_op(struct io_kiocb *req) putname(req->hardlink.oldpath); putname(req->hardlink.newpath); break; + case IORING_OP_STATX: + if (req->statx.filename) + putname(req->statx.filename); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6715,11 +7080,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -6729,11 +7089,31 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req->file || !io_op_defs[req->opcode].needs_file) + return true; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->fd, issue_flags); + else + req->file = io_file_get_normal(req, req->fd); + if (req->file) + return true; + + req_set_fail(req); + req->result = -EBADF; + return false; +} + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct cred *creds = NULL; int ret; + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); @@ -6850,6 +7230,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_LINKAT: ret = io_linkat(req, issue_flags); break; + case IORING_OP_MSG_RING: + ret = io_msg_ring(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -6880,10 +7263,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; struct io_kiocb *timeout; - int ret = 0; + int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) @@ -6895,14 +7279,20 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { - io_req_task_queue_fail(req, -ECANCELED); +fail: + io_req_task_queue_fail(req, err); return; } + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + goto fail; + } if (req->flags & REQ_F_FORCE_ASYNC) { - const struct io_op_def *def = &io_op_defs[req->opcode]; bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { @@ -6925,7 +7315,7 @@ static void io_wq_submit_work(struct io_wq_work *work) continue; } - if (io_arm_poll_handler(req) == IO_APOLL_OK) + if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) return; /* aborted or ready, in either case retry blocking */ needs_poll = false; @@ -6959,46 +7349,56 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd) +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned int issue_flags) { - struct file *file; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = NULL; unsigned long file_ptr; + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; + goto out; fd = array_index_nospec(fd, ctx->nr_user_files); file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); - io_req_set_rsrc_node(req, ctx); + io_req_set_rsrc_node(req, ctx, 0); +out: + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); return file; } -static struct file *io_file_get_normal(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd) +/* + * Drop the file for requeue operations. Only used of req->file is the + * io_uring descriptor itself. + */ +static void io_drop_inflight_file(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_INFLIGHT)) { + fput(req->file); + req->file = NULL; + req->flags &= ~REQ_F_INFLIGHT; + } +} + +static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(ctx, fd); + trace_io_uring_file_get(req->ctx, req, req->user_data, fd); /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); + if (file && file->f_op == &io_uring_fops) + req->flags |= REQ_F_INFLIGHT; return file; } -static inline struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed) -{ - if (fixed) - return io_file_get_fixed(ctx, req, fd); - else - return io_file_get_normal(ctx, req, fd); -} - static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; @@ -7071,7 +7471,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) { struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req)) { + switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_req_task_queue(req); break; @@ -7082,6 +7482,8 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) */ io_queue_async_work(req, NULL); break; + case IO_APOLL_OK: + break; } if (linked_timeout) @@ -7236,6 +7638,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; + req->fd = READ_ONCE(sqe->fd); + /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. @@ -7245,11 +7649,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } - - req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); - if (unlikely(!req->file)) - return -EBADF; } personality = READ_ONCE(sqe->personality); @@ -7280,7 +7679,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ret); + trace_io_uring_req_failed(sqe, ctx, req, ret); /* fail even hard links since we don't submit */ if (link->head) { @@ -7307,7 +7706,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, + trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); @@ -7450,8 +7849,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - if (io_submit_sqe(ctx, req, sqe)) - break; + if (io_submit_sqe(ctx, req, sqe)) { + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) + break; + } } while (submitted < nr); if (unlikely(submitted != nr)) { @@ -7601,7 +8006,7 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !current->task_works) { + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -7612,6 +8017,13 @@ static int io_sq_thread(void *data) needs_sched = false; break; } + + /* + * Ensure the store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb(); + if (io_sqring_entries(ctx)) { needs_sched = false; break; @@ -7683,17 +8095,17 @@ static int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; - if (!signal_pending(current)) - return 0; if (test_thread_flag(TIF_NOTIFY_SIGNAL)) return -ERESTARTSYS; - return -EINTR; + if (task_sigpending(current)) + return -EINTR; + return 0; } /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - signed long *timeout) + ktime_t timeout) { int ret; @@ -7705,8 +8117,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (test_bit(0, &ctx->check_cq_overflow)) return 1; - *timeout = schedule_timeout(*timeout); - return !*timeout ? -ETIME : 1; + if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) + return -ETIME; + return 1; } /* @@ -7719,7 +8132,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - signed long timeout = MAX_SCHEDULE_TIMEOUT; + ktime_t timeout = KTIME_MAX; int ret; do { @@ -7730,14 +8143,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; } while (1); - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = timespec64_to_jiffies(&ts); - } - if (sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) @@ -7751,6 +8156,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); @@ -7767,7 +8180,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + ret = io_cqring_wait_schedule(ctx, &iowq, timeout); finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); @@ -7924,7 +8337,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - break; + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } } atomic_inc(&data->refs); @@ -8219,10 +8640,15 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr_files; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr; i++) { + struct file *file = io_file_from_index(ctx, i + offset); + + if (file) + fput(file); + } } else { kfree_skb(skb); + free_uid(fpl->user); kfree(fpl); } @@ -8510,13 +8936,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { + u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; - prsrc->tag = *io_get_tag_slot(data, idx); + prsrc->tag = *tag_slot; + *tag_slot = 0; prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; @@ -8585,7 +9013,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; - int ret, i; + int ret; io_ring_submit_lock(ctx, needs_lock); ret = -ENXIO; @@ -8598,8 +9026,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) if (ret) goto out; - i = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); ret = -EBADF; if (!file_slot->file_ptr) goto out; @@ -8655,8 +9083,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, up->offset + done, - ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); if (err) break; file_slot->file_ptr = 0; @@ -8681,7 +9108,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, up->offset + done) = tag; + *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { @@ -8739,8 +9166,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (unlikely(!tctx)) return -ENOMEM; + tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, + sizeof(struct file *), GFP_KERNEL); + if (unlikely(!tctx->registered_rings)) { + kfree(tctx); + return -ENOMEM; + } + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8749,6 +9184,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8756,7 +9192,6 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8773,6 +9208,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); + kfree(tctx->registered_rings); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; @@ -9330,7 +9766,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(offset, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, offset, + err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->rsrc_node, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); @@ -9349,33 +9785,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, return done ? done : err; } -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) { + struct io_ev_fd *ev_fd; __s32 __user *fds = arg; int fd; - if (ctx->cq_ev_fd) + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) return -EBUSY; if (copy_from_user(&fd, fds, sizeof(*fds))) return -EFAULT; - ctx->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ctx->cq_ev_fd)) { - int ret = PTR_ERR(ctx->cq_ev_fd); + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; - ctx->cq_ev_fd = NULL; + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); return ret; } - + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; } +static void io_eventfd_put(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + static int io_eventfd_unregister(struct io_ring_ctx *ctx) { - if (ctx->cq_ev_fd) { - eventfd_ctx_put(ctx->cq_ev_fd); - ctx->cq_ev_fd = NULL; + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + call_rcu(&ev_fd->rcu, io_eventfd_put); return 0; } @@ -9384,11 +9842,28 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { - struct io_buffer *buf; - unsigned long index; + int i; + + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { + struct list_head *list = &ctx->io_buffers[i]; - xa_for_each(&ctx->io_buffers, index, buf) - __io_remove_buffers(ctx, buf, index, -1U); + while (!list_empty(list)) { + struct io_buffer_list *bl; + + bl = list_first_entry(list, struct io_buffer_list, list); + __io_remove_buffers(ctx, bl, -1U); + list_del(&bl->list); + kfree(bl); + } + } + + while (!list_empty(&ctx->io_buffers_pages)) { + struct page *page; + + page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); + list_del_init(&page->lru); + __free_page(page); + } } static void io_req_caches_free(struct io_ring_ctx *ctx) @@ -9419,6 +9894,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } +static void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ + struct async_poll *apoll; + + while (!list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del(&apoll->poll.wait.entry); + kfree(apoll); + } +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -9440,8 +9927,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); if (ctx->rings) __io_cqring_overflow_flush(ctx, true); - mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); + io_flush_apoll_cache(ctx); + mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -9475,6 +9963,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); + kfree(ctx->io_buffers); kfree(ctx); } @@ -9894,7 +10383,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) - return atomic_read(&tctx->inflight_tracked); + return 0; return percpu_counter_sum(&tctx->inflight); } @@ -9973,6 +10462,144 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (file->f_op != &io_uring_fops) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. + */ +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.resv) { + ret = -EINVAL; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? i : ret; +} + +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -10085,6 +10712,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; + if (arg.pad) + return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); @@ -10103,12 +10732,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | + IORING_ENTER_REGISTERED_RING))) return -EINVAL; - f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + if (flags & IORING_ENTER_REGISTERED_RING) { + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || fd >= IO_RINGFD_REG_MAX) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + } ret = -EOPNOTSUPP; if (unlikely(f.file->f_op != &io_uring_fops)) @@ -10182,7 +10827,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, out: percpu_ref_put(&ctx->refs); out_fput: - fdput(f); + if (!(flags & IORING_ENTER_REGISTERED_RING)) + fdput(f); return submitted ? submitted : ret; } @@ -10324,7 +10970,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, hlist_for_each_entry(req, list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - req->task->task_works != NULL); + task_work_pending(req->task)); } seq_puts(m, "CqOverflowList:\n"); @@ -10549,7 +11195,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | + IORING_FEAT_LINKED_FILE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -10600,7 +11247,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -10760,8 +11407,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; - if (up->resv) - return -EINVAL; if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); @@ -10787,6 +11432,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } @@ -10799,7 +11446,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr || up.resv) + if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } @@ -10847,7 +11494,15 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, if (len > cpumask_size()) len = cpumask_size(); - if (copy_from_user(new_mask, arg, len)) { + if (in_compat_syscall()) { + ret = compat_get_bitmap(cpumask_bits(new_mask), + (const compat_ulong_t __user *)arg, + len * 8 /* CHAR_BIT */); + } else { + ret = copy_from_user(new_mask, arg, len); + } + + if (ret) { free_cpumask_var(new_mask); return -EFAULT; } @@ -10950,61 +11605,6 @@ err: return ret; } -static bool io_register_op_must_quiesce(int op) -{ - switch (op) { - case IORING_REGISTER_BUFFERS: - case IORING_UNREGISTER_BUFFERS: - case IORING_REGISTER_FILES: - case IORING_UNREGISTER_FILES: - case IORING_REGISTER_FILES_UPDATE: - case IORING_REGISTER_PROBE: - case IORING_REGISTER_PERSONALITY: - case IORING_UNREGISTER_PERSONALITY: - case IORING_REGISTER_FILES2: - case IORING_REGISTER_FILES_UPDATE2: - case IORING_REGISTER_BUFFERS2: - case IORING_REGISTER_BUFFERS_UPDATE: - case IORING_REGISTER_IOWQ_AFF: - case IORING_UNREGISTER_IOWQ_AFF: - case IORING_REGISTER_IOWQ_MAX_WORKERS: - return false; - default: - return true; - } -} - -static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) -{ - long ret; - - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab the - * uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); - if (ret) { - ret = min(0L, ret); - break; - } - - ret = io_run_task_work_sig(); - io_req_caches_free(ctx); - } while (ret >= 0); - mutex_lock(&ctx->uring_lock); - - if (ret) - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); - return ret; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -11028,12 +11628,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -EACCES; } - if (io_register_op_must_quiesce(opcode)) { - ret = io_ctx_quiesce(ctx); - if (ret) - return ret; - } - switch (opcode) { case IORING_REGISTER_BUFFERS: ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); @@ -11057,17 +11651,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: - case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; - ret = io_eventfd_register(ctx, arg); - if (ret) + ret = io_eventfd_register(ctx, arg, 0); + break; + case IORING_REGISTER_EVENTFD_ASYNC: + ret = -EINVAL; + if (nr_args != 1) break; - if (opcode == IORING_REGISTER_EVENTFD_ASYNC) - ctx->eventfd_async = 1; - else - ctx->eventfd_async = 0; + ret = io_eventfd_register(ctx, arg, 1); break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; @@ -11134,16 +11727,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_iowq_max_workers(ctx, arg); break; + case IORING_REGISTER_RING_FDS: + ret = io_ringfd_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_RING_FDS: + ret = io_ringfd_unregister(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } - if (io_register_op_must_quiesce(opcode)) { - /* bring the ctx back to life */ - percpu_ref_reinit(&ctx->refs); - reinit_completion(&ctx->ref_comp); - } return ret; } @@ -11169,8 +11763,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, - ctx->cq_ev_fd != NULL, ret); + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); out_fput: fdput(f); return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 1ed097e94af2..80ac36aea913 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -173,7 +173,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, if (*len == 0) return -EINVAL; - if (start > maxbytes) + if (start >= maxbytes) return -EFBIG; /* @@ -236,9 +236,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, if (!src_file.file) return -EBADF; - ret = -EXDEV; - if (src_file.file->f_path.mnt != dst_file->f_path.mnt) - goto fdput; cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen, 0); if (cloned < 0) @@ -247,7 +244,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EINVAL; else ret = 0; -fdput: fdput(src_file); return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6c51a75d0be6..8ce8720093b9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -292,19 +292,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); + ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), + REQ_OP_READ, gfp); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates * what do_mpage_readpage does. */ - if (!ctx->bio) - ctx->bio = bio_alloc(orig_gfp, 1); - ctx->bio->bi_opf = REQ_OP_READ; + if (!ctx->bio) { + ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, + orig_gfp); + } if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; - bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; bio_add_folio(ctx->bio, folio, plen, poff); } @@ -424,37 +425,32 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_readahead); /* - * iomap_is_partially_uptodate checks whether blocks within a page are + * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. * - * Returns true if all blocks which correspond to a file portion - * we want to read within the page are uptodate. + * Returns true if all blocks which correspond to the specified part + * of the folio are uptodate. */ -int -iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct folio *folio = page_folio(page); struct iomap_page *iop = to_iomap_page(folio); - struct inode *inode = page->mapping->host; - unsigned len, first, last; - unsigned i; + struct inode *inode = folio->mapping->host; + unsigned first, last, i; - /* Limit range to one page */ - len = min_t(unsigned, PAGE_SIZE - from, count); + if (!iop) + return false; - /* First and last blocks in range within page */ - first = from >> inode->i_blkbits; - last = (from + len - 1) >> inode->i_blkbits; + /* Caller's range may extend past the end of this folio */ + count = min(folio_size(folio) - from, count); - if (iop) { - for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) - return 0; - return 1; - } + /* First and last blocks in range within folio */ + first = from >> inode->i_blkbits; + last = (from + count - 1) >> inode->i_blkbits; - return 0; + for (i = first; i <= last; i++) + if (!test_bit(i, iop->uptodate)) + return false; + return true; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); @@ -480,7 +476,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) { - trace_iomap_invalidatepage(folio->mapping->host, offset, len); + trace_iomap_invalidate_folio(folio->mapping->host, + folio_pos(folio) + offset, len); /* * If we're invalidating the entire folio, clear the dirty state @@ -499,13 +496,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); -void iomap_invalidatepage(struct page *page, unsigned int offset, - unsigned int len) -{ - iomap_invalidate_folio(page_folio(page), offset, len); -} -EXPORT_SYMBOL_GPL(iomap_invalidatepage); - #ifdef CONFIG_MIGRATION int iomap_migrate_page(struct address_space *mapping, struct page *newpage, @@ -550,10 +540,8 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, struct bio_vec bvec; struct bio bio; - bio_init(&bio, &bvec, 1); - bio.bi_opf = REQ_OP_READ; + bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_set_dev(&bio, iomap->bdev); bio_add_folio(&bio, folio, plen, poff); return submit_bio_wait(&bio); } @@ -776,7 +764,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. */ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } @@ -1229,11 +1217,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset); - bio_set_dev(bio, wpc->iomap.bdev); + bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, + REQ_OP_WRITE | wbc_to_write_flags(wbc), + GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - bio->bi_write_hint = inode->i_write_hint; wbc_init_bio(wbc, bio); ioend = container_of(bio, struct iomap_ioend, io_inline_bio); @@ -1261,11 +1248,9 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); - bio_copy_dev(new, prev);/* also copies over blkcg information */ + new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 03ea367df19a..b08f5dc31780 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> +#include <linux/fscrypt.h> #include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/backing-dev.h> @@ -179,19 +180,20 @@ static void iomap_dio_bio_end_io(struct bio *bio) static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { + struct inode *inode = file_inode(dio->iocb->ki_filp); struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iter->iomap.bdev); + bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, + GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(iter, dio, bio, pos); } @@ -309,14 +311,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, iomap->bdev); + bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, + GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); - bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 66cf267c68ae..610ca6f1ec9b 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/iomap.h> #include <linux/fiemap.h> +#include <linux/pagemap.h> static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 65e39785c284..a6689a563c6e 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -81,7 +81,7 @@ DEFINE_EVENT(iomap_range_class, name, \ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); -DEFINE_RANGE_EVENT(iomap_invalidatepage); +DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); #define IOMAP_TYPE_STRINGS \ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 0c6eacfcbeef..d7491692aea3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; - ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c2cf74b01ddb..fcacafa4510d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); EXPORT_SYMBOL(jbd2_journal_blocks_per_page); -EXPORT_SYMBOL(jbd2_journal_invalidatepage); +EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 73ed02f061e1..fcb9175016a5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2217,14 +2217,14 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } /* - * jbd2_journal_invalidatepage + * jbd2_journal_invalidate_folio * * This code is tricky. It has a number of cases to deal with. * * There are two invariants which this code relies on: * - * i_size must be updated on disk before we start calling invalidatepage on the - * data. + * i_size must be updated on disk before we start calling invalidate_folio + * on the data. * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this @@ -2426,9 +2426,9 @@ zap_buffer_unlocked: } /** - * jbd2_journal_invalidatepage() + * jbd2_journal_invalidate_folio() * @journal: journal to use for flush... - * @page: page to flush + * @folio: folio to flush * @offset: start of the range to invalidate * @length: length of the range to invalidate * @@ -2437,30 +2437,29 @@ zap_buffer_unlocked: * the page is straddling i_size. Caller then has to wait for current commit * and try again. */ -int jbd2_journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned int offset, - unsigned int length) +int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, + size_t offset, size_t length) { struct buffer_head *head, *bh, *next; unsigned int stop = offset + length; unsigned int curr_off = 0; - int partial_page = (offset || length < PAGE_SIZE); + int partial_page = (offset || length < folio_size(folio)); int may_free = 1; int ret = 0; - if (!PageLocked(page)) + if (!folio_test_locked(folio)) BUG(); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) return 0; - BUG_ON(stop > PAGE_SIZE || stop < length); + BUG_ON(stop > folio_size(folio) || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ - head = bh = page_buffers(page); + bh = head; do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; @@ -2483,8 +2482,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, } while (bh != head); if (!partial_page) { - if (may_free && try_to_free_buffers(page)) - J_ASSERT(!page_has_buffers(page)); + if (may_free && try_to_free_buffers(&folio->page)) + J_ASSERT(!folio_buffers(folio)); } return 0; } diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index b288c8ae1236..837cd55fd4c5 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); ret = -EIO; - goto out_free; + goto out_sum_exit; } jffs2_calc_trigger_levels(c); return 0; + out_sum_exit: + jffs2_sum_exit(c); out_free: kvfree(c->blocks); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2ac410477c4f..71f03a5d36ed 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -603,8 +603,8 @@ out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); kvfree(c->blocks); - out_inohash: jffs2_clear_xattr_subsystem(c); + out_inohash: kfree(c->inocache_list); out_wbuf: jffs2_flash_cleanup(c); diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 2e4a86763c07..93a2951538ce 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -18,11 +18,11 @@ #include <linux/mutex.h> struct jffs2_inode_info { - /* We need an internal mutex similar to inode->i_mutex. + /* We need an internal mutex similar to inode->i_rwsem. Unfortunately, we can't used the existing one, because either the GC would deadlock, or we'd have to release it before letting GC proceed. Or we'd have to put ugliness - into the GC code so it didn't attempt to obtain the i_mutex + into the GC code so it didn't attempt to obtain the i_rwsem for the inode(s) which are already locked */ struct mutex sem; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index b676056826be..29671e33a171 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } } @@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - kfree(s); return ret; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 81ca58c10b72..7ea37f49f1e1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) { struct jffs2_inode_info *f; - f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL); if (!f) return NULL; return &f->vfs_inode; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 57ab424c05ff..d1943a7b4b04 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - if (JFS_SBI(inode->i_sb)->ipimap) + if (ipimap && JFS_IP(ipimap)->i_imap) diFree(inode); /* @@ -357,7 +358,8 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations jfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = jfs_readpage, .readahead = jfs_readahead, .writepage = jfs_writepage, diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 91f4ec93dab1..d8502f4989d9 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -148,6 +148,7 @@ static const s8 budtab[256] = { * 0 - success * -ENOMEM - insufficient memory * -EIO - i/o error + * -EINVAL - wrong bmap data */ int dbMount(struct inode *ipbmap) { @@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap) bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + if (!bmp->db_numag) { + release_metapage(mp); + kfree(bmp); + return -EINVAL; + } + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 78fd136ac13b..997c81fcea34 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(GFP_NOFS, 1); - + bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 104ae698443e..c4220ccdedef 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) } len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page) if (bio) submit_bio(bio); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ, + GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, 0); len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) @@ -555,22 +552,22 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) return ret; } -static void metapage_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void metapage_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - BUG_ON(offset || length < PAGE_SIZE); + BUG_ON(offset || length < folio_size(folio)); - BUG_ON(PageWriteback(page)); + BUG_ON(folio_test_writeback(folio)); - metapage_releasepage(page, 0); + metapage_releasepage(&folio->page, 0); } const struct address_space_operations jfs_metapage_aops = { .readpage = metapage_readpage, .writepage = metapage_writepage, .releasepage = metapage_releasepage, - .invalidatepage = metapage_invalidatepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .invalidate_folio = metapage_invalidate_folio, + .dirty_folio = filemap_dirty_folio, }; struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 24cbc9946e01..f1a13a74cddf 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; - jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; #ifdef CONFIG_QUOTA diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e6d9772ddb4c..61a8edc4ba8b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -971,6 +971,15 @@ void kernfs_destroy_root(struct kernfs_root *root) } /** + * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root + * @root: root to use to lookup + */ +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) +{ + return root->kn; +} + +/** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 9414a7a60a9f..88423069407c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -120,13 +120,8 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; } + return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) @@ -1002,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, #endif /* - * kn->attr.ops is accesible only while holding active ref. We + * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index f9cc912c31e1..eeaa779b929c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -31,6 +31,24 @@ struct kernfs_iattrs { atomic_t user_xattr_size; }; +struct kernfs_root { + /* published fields */ + struct kernfs_node *kn; + unsigned int flags; /* KERNFS_ROOT_* flags */ + + /* private fields, do not use outside kernfs proper */ + struct idr ino_idr; + u32 last_id_lowbits; + u32 id_highbits; + struct kernfs_syscall_ops *syscall_ops; + + /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ + struct list_head supers; + + wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; +}; + /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) @@ -122,7 +140,6 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ -extern struct rw_semaphore kernfs_rwsem; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 71bfb7de4472..ebe6ca08467a 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -241,7 +241,7 @@ struct ksmbd_rpc_command { struct ksmbd_spnego_authen_request { __u32 handle; __u16 spnego_blob_len; /* the length of spnego_blob */ - __u8 spnego_blob[0]; /* + __u8 spnego_blob[]; /* * the GSS token from SecurityBuffer of * SMB2 SESSION SETUP request */ diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h index adaf4c0cbe8f..f13153c18b4e 100644 --- a/fs/ksmbd/ntlmssp.h +++ b/fs/ksmbd/ntlmssp.h @@ -95,7 +95,7 @@ struct security_buffer { struct target_info { __le16 Type; __le16 Length; - __u8 Content[0]; + __u8 Content[]; } __packed; struct negotiate_message { @@ -108,7 +108,7 @@ struct negotiate_message { * struct security_buffer for version info not present since we * do not set the version is present flag */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __packed; @@ -140,7 +140,7 @@ struct authenticate_message { * struct security_buffer for version info not present since we * do not set the version is present flag */ - char UserString[0]; + char UserString[]; } __packed; struct ntlmv2_resp { diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 077b8761d099..23871b18a429 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -656,8 +656,8 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE; rsp->Reserved = 0; rsp->Reserved2 = 0; - rsp->PersistentFid = cpu_to_le64(fp->persistent_id); - rsp->VolatileFid = cpu_to_le64(fp->volatile_id); + rsp->PersistentFid = fp->persistent_id; + rsp->VolatileFid = fp->volatile_id; inc_rfc1001_len(work->response_buf, 24); diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 2e12f6d8483b..4cd03d661df0 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -585,7 +585,7 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; - pr_warn_once("The ksmbd server is experimental, use at your own risk.\n"); + pr_warn_once("The ksmbd server is experimental\n"); return 0; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 67e8e28e3fc3..3bf6c56c654c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -377,12 +377,8 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) * command in the compound request */ if (req->Command == SMB2_CREATE && rsp->Status == STATUS_SUCCESS) { - work->compound_fid = - le64_to_cpu(((struct smb2_create_rsp *)rsp)-> - VolatileFileId); - work->compound_pfid = - le64_to_cpu(((struct smb2_create_rsp *)rsp)-> - PersistentFileId); + work->compound_fid = ((struct smb2_create_rsp *)rsp)->VolatileFileId; + work->compound_pfid = ((struct smb2_create_rsp *)rsp)->PersistentFileId; work->compound_sid = le64_to_cpu(rsp->SessionId); } @@ -2129,7 +2125,7 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) rsp->EndofFile = cpu_to_le64(0); rsp->FileAttributes = FILE_ATTRIBUTE_NORMAL_LE; rsp->Reserved2 = 0; - rsp->VolatileFileId = cpu_to_le64(id); + rsp->VolatileFileId = id; rsp->PersistentFileId = 0; rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; @@ -3157,8 +3153,8 @@ int smb2_open(struct ksmbd_work *work) rsp->Reserved2 = 0; - rsp->PersistentFileId = cpu_to_le64(fp->persistent_id); - rsp->VolatileFileId = cpu_to_le64(fp->volatile_id); + rsp->PersistentFileId = fp->persistent_id; + rsp->VolatileFileId = fp->volatile_id; rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; @@ -3865,9 +3861,7 @@ int smb2_query_dir(struct ksmbd_work *work) goto err_out2; } - dir_fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + dir_fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!dir_fp) { rc = -EBADF; goto err_out2; @@ -4088,12 +4082,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, * Windows can sometime send query file info request on * pipe without opening it, checking error condition here */ - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (!ksmbd_session_rpc_method(sess, id)) return -ENOENT; ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n", - req->FileInfoClass, le64_to_cpu(req->VolatileFileId)); + req->FileInfoClass, req->VolatileFileId); switch (req->FileInfoClass) { case FILE_STANDARD_INFORMATION: @@ -4738,7 +4732,7 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (work->next_smb2_rcv_hdr_off) { - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -4747,8 +4741,8 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -5113,7 +5107,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, } if (work->next_smb2_rcv_hdr_off) { - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -5122,8 +5116,8 @@ static int smb2_get_info_sec(struct ksmbd_work *work, } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -5221,7 +5215,7 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work) struct smb2_close_req *req = smb2_get_msg(work->request_buf); struct smb2_close_rsp *rsp = smb2_get_msg(work->response_buf); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; ksmbd_session_rpc_close(work->sess, id); rsp->StructureSize = cpu_to_le16(60); @@ -5280,7 +5274,7 @@ int smb2_close(struct ksmbd_work *work) } if (work->next_smb2_rcv_hdr_off && - !has_file_id(le64_to_cpu(req->VolatileFileId))) { + !has_file_id(req->VolatileFileId)) { if (!has_file_id(work->compound_fid)) { /* file already closed, return FILE_CLOSED */ ksmbd_debug(SMB, "file already closed\n"); @@ -5299,7 +5293,7 @@ int smb2_close(struct ksmbd_work *work) work->compound_pfid = KSMBD_NO_FID; } } else { - volatile_id = le64_to_cpu(req->VolatileFileId); + volatile_id = req->VolatileFileId; } ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id); @@ -5988,7 +5982,7 @@ int smb2_set_info(struct ksmbd_work *work) if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); rsp = ksmbd_resp_buf_next(work); - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -6000,8 +5994,8 @@ int smb2_set_info(struct ksmbd_work *work) } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -6079,7 +6073,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) struct smb2_read_req *req = smb2_get_msg(work->request_buf); struct smb2_read_rsp *rsp = smb2_get_msg(work->response_buf); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; inc_rfc1001_len(work->response_buf, 16); rpc_resp = ksmbd_rpc_read(work->sess, id); @@ -6215,8 +6209,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { err = -ENOENT; goto out; @@ -6335,7 +6328,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) size_t length; length = le32_to_cpu(req->Length); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (le16_to_cpu(req->DataOffset) == offsetof(struct smb2_write_req, Buffer)) { @@ -6471,8 +6464,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { err = -ENOENT; goto out; @@ -6584,12 +6576,9 @@ int smb2_flush(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", - le64_to_cpu(req->VolatileFileId)); + ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", req->VolatileFileId); - err = ksmbd_vfs_fsync(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + err = ksmbd_vfs_fsync(work, req->VolatileFileId, req->PersistentFileId); if (err) goto out; @@ -6618,8 +6607,7 @@ int smb2_cancel(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_hdr *hdr = smb2_get_msg(work->request_buf); struct smb2_hdr *chdr; - struct ksmbd_work *cancel_work = NULL; - int canceled = 0; + struct ksmbd_work *cancel_work = NULL, *iter; struct list_head *command_list; ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", @@ -6629,11 +6617,11 @@ int smb2_cancel(struct ksmbd_work *work) command_list = &conn->async_requests; spin_lock(&conn->request_lock); - list_for_each_entry(cancel_work, command_list, + list_for_each_entry(iter, command_list, async_request_entry) { - chdr = smb2_get_msg(cancel_work->request_buf); + chdr = smb2_get_msg(iter->request_buf); - if (cancel_work->async_id != + if (iter->async_id != le64_to_cpu(hdr->Id.AsyncId)) continue; @@ -6641,7 +6629,7 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with AsyncId %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->Id.AsyncId), le16_to_cpu(chdr->Command)); - canceled = 1; + cancel_work = iter; break; } spin_unlock(&conn->request_lock); @@ -6649,24 +6637,24 @@ int smb2_cancel(struct ksmbd_work *work) command_list = &conn->requests; spin_lock(&conn->request_lock); - list_for_each_entry(cancel_work, command_list, request_entry) { - chdr = smb2_get_msg(cancel_work->request_buf); + list_for_each_entry(iter, command_list, request_entry) { + chdr = smb2_get_msg(iter->request_buf); if (chdr->MessageId != hdr->MessageId || - cancel_work == work) + iter == work) continue; ksmbd_debug(SMB, "smb2 with mid %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->MessageId), le16_to_cpu(chdr->Command)); - canceled = 1; + cancel_work = iter; break; } spin_unlock(&conn->request_lock); } - if (canceled) { + if (cancel_work) { cancel_work->state = KSMBD_WORK_CANCELLED; if (cancel_work->cancel_fn) cancel_work->cancel_fn(cancel_work->cancel_argv); @@ -6804,12 +6792,9 @@ int smb2_lock(struct ksmbd_work *work) int prior_lock = 0; ksmbd_debug(SMB, "Received lock request\n"); - fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { - ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", - le64_to_cpu(req->VolatileFileId)); + ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", req->VolatileFileId); err = -ENOENT; goto out2; } @@ -7164,8 +7149,8 @@ static int fsctl_copychunk(struct ksmbd_work *work, ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; - rsp->VolatileFileId = cpu_to_le64(volatile_id); - rsp->PersistentFileId = cpu_to_le64(persistent_id); + rsp->VolatileFileId = volatile_id; + rsp->PersistentFileId = persistent_id; ci_rsp->ChunksWritten = cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); ci_rsp->ChunkBytesWritten = @@ -7379,8 +7364,8 @@ ipv6_retry: if (nii_rsp) nii_rsp->Next = 0; - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); - rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + rsp->PersistentFileId = SMB2_NO_FID; + rsp->VolatileFileId = SMB2_NO_FID; return nbytes; } @@ -7547,9 +7532,7 @@ static int fsctl_request_resume_key(struct ksmbd_work *work, { struct ksmbd_file *fp; - fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) return -ENOENT; @@ -7579,7 +7562,7 @@ int smb2_ioctl(struct ksmbd_work *work) if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); rsp = ksmbd_resp_buf_next(work); - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -7590,14 +7573,14 @@ int smb2_ioctl(struct ksmbd_work *work) } if (!has_file_id(id)) - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) { rsp->hdr.Status = STATUS_NOT_SUPPORTED; goto out; } - cnt_code = le32_to_cpu(req->CntCode); + cnt_code = le32_to_cpu(req->CtlCode); ret = smb2_calc_max_out_buf_len(work, 48, le32_to_cpu(req->MaxOutputResponse)); if (ret < 0) { @@ -7656,8 +7639,8 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; nbytes = sizeof(struct validate_negotiate_info_rsp); - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); - rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + rsp->PersistentFileId = SMB2_NO_FID; + rsp->VolatileFileId = SMB2_NO_FID; break; case FSCTL_QUERY_NETWORK_INTERFACE_INFO: ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len); @@ -7703,10 +7686,10 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->PersistentFileId = req->PersistentFileId; fsctl_copychunk(work, (struct copychunk_ioctl_req *)&req->Buffer[0], - le32_to_cpu(req->CntCode), + le32_to_cpu(req->CtlCode), le32_to_cpu(req->InputCount), - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId), + req->VolatileFileId, + req->PersistentFileId, rsp); break; case FSCTL_SET_SPARSE: @@ -7857,7 +7840,7 @@ dup_ext_out: goto out; } - rsp->CntCode = cpu_to_le32(cnt_code); + rsp->CtlCode = cpu_to_le32(cnt_code); rsp->InputCount = cpu_to_le32(0); rsp->InputOffset = cpu_to_le32(112); rsp->OutputOffset = cpu_to_le32(112); @@ -7903,8 +7886,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) char req_oplevel = 0, rsp_oplevel = 0; unsigned int oplock_change_type; - volatile_id = le64_to_cpu(req->VolatileFid); - persistent_id = le64_to_cpu(req->PersistentFid); + volatile_id = req->VolatileFid; + persistent_id = req->PersistentFid; req_oplevel = req->OplockLevel; ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n", volatile_id, persistent_id, req_oplevel); @@ -7999,8 +7982,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->OplockLevel = rsp_oplevel; rsp->Reserved = 0; rsp->Reserved2 = 0; - rsp->VolatileFid = cpu_to_le64(volatile_id); - rsp->PersistentFid = cpu_to_le64(persistent_id); + rsp->VolatileFid = volatile_id; + rsp->PersistentFid = persistent_id; inc_rfc1001_len(work->response_buf, 24); return; @@ -8500,7 +8483,7 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type) struct smb2_hdr *hdr = smb2_get_msg(old_buf); unsigned int orig_len = get_rfc1002_len(old_buf); - memset(tr_buf, 0, sizeof(struct smb2_transform_hdr) + 4); + /* tr_buf must be cleared by the caller */ tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(TRANSFORM_FLAG_ENCRYPTED); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 725b800c29c8..af455278d005 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -16,42 +16,13 @@ #define FILE_CREATED 0x00000002 #define FILE_OVERWRITTEN 0x00000003 -/* - * Size of the session key (crypto key encrypted with the password - */ -#define SMB2_NTLMV2_SESSKEY_SIZE 16 -#define SMB2_SIGNATURE_SIZE 16 -#define SMB2_HMACSHA256_SIZE 32 -#define SMB2_CMACAES_SIZE 16 -#define SMB3_GCM128_CRYPTKEY_SIZE 16 -#define SMB3_GCM256_CRYPTKEY_SIZE 32 - -/* - * Size of the smb3 encryption/decryption keys - */ -#define SMB3_ENC_DEC_KEY_SIZE 32 - -/* - * Size of the smb3 signing key - */ -#define SMB3_SIGN_KEY_SIZE 16 - -#define CIFS_CLIENT_CHALLENGE_SIZE 8 -#define SMB_SERVER_CHALLENGE_SIZE 8 - /* SMB2 Max Credits */ #define SMB2_MAX_CREDITS 8192 -/* Maximum buffer size value we can send with 1 credit */ -#define SMB2_MAX_BUFFER_SIZE 65536 - -#define NUMBER_OF_SMB2_COMMANDS 0x0013 - /* BB FIXME - analyze following length BB */ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) -#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) #define SMB3_MIN_IOSIZE (64 * 1024) #define SMB3_MAX_IOSIZE (8 * 1024 * 1024) @@ -65,18 +36,6 @@ * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 9 -#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2) - -struct smb2_err_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; - __u8 ErrorContextCount; - __u8 Reserved; - __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ -} __packed; - struct preauth_integrity_info { /* PreAuth integrity Hash ID */ __le16 Preauth_HashId; @@ -116,8 +75,8 @@ struct create_durable_reconn_req { union { __u8 Reserved[16]; struct { - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } Fid; } Data; } __packed; @@ -126,8 +85,8 @@ struct create_durable_reconn_v2_req { struct create_context ccontext; __u8 Name[8]; struct { - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } Fid; __u8 CreateGuid[16]; __le32 Flags; @@ -161,13 +120,6 @@ struct create_alloc_size_req { __le64 AllocationSize; } __packed; -struct create_posix { - struct create_context ccontext; - __u8 Name[16]; - __le32 Mode; - __u32 Reserved; -} __packed; - struct create_durable_rsp { struct create_context ccontext; __u8 Name[8]; @@ -209,45 +161,6 @@ struct create_posix_rsp { u8 SidBuffer[40]; } __packed; -#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) - -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) - -#define SMB2_LEASE_KEY_SIZE 16 - -struct lease_context { - __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; -} __packed; - -struct lease_context_v2 { - __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; - __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE]; - __le16 Epoch; - __le16 Reserved; -} __packed; - -struct create_lease { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context lcontext; -} __packed; - -struct create_lease_v2 { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context_v2 lcontext; - __u8 Pad[4]; -} __packed; - struct smb2_buffer_desc_v1 { __le64 offset; __le32 token; @@ -256,63 +169,6 @@ struct smb2_buffer_desc_v1 { #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 ByteCount; /* Bytes to be copied */ -} __packed; - -struct smb2_ioctl_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __le16 Reserved; /* offset from start of SMB2 header to write data */ - __le32 CntCode; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le32 InputOffset; /* Reserved MBZ */ - __le32 InputCount; - __le32 MaxInputResponse; - __le32 OutputOffset; - __le32 OutputCount; - __le32 MaxOutputResponse; - __le32 Flags; - __le32 Reserved2; - __u8 Buffer[1]; -} __packed; - -struct smb2_ioctl_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 49 */ - __le16 Reserved; /* offset from start of SMB2 header to write data */ - __le32 CntCode; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le32 InputOffset; /* Reserved MBZ */ - __le32 InputCount; - __le32 OutputOffset; - __le32 OutputCount; - __le32 Flags; - __le32 Reserved2; - __u8 Buffer[1]; -} __packed; - -struct validate_negotiate_info_req { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 DialectCount; - __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ -} __packed; - -struct validate_negotiate_info_rsp { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 Dialect; /* Dialect in use for the connection */ -} __packed; - struct smb_sockaddr_in { __be16 Port; __be32 IPv4address; @@ -357,7 +213,7 @@ struct file_object_buf_type1_ioctl_rsp { } __packed; struct resume_key_ioctl_rsp { - __le64 ResumeKey[3]; + __u64 ResumeKey[3]; __le32 ContextLength; __u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */ } __packed; @@ -386,167 +242,6 @@ struct file_sparse { __u8 SetSparse; } __packed; -struct file_zero_data_information { - __le64 FileOffset; - __le64 BeyondFinalZero; -} __packed; - -struct file_allocated_range_buffer { - __le64 file_offset; - __le64 length; -} __packed; - -struct reparse_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -/* SMB2 Notify Action Flags */ -#define FILE_ACTION_ADDED 0x00000001 -#define FILE_ACTION_REMOVED 0x00000002 -#define FILE_ACTION_MODIFIED 0x00000003 -#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 -#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 -#define FILE_ACTION_ADDED_STREAM 0x00000006 -#define FILE_ACTION_REMOVED_STREAM 0x00000007 -#define FILE_ACTION_MODIFIED_STREAM 0x00000008 -#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 - -#define SMB2_LOCKFLAG_SHARED 0x0001 -#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002 -#define SMB2_LOCKFLAG_UNLOCK 0x0004 -#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 -#define SMB2_LOCKFLAG_MASK 0x0007 - -struct smb2_lock_element { - __le64 Offset; - __le64 Length; - __le32 Flags; - __le32 Reserved; -} __packed; - -struct smb2_lock_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 48 */ - __le16 LockCount; - __le32 Reserved; - __le64 PersistentFileId; - __le64 VolatileFileId; - /* Followed by at least one */ - struct smb2_lock_element locks[1]; -} __packed; - -struct smb2_lock_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_echo_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -struct smb2_echo_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -/* search (query_directory) Flags field */ -#define SMB2_RESTART_SCANS 0x01 -#define SMB2_RETURN_SINGLE_ENTRY 0x02 -#define SMB2_INDEX_SPECIFIED 0x04 -#define SMB2_REOPEN 0x10 - -struct smb2_query_directory_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 FileInformationClass; - __u8 Flags; - __le32 FileIndex; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le16 FileNameOffset; - __le16 FileNameLength; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_directory_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* Possible InfoType values */ -#define SMB2_O_INFO_FILE 0x01 -#define SMB2_O_INFO_FILESYSTEM 0x02 -#define SMB2_O_INFO_SECURITY 0x03 -#define SMB2_O_INFO_QUOTA 0x04 - -/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ -#define OWNER_SECINFO 0x00000001 -#define GROUP_SECINFO 0x00000002 -#define DACL_SECINFO 0x00000004 -#define SACL_SECINFO 0x00000008 -#define LABEL_SECINFO 0x00000010 -#define ATTRIBUTE_SECINFO 0x00000020 -#define SCOPE_SECINFO 0x00000040 -#define BACKUP_SECINFO 0x00010000 -#define UNPROTECTED_SACL_SECINFO 0x10000000 -#define UNPROTECTED_DACL_SECINFO 0x20000000 -#define PROTECTED_SACL_SECINFO 0x40000000 -#define PROTECTED_DACL_SECINFO 0x80000000 - -struct smb2_query_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 41 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 OutputBufferLength; - __le16 InputBufferOffset; - __u16 Reserved; - __le32 InputBufferLength; - __le32 AdditionalInformation; - __le32 Flags; - __le64 PersistentFileId; - __le64 VolatileFileId; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 BufferLength; - __le16 BufferOffset; - __u16 Reserved; - __le32 AdditionalInformation; - __le64 PersistentFileId; - __le64 VolatileFileId; - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 2 */ -} __packed; - /* FILE Info response size */ #define FILE_DIRECTORY_INFORMATION_SIZE 1 #define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2 @@ -602,145 +297,11 @@ struct fs_type_info { long magic_number; } __packed; -struct smb2_oplock_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __le64 PersistentFid; - __le64 VolatileFid; -} __packed; - -#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) - -struct smb2_lease_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 44 */ - __le16 Epoch; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 CurrentLeaseState; - __le32 NewLeaseState; - __le32 BreakReason; - __le32 AccessMaskHint; - __le32 ShareMaskHint; -} __packed; - -struct smb2_lease_ack { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 Reserved; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 LeaseState; - __le64 LeaseDuration; -} __packed; - /* - * PDU infolevel structure definitions + * PDU query infolevel structure definitions * BB consider moving to a different header */ -/* File System Information Classes */ -#define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Set */ -#define FS_SIZE_INFORMATION 3 /* Query */ -#define FS_DEVICE_INFORMATION 4 /* Query */ -#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ -#define FS_CONTROL_INFORMATION 6 /* Query, Set */ -#define FS_FULL_SIZE_INFORMATION 7 /* Query */ -#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ -#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ -#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */ - -struct smb2_fs_full_size_info { - __le64 TotalAllocationUnits; - __le64 CallerAvailableAllocationUnits; - __le64 ActualAvailableAllocationUnits; - __le32 SectorsPerAllocationUnit; - __le32 BytesPerSector; -} __packed; - -#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 -#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 -#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 -#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 - -/* sector size info struct */ -struct smb3_fs_ss_info { - __le32 LogicalBytesPerSector; - __le32 PhysicalBytesPerSectorForAtomicity; - __le32 PhysicalBytesPerSectorForPerf; - __le32 FSEffPhysicalBytesPerSectorForAtomicity; - __le32 Flags; - __le32 ByteOffsetForSectorAlignment; - __le32 ByteOffsetForPartitionAlignment; -} __packed; - -/* File System Control Information */ -struct smb2_fs_control_info { - __le64 FreeSpaceStartFiltering; - __le64 FreeSpaceThreshold; - __le64 FreeSpaceStopFiltering; - __le64 DefaultQuotaThreshold; - __le64 DefaultQuotaLimit; - __le32 FileSystemControlFlags; - __le32 Padding; -} __packed; - -/* partial list of QUERY INFO levels */ -#define FILE_DIRECTORY_INFORMATION 1 -#define FILE_FULL_DIRECTORY_INFORMATION 2 -#define FILE_BOTH_DIRECTORY_INFORMATION 3 -#define FILE_BASIC_INFORMATION 4 -#define FILE_STANDARD_INFORMATION 5 -#define FILE_INTERNAL_INFORMATION 6 -#define FILE_EA_INFORMATION 7 -#define FILE_ACCESS_INFORMATION 8 -#define FILE_NAME_INFORMATION 9 -#define FILE_RENAME_INFORMATION 10 -#define FILE_LINK_INFORMATION 11 -#define FILE_NAMES_INFORMATION 12 -#define FILE_DISPOSITION_INFORMATION 13 -#define FILE_POSITION_INFORMATION 14 -#define FILE_FULL_EA_INFORMATION 15 -#define FILE_MODE_INFORMATION 16 -#define FILE_ALIGNMENT_INFORMATION 17 -#define FILE_ALL_INFORMATION 18 -#define FILE_ALLOCATION_INFORMATION 19 -#define FILE_END_OF_FILE_INFORMATION 20 -#define FILE_ALTERNATE_NAME_INFORMATION 21 -#define FILE_STREAM_INFORMATION 22 -#define FILE_PIPE_INFORMATION 23 -#define FILE_PIPE_LOCAL_INFORMATION 24 -#define FILE_PIPE_REMOTE_INFORMATION 25 -#define FILE_MAILSLOT_QUERY_INFORMATION 26 -#define FILE_MAILSLOT_SET_INFORMATION 27 -#define FILE_COMPRESSION_INFORMATION 28 -#define FILE_OBJECT_ID_INFORMATION 29 -/* Number 30 not defined in documents */ -#define FILE_MOVE_CLUSTER_INFORMATION 31 -#define FILE_QUOTA_INFORMATION 32 -#define FILE_REPARSE_POINT_INFORMATION 33 -#define FILE_NETWORK_OPEN_INFORMATION 34 -#define FILE_ATTRIBUTE_TAG_INFORMATION 35 -#define FILE_TRACKING_INFORMATION 36 -#define FILEID_BOTH_DIRECTORY_INFORMATION 37 -#define FILEID_FULL_DIRECTORY_INFORMATION 38 -#define FILE_VALID_DATA_LENGTH_INFORMATION 39 -#define FILE_SHORT_NAME_INFORMATION 40 -#define FILE_SFIO_RESERVE_INFORMATION 44 -#define FILE_SFIO_VOLUME_INFORMATION 45 -#define FILE_HARD_LINK_INFORMATION 46 -#define FILE_NORMALIZED_NAME_INFORMATION 48 -#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 -#define FILE_STANDARD_LINK_INFORMATION 54 - -#define OP_BREAK_STRUCT_SIZE_20 24 -#define OP_BREAK_STRUCT_SIZE_21 36 - struct smb2_file_access_info { __le32 AccessFlags; } __packed; @@ -749,56 +310,6 @@ struct smb2_file_alignment_info { __le32 AlignmentRequirement; } __packed; -struct smb2_file_internal_info { - __le64 IndexNumber; -} __packed; /* level 6 Query */ - -struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[0]; /* New name to be assigned */ -} __packed; /* level 10 Set */ - -struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[0]; /* Name to be assigned to new link */ -} __packed; /* level 11 Set */ - -/* - * This level 18, although with struct with same name is different from cifs - * level 0x107. Level 0x107 has an extra u64 between AccessFlags and - * CurrentByteOffset. - */ -struct smb2_file_all_info { /* data block encoding of response to level 18 */ - __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 Attributes; - __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; /* size ie offset to first free byte in file */ - __le32 NumberOfLinks; /* hard links */ - __u8 DeletePending; - __u8 Directory; - __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ - __le64 IndexNumber; - __le32 EASize; - __le32 AccessFlags; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; - __le32 FileNameLength; - char FileName[1]; -} __packed; /* level 18 Query */ - struct smb2_file_basic_info { /* data block encoding of response to level 18 */ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ __le64 LastAccessTime; @@ -810,7 +321,7 @@ struct smb2_file_basic_info { /* data block encoding of response to level 18 */ struct smb2_file_alt_name_info { __le32 FileNameLength; - char FileName[0]; + char FileName[]; } __packed; struct smb2_file_stream_info { @@ -818,13 +329,9 @@ struct smb2_file_stream_info { __le32 StreamNameLength; __le64 StreamSize; __le64 StreamAllocationSize; - char StreamName[0]; + char StreamName[]; } __packed; -struct smb2_file_eof_info { /* encoding of request for level 10 */ - __le64 EndOfFile; /* new end of file value */ -} __packed; /* level 20 Set */ - struct smb2_file_ntwrk_info { __le64 CreationTime; __le64 LastAccessTime; @@ -915,34 +422,6 @@ struct create_sd_buf_req { struct smb_ntsd ntsd; } __packed; -/* Find File infolevels */ -#define SMB_FIND_FILE_POSIX_INFO 0x064 - -/* Level 100 query info */ -struct smb311_posix_qinfo { - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 EndOfFile; - __le64 AllocationSize; - __le32 DosAttributes; - __le64 Inode; - __le32 DeviceId; - __le32 Zero; - /* beginning of POSIX Create Context Response */ - __le32 HardLinks; - __le32 ReparseTag; - __le32 Mode; - u8 Sids[]; - /* - * var sized owner SID - * var sized group SID - * le32 filenamelength - * u8 filename[] - */ -} __packed; - struct smb2_posix_info { __le32 NextEntryOffset; __u32 Ignored; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index ba5a22bc2e6d..e646d79554b8 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -211,7 +211,7 @@ struct smb_direct_rdma_rw_msg { struct completion *completion; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; - struct scatterlist sg_list[0]; + struct scatterlist sg_list[]; }; static inline int get_buf_page_count(void *buf, int size) diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 82a1429bbe12..8fef9de787d3 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -476,7 +476,7 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, switch (event) { case NETDEV_UP: - if (netdev->priv_flags & IFF_BRIDGE_PORT) + if (netif_is_bridge_port(netdev)) return NOTIFY_OK; list_for_each_entry(iface, &iface_list, entry) { @@ -585,7 +585,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (netdev->priv_flags & IFF_BRIDGE_PORT) + if (netif_is_bridge_port(netdev)) continue; if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) return -ENOMEM; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 19d36393974c..9cebb6ba555b 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -11,7 +11,6 @@ #include <linux/writeback.h> #include <linux/xattr.h> #include <linux/falloc.h> -#include <linux/genhd.h> #include <linux/fsnotify.h> #include <linux/dcache.h> #include <linux/slab.h> diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h index 8857c01093d9..16499ca5c82d 100644 --- a/fs/ksmbd/xattr.h +++ b/fs/ksmbd/xattr.h @@ -76,7 +76,7 @@ struct xattr_acl_entry { struct xattr_smb_acl { int count; int next; - struct xattr_acl_entry entries[0]; + struct xattr_acl_entry entries[]; }; /* 64bytes hash in xattr_ntacl is computed with sha256 */ diff --git a/fs/libfs.c b/fs/libfs.c index 974125270a42..e64bdedef168 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -631,7 +631,7 @@ const struct address_space_operations ram_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); @@ -1198,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void noop_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * There is no page cache to invalidate in the dax case, however - * we need this callback defined to prevent falling back to - * block_invalidatepage() in do_invalidatepage(). - */ -} -EXPORT_SYMBOL_GPL(noop_invalidatepage); - ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* @@ -1231,7 +1220,7 @@ EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 0475c5a5d061..59ef8a1f843f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -184,8 +184,7 @@ lockd(void *vrqstp) dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); - - module_put_and_kthread_exit(0); + return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, @@ -197,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name, xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, net, family, port, - SVC_SOCK_DEFAULTS, cred); + return svc_xprt_create(serv, name, net, family, port, + SVC_SOCK_DEFAULTS, cred); svc_xprt_put(xprt); return 0; } @@ -248,7 +247,8 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_shutdown_net(serv, net); + svc_xprt_destroy_all(serv, net); + svc_rpcb_cleanup(serv, net); return err; } @@ -286,9 +286,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_shutdown_net(serv, net); - dprintk("%s: per-net data destroyed; net=%x\n", - __func__, net->ns.inum); + svc_xprt_destroy_all(serv, net); + svc_rpcb_cleanup(serv, net); } } else { pr_err("%s: no users! net=%x\n", @@ -350,13 +349,6 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static const struct svc_serv_ops lockd_sv_ops = { - .svo_shutdown = svc_rpcb_cleanup, - .svo_function = lockd, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_module = THIS_MODULE, -}; - static int lockd_get(void) { struct svc_serv *serv; @@ -380,7 +372,7 @@ static int lockd_get(void) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9..f1a6610e4ee6 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -442,12 +442,14 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations minix_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = minix_readpage, .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { diff --git a/fs/mpage.c b/fs/mpage.c index 87f5cfef6caa..1fe56f8c495f 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -57,38 +57,14 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } -static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) +static struct bio *mpage_bio_submit(struct bio *bio) { bio->bi_end_io = mpage_end_io; - bio_set_op_attrs(bio, op, op_flags); guard_bio_eod(bio); submit_bio(bio); return NULL; } -static struct bio * -mpage_alloc(struct block_device *bdev, - sector_t first_sector, int nr_vecs, - gfp_t gfp_flags) -{ - struct bio *bio; - - /* Restrict the given (page cache) mask for slab allocations */ - gfp_flags &= GFP_KERNEL; - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = first_sector; - } - return bio; -} - /* * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into @@ -169,17 +145,14 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; - int op_flags; + int op = REQ_OP_READ; unsigned nblocks; unsigned relative_block; - gfp_t gfp; + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); if (args->is_readahead) { - op_flags = REQ_RAHEAD; - gfp = readahead_gfp_mask(page->mapping); - } else { - op_flags = 0; - gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + op |= REQ_RAHEAD; + gfp |= __GFP_NORETRY | __GFP_NOWARN; } if (page_has_buffers(page)) @@ -287,7 +260,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This page will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); + args->bio = mpage_bio_submit(args->bio); alloc_new: if (args->bio == NULL) { @@ -296,15 +269,16 @@ alloc_new: page)) goto out; } - args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - bio_max_segs(args->nr_pages), gfp); + args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op, + gfp); if (args->bio == NULL) goto confused; + args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); } length = first_hole << blkbits; if (bio_add_page(args->bio, page, length, 0) < length) { - args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); + args->bio = mpage_bio_submit(args->bio); goto alloc_new; } @@ -312,7 +286,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); + args->bio = mpage_bio_submit(args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -320,7 +294,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); + args->bio = mpage_bio_submit(args->bio); if (!PageUptodate(page)) block_read_full_page(page, args->get_block); else @@ -383,7 +357,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) put_page(page); } if (args.bio) - mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); + mpage_bio_submit(args.bio); } EXPORT_SYMBOL(mpage_readahead); @@ -400,7 +374,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) args.bio = do_mpage_readpage(&args); if (args.bio) - mpage_bio_submit(REQ_OP_READ, 0, args.bio); + mpage_bio_submit(args.bio); return 0; } EXPORT_SYMBOL(mpage_readpage); @@ -491,7 +465,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = wbc_to_write_flags(wbc); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -504,7 +477,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by - * __set_page_dirty_buffers -> mmapped data + * block_dirty_folio -> mmapped data */ if (buffer_dirty(bh)) goto confused; @@ -599,7 +572,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); + bio = mpage_bio_submit(bio); alloc_new: if (bio == NULL) { @@ -608,13 +581,11 @@ alloc_new: page, wbc)) goto out; } - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH); - if (bio == NULL) - goto confused; - + bio = bio_alloc(bdev, BIO_MAX_VECS, + REQ_OP_WRITE | wbc_to_write_flags(wbc), + GFP_NOFS); + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); wbc_init_bio(wbc, bio); - bio->bi_write_hint = inode->i_write_hint; } /* @@ -625,7 +596,7 @@ alloc_new: wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); + bio = mpage_bio_submit(bio); goto alloc_new; } @@ -635,7 +606,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); + bio = mpage_bio_submit(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -647,7 +618,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); + bio = mpage_bio_submit(bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -703,11 +674,8 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) { - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - REQ_SYNC : 0); - mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); - } + if (mpd.bio) + mpage_bio_submit(mpd.bio); } blk_finish_plug(&plug); return ret; @@ -724,11 +692,8 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) { - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - REQ_SYNC : 0); - mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); - } + if (mpd.bio) + mpage_bio_submit(mpd.bio); return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namei.c b/fs/namei.c index 3f1829b3ab5b..509657fdf4f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3673,18 +3673,14 @@ static struct dentry *filename_create(int dfd, struct filename *name, { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - - error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3698,11 +3694,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + if (last.name[last.len] && !want_dir) + create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, lookup_flags); + dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -3716,7 +3714,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && last.name[last.len])) { + if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } diff --git a/fs/namespace.c b/fs/namespace.c index de6fae84f1a1..afe2b64b14f1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + might_lock(&mount_lock.lock); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + cpu_relax(); + } else { + /* + * This prevents priority inversion, if the task + * setting MNT_WRITE_HOLD got preempted on a remote + * CPU, and it prevents life lock if the task setting + * MNT_WRITE_HOLD has a lower priority and is bound to + * the same CPU as the task that is spinning here. + */ + preempt_enable(); + lock_mount_hash(); + unlock_mount_hash(); + preempt_disable(); + } + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until @@ -563,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; - smp_mb(); - if (mnt_get_writers(mnt) > 0) { - err = -EBUSY; + err = mnt_hold_writers(mnt); + if (err) break; - } } } if (!err && atomic_long_read(&sb->s_remove_count)) @@ -2099,22 +2112,23 @@ static int invent_group_ids(struct mount *mnt, bool recurse) int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); - unsigned int mounts = 0, old, pending, sum; + unsigned int mounts = 0; struct mount *p; + if (ns->mounts >= max) + return -ENOSPC; + max -= ns->mounts; + if (ns->pending_mounts >= max) + return -ENOSPC; + max -= ns->pending_mounts; + for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; - old = ns->mounts; - pending = ns->pending_mounts; - sum = old + pending; - if ((old > sum) || - (pending > sum) || - (max < sum) || - (mounts > (max - sum))) + if (mounts > max) return -ENOSPC; - ns->pending_mounts = pending + mounts; + ns->pending_mounts += mounts; return 0; } @@ -2597,6 +2611,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2611,6 +2626,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } @@ -2906,7 +2922,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - struct path *path, int mnt_flags) + const struct path *path, int mnt_flags) { struct mount *parent = real_mount(path->mnt); @@ -3029,7 +3045,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, return err; } -int finish_automount(struct vfsmount *m, struct path *path) +int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; struct mountpoint *mp; @@ -3998,46 +4014,69 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return 0; } -static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, - struct mount *mnt, int *err) +/** + * mnt_allow_writers() - check whether the attribute change allows writers + * @kattr: the new mount attributes + * @mnt: the mount to which @kattr will be applied + * + * Check whether thew new mount attributes in @kattr allow concurrent writers. + * + * Return: true if writers need to be held, false if not + */ +static inline bool mnt_allow_writers(const struct mount_kattr *kattr, + const struct mount *mnt) { - struct mount *m = mnt, *last = NULL; + return !(kattr->attr_set & MNT_READONLY) || + (mnt->mnt.mnt_flags & MNT_READONLY); +} - if (!is_mounted(&m->mnt)) { - *err = -EINVAL; - goto out; - } +static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) +{ + struct mount *m; + int err; - if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) { - *err = -EINVAL; - goto out; - } + for (m = mnt; m; m = next_mnt(m, mnt)) { + if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { + err = -EPERM; + break; + } - do { - unsigned int flags; + err = can_idmap_mount(kattr, m); + if (err) + break; - flags = recalc_flags(kattr, m); - if (!can_change_locked_flags(m, flags)) { - *err = -EPERM; - goto out; + if (!mnt_allow_writers(kattr, m)) { + err = mnt_hold_writers(m); + if (err) + break; } - *err = can_idmap_mount(kattr, m); - if (*err) - goto out; + if (!kattr->recurse) + return 0; + } - last = m; + if (err) { + struct mount *p; - if ((kattr->attr_set & MNT_READONLY) && - !(m->mnt.mnt_flags & MNT_READONLY)) { - *err = mnt_hold_writers(m); - if (*err) - goto out; + /* + * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will + * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all + * mounts and needs to take care to include the first mount. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { + /* If we had to hold writers unblock them. */ + if (p->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt_unhold_writers(p); + + /* + * We're done once the first mount we changed got + * MNT_WRITE_HOLD unset. + */ + if (p == m) + break; } - } while (kattr->recurse && (m = next_mnt(m, mnt))); - -out: - return last; + } + return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) @@ -4065,48 +4104,32 @@ static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) put_user_ns(old_mnt_userns); } -static void mount_setattr_commit(struct mount_kattr *kattr, - struct mount *mnt, struct mount *last, - int err) +static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { - struct mount *m = mnt; + struct mount *m; - do { - if (!err) { - unsigned int flags; + for (m = mnt; m; m = next_mnt(m, mnt)) { + unsigned int flags; - do_idmap_mount(kattr, m); - flags = recalc_flags(kattr, m); - WRITE_ONCE(m->mnt.mnt_flags, flags); - } + do_idmap_mount(kattr, m); + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); - /* - * We either set MNT_READONLY above so make it visible - * before ~MNT_WRITE_HOLD or we failed to recursively - * apply mount options. - */ - if ((kattr->attr_set & MNT_READONLY) && - (m->mnt.mnt_flags & MNT_WRITE_HOLD)) + /* If we had to hold writers unblock them. */ + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(m); - if (!err && kattr->propagation) + if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - - /* - * On failure, only cleanup until we found the first mount - * we failed to handle. - */ - if (err && m == last) + if (!kattr->recurse) break; - } while (kattr->recurse && (m = next_mnt(m, mnt))); - - if (!err) - touch_mnt_namespace(mnt->mnt_ns); + } + touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { - struct mount *mnt = real_mount(path->mnt), *last = NULL; + struct mount *mnt = real_mount(path->mnt); int err = 0; if (path->dentry != mnt->mnt.mnt_root) @@ -4127,16 +4150,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) } } + err = -EINVAL; lock_mount_hash(); + /* Ensure that this isn't anything purely vfs internal. */ + if (!is_mounted(&mnt->mnt)) + goto out; + + /* + * If this is an attached mount make sure it's located in the callers + * mount namespace. If it's not don't let the caller interact with it. + * If this is a detached mount make sure it has an anonymous mount + * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. + */ + if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + goto out; + /* - * Get the mount tree in a shape where we can change mount - * properties without failure. + * First, we get the mount tree in a shape where we can change mount + * properties without failure. If we succeeded to do so we commit all + * changes and if we failed we clean up. */ - last = mount_setattr_prepare(kattr, mnt, &err); - if (last) /* Commit all changes or revert to the old state. */ - mount_setattr_commit(kattr, mnt, last, err); + err = mount_setattr_prepare(kattr, mnt); + if (!err) + mount_setattr_commit(kattr, mnt); +out: unlock_mount_hash(); if (kattr->propagation) { diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index c15bfc966d96..f684c0cd1ec5 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -1,5 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -netfs-y := read_helper.o stats.o +netfs-y := \ + buffered_read.o \ + io.o \ + main.o \ + objects.o + +netfs-$(CONFIG_NETFS_STATS) += stats.o obj-$(CONFIG_NETFS_SUPPORT) := netfs.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c new file mode 100644 index 000000000000..281a88a5b8dc --- /dev/null +++ b/fs/netfs/buffered_read.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Network filesystem high-level buffered read support. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * Unlock the folios in a read operation. We need to set PG_fscache on any + * folios we're going to write back before we unlock them. + */ +void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + unsigned int iopos, account = 0; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { + __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + } + } + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + iopos = 0; + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + unsigned int pgend = pgpos + folio_size(folio); + bool pg_failed = false; + + for (;;) { + if (!subreq) { + pg_failed = true; + break; + } + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + folio_start_fscache(folio); + pg_failed |= subreq_failed; + if (pgend < iopos + subreq->len) + break; + + account += subreq->transferred; + iopos += subreq->len; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + if (pgend == iopos) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio_index(folio) == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); + + task_io_account_read(account); + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); +} + +static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, + loff_t *_start, size_t *_len, loff_t i_size) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops && cres->ops->expand_readahead) + cres->ops->expand_readahead(cres, _start, _len, i_size); +} + +static void netfs_rreq_expand(struct netfs_io_request *rreq, + struct readahead_control *ractl) +{ + /* Give the cache a chance to change the request parameters. The + * resultant request must contain the original region. + */ + netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); + + /* Give the netfs a chance to change the request parameters. The + * resultant request must contain the original region. + */ + if (rreq->netfs_ops->expand_readahead) + rreq->netfs_ops->expand_readahead(rreq); + + /* Expand the request if the cache wants it to start earlier. Note + * that the expansion may get further extended if the VM wishes to + * insert THPs and the preferred start and/or end wind up in the middle + * of THPs. + * + * If this is the case, however, the THP size should be an integer + * multiple of the cache granule size, so we get a whole number of + * granules to deal with. + */ + if (rreq->start != readahead_pos(ractl) || + rreq->len != readahead_length(ractl)) { + readahead_expand(ractl, rreq->start, rreq->len); + rreq->start = readahead_pos(ractl); + rreq->len = readahead_length(ractl); + + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_expanded); + } +} + +/** + * netfs_readahead - Helper to manage a read request + * @ractl: The description of the readahead request + * + * Fulfil a readahead request by drawing data from the cache if possible, or + * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O + * requests from different sources will get munged together. If necessary, the + * readahead window can be expanded in either direction to a more convenient + * alighment for RPC efficiency or to make storage in the cache feasible. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. + */ +void netfs_readahead(struct readahead_control *ractl) +{ + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); + int ret; + + _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + + if (readahead_count(ractl) == 0) + return; + + rreq = netfs_alloc_request(ractl->mapping, ractl->file, + readahead_pos(ractl), + readahead_length(ractl), + NETFS_READAHEAD); + if (IS_ERR(rreq)) + return; + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + } + + netfs_stat(&netfs_n_rh_readahead); + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_readahead); + + netfs_rreq_expand(rreq, ractl); + + /* Drop the refs on the folios here rather than in the cache or + * filesystem. The locks will be dropped in netfs_rreq_unlock(). + */ + while (readahead_folio(ractl)) + ; + + netfs_begin_read(rreq, false); + return; + +cleanup_free: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + return; +} +EXPORT_SYMBOL(netfs_readahead); + +/** + * netfs_readpage - Helper to manage a readpage request + * @file: The file to read from + * @subpage: A subpage of the folio to read + * + * Fulfil a readpage request by drawing data from the cache if possible, or the + * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests + * from different sources will get munged together. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. + */ +int netfs_readpage(struct file *file, struct page *subpage) +{ + struct folio *folio = page_folio(subpage); + struct address_space *mapping = folio_file_mapping(folio); + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(mapping->host); + int ret; + + _enter("%lx", folio_index(folio)); + + rreq = netfs_alloc_request(mapping, file, + folio_file_pos(folio), folio_size(folio), + NETFS_READPAGE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto alloc_error; + } + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; + } + + netfs_stat(&netfs_n_rh_readpage); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); + return netfs_begin_read(rreq, true); + +discard: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +alloc_error: + folio_unlock(folio); + return ret; +} +EXPORT_SYMBOL(netfs_readpage); + +/* + * Prepare a folio for writing without reading first + * @folio: The folio being prepared + * @pos: starting position for the write + * @len: length of write + * @always_fill: T if the folio should always be completely filled/cleared + * + * In some cases, write_begin doesn't need to read at all: + * - full folio write + * - write that lies in a folio that is completely beyond EOF + * - write that covers the folio from start to EOF or beyond it + * + * If any of these criteria are met, then zero out the unwritten parts + * of the folio and return true. Otherwise, return false. + */ +static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, + bool always_fill) +{ + struct inode *inode = folio_inode(folio); + loff_t i_size = i_size_read(inode); + size_t offset = offset_in_folio(folio, pos); + size_t plen = folio_size(folio); + + if (unlikely(always_fill)) { + if (pos - offset + len <= i_size) + return false; /* Page entirely before EOF */ + zero_user_segment(&folio->page, 0, plen); + folio_mark_uptodate(folio); + return true; + } + + /* Full folio write */ + if (offset == 0 && len >= plen) + return true; + + /* Page entirely beyond the end of the file */ + if (pos - offset >= i_size) + goto zero_out; + + /* Write that covers from the start of the folio to EOF or beyond */ + if (offset == 0 && (pos + len) >= i_size) + goto zero_out; + + return false; +zero_out: + zero_user_segments(&folio->page, 0, offset, offset + len, plen); + return true; +} + +/** + * netfs_write_begin - Helper to prepare for writing + * @file: The file to read from + * @mapping: The mapping to read from + * @pos: File position at which the write will begin + * @len: The length of the write (may extend beyond the end of the folio chosen) + * @aop_flags: AOP_* flags + * @_folio: Where to put the resultant folio + * @_fsdata: Place for the netfs to store a cookie + * + * Pre-read data for a write-begin request by drawing data from the cache if + * possible, or the netfs if not. Space beyond the EOF is zero-filled. + * Multiple I/O requests from different sources will get munged together. If + * necessary, the readahead window can be expanded in either direction to a + * more convenient alighment for RPC efficiency or to make storage in the cache + * feasible. + * + * The calling netfs must provide a table of operations, only one of which, + * issue_op, is mandatory. + * + * The check_write_begin() operation can be provided to check for and flush + * conflicting writes once the folio is grabbed and locked. It is passed a + * pointer to the fsdata cookie that gets returned to the VM to be passed to + * write_end. It is permitted to sleep. It should return 0 if the request + * should go ahead; unlock the folio and return -EAGAIN to cause the folio to + * be regot; or return an error. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. + */ +int netfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int aop_flags, + struct folio **_folio, void **_fsdata) +{ + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); + struct folio *folio; + unsigned int fgp_flags; + pgoff_t index = pos >> PAGE_SHIFT; + int ret; + + DEFINE_READAHEAD(ractl, file, NULL, mapping, index); + +retry: + fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + if (aop_flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + folio = __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); + if (!folio) + return -ENOMEM; + + if (ctx->ops->check_write_begin) { + /* Allow the netfs (eg. ceph) to flush conflicts. */ + ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + if (ret < 0) { + trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); + if (ret == -EAGAIN) + goto retry; + goto error; + } + } + + if (folio_test_uptodate(folio)) + goto have_folio; + + /* If the page is beyond the EOF, we want to clear it - unless it's + * within the cache granule containing the EOF, in which case we need + * to preload the granule. + */ + if (!netfs_is_cache_enabled(ctx) && + netfs_skip_folio_read(folio, pos, len, false)) { + netfs_stat(&netfs_n_rh_write_zskip); + goto have_folio_no_wait; + } + + rreq = netfs_alloc_request(mapping, file, + folio_file_pos(folio), folio_size(folio), + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + rreq->no_unlock_folio = folio_index(folio); + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + } + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); + + /* Expand the request to meet caching requirements and download + * preferences. + */ + ractl._nr_pages = folio_nr_pages(folio); + netfs_rreq_expand(rreq, &ractl); + + /* We hold the folio locks, so we can drop the references */ + folio_get(folio); + while (readahead_folio(&ractl)) + ; + + ret = netfs_begin_read(rreq, true); + if (ret < 0) + goto error; + +have_folio: + ret = folio_wait_fscache_killable(folio); + if (ret < 0) + goto error; +have_folio_no_wait: + *_folio = folio; + _leave(" = 0"); + return 0; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); +error: + folio_unlock(folio); + folio_put(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_write_begin); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7f2c4459f33..b7b0e3d18d9e 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,6 +5,10 @@ * Written by David Howells (dhowells@redhat.com) */ +#include <linux/netfs.h> +#include <linux/fscache.h> +#include <trace/events/netfs.h> + #ifdef pr_fmt #undef pr_fmt #endif @@ -12,11 +16,40 @@ #define pr_fmt(fmt) "netfs: " fmt /* - * read_helper.c + * buffered_read.c + */ +void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); + +/* + * io.c + */ +int netfs_begin_read(struct netfs_io_request *rreq, bool sync); + +/* + * main.c */ extern unsigned int netfs_debug; /* + * objects.c + */ +struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + struct file *file, + loff_t start, size_t len, + enum netfs_io_origin origin); +void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async); +void netfs_put_request(struct netfs_io_request *rreq, bool was_async, + enum netfs_rreq_ref_trace what); +struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); + +static inline void netfs_see_request(struct netfs_io_request *rreq, + enum netfs_rreq_ref_trace what) +{ + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); +} + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS @@ -55,6 +88,21 @@ static inline void netfs_stat_d(atomic_t *stat) #define netfs_stat_d(x) do {} while(0) #endif +/* + * Miscellaneous functions. + */ +static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +{ +#if IS_ENABLED(CONFIG_FSCACHE) + struct fscache_cookie *cookie = ctx->cache; + + return fscache_cookie_valid(cookie) && cookie->cache_priv && + fscache_cookie_enabled(cookie); +#else + return false; +#endif +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/netfs/io.c b/fs/netfs/io.c new file mode 100644 index 000000000000..428925899282 --- /dev/null +++ b/fs/netfs/io.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Network filesystem high-level read support. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/module.h> +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * Clear the unread part of an I/O request. + */ +static void netfs_clear_unread(struct netfs_io_subrequest *subreq) +{ + struct iov_iter iter; + + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + iov_iter_zero(iov_iter_count(&iter), &iter); +} + +static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + + netfs_subreq_terminated(subreq, transferred_or_error, was_async); +} + +/* + * Issue a read against the cache. + * - Eats the caller's ref on subreq. + */ +static void netfs_read_from_cache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + enum netfs_read_from_hole read_hole) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct iov_iter iter; + + netfs_stat(&netfs_n_rh_read); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + + cres->ops->read(cres, subreq->start, &iter, read_hole, + netfs_cache_read_terminated, subreq); +} + +/* + * Fill a subrequest region with zeroes. + */ +static void netfs_fill_with_zeroes(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_zero); + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, 0, false); +} + +/* + * Ask the netfs to issue a read request to the server for us. + * + * The netfs is expected to read from subreq->pos + subreq->transferred to + * subreq->pos + subreq->len - 1. It may not backtrack and write data into the + * buffer prior to the transferred point as it might clobber dirty data + * obtained from the cache. + * + * Alternatively, the netfs is allowed to indicate one of two things: + * + * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and + * make progress. + * + * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be + * cleared. + */ +static void netfs_read_from_server(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_download); + rreq->netfs_ops->issue_read(subreq); +} + +/* + * Release those waiting. + */ +static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_done); + netfs_clear_subrequests(rreq, was_async); + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of writing the data to the cache. We have to clear + * the PG_fscache bits on the folios involved and release the caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. + */ + if (have_unlocked && folio_index(folio) <= unlocked) + continue; + unlocked = folio_index(folio); + folio_end_fscache(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * Perform any outstanding writes to the cache. We inherit a ref from the + * caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + +/* + * Handle a short read. + */ +static void netfs_rreq_short_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); + + netfs_stat(&netfs_n_rh_short_read); + trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read); + atomic_inc(&rreq->nr_outstanding); + if (subreq->source == NETFS_READ_FROM_CACHE) + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); + else + netfs_read_from_server(rreq, subreq); +} + +/* + * Resubmit any short or failed operations. Returns true if we got the rreq + * ref back. + */ +static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + WARN_ON(in_interrupt()); + + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + + /* We don't want terminating submissions trying to wake us up whilst + * we're still going through the list. + */ + atomic_inc(&rreq->nr_outstanding); + + __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error) { + if (subreq->source != NETFS_READ_FROM_CACHE) + break; + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->error = 0; + netfs_stat(&netfs_n_rh_download_instead); + trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + atomic_inc(&rreq->nr_outstanding); + netfs_read_from_server(rreq, subreq); + } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + netfs_rreq_short_read(rreq, subreq); + } + } + + /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_outstanding)) + return true; + + wake_up_var(&rreq->nr_outstanding); + return false; +} + +/* + * Check to see if the data read is still valid. + */ +static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + if (!rreq->netfs_ops->is_still_valid || + rreq->netfs_ops->is_still_valid(rreq)) + return; + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->source == NETFS_READ_FROM_CACHE) { + subreq->error = -ESTALE; + __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } + } +} + +/* + * Assess the state of a read request and decide what to do next. + * + * Note that we could be in an ordinary kernel thread, on a workqueue or in + * softirq context at this point. We inherit a ref from the caller. + */ +static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + +again: + netfs_rreq_is_still_valid(rreq); + + if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && + test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { + if (netfs_rreq_perform_resubmissions(rreq)) + goto again; + return; + } + + netfs_rreq_unlock_folios(rreq); + + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_work(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + netfs_rreq_assess(rreq, false); +} + +/* + * Handle the completion of all outstanding I/O operations on a read request. + * We inherit a ref from the caller. + */ +static void netfs_rreq_terminated(struct netfs_io_request *rreq, + bool was_async) +{ + if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && + was_async) { + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_rreq_assess(rreq, was_async); + } +} + +/** + * netfs_subreq_terminated - Note the termination of an I/O operation. + * @subreq: The I/O request that has terminated. + * @transferred_or_error: The amount of data transferred or an error code. + * @was_async: The termination was asynchronous + * + * This tells the read helper that a contributory I/O operation has terminated, + * one way or another, and that it should integrate the results. + * + * The caller indicates in @transferred_or_error the outcome of the operation, + * supplying a positive value to indicate the number of bytes transferred, 0 to + * indicate a failure to transfer anything that should be retried or a negative + * error code. The helper will look after reissuing I/O operations as + * appropriate and writing downloaded data to the cache. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, + ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + int u; + + _enter("[%u]{%llx,%lx},%zd", + subreq->debug_index, subreq->start, subreq->flags, + transferred_or_error); + + switch (subreq->source) { + case NETFS_READ_FROM_CACHE: + netfs_stat(&netfs_n_rh_read_done); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download_done); + break; + default: + break; + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_read); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq overread: R%x[%x] %zd > %zu - %zu", + rreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + if (subreq->transferred < subreq->len) + goto incomplete; + +complete: + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); + +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + u = atomic_dec_return(&rreq->nr_outstanding); + if (u == 0) + netfs_rreq_terminated(rreq, was_async); + else if (u == 1) + wake_up_var(&rreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { + netfs_clear_unread(subreq); + subreq->transferred = subreq->len; + goto complete; + } + + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + goto out; + +failed: + if (subreq->source == NETFS_READ_FROM_CACHE) { + netfs_stat(&netfs_n_rh_read_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } else { + netfs_stat(&netfs_n_rh_download_failed); + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + rreq->error = subreq->error; + } + goto out; +} +EXPORT_SYMBOL(netfs_subreq_terminated); + +static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops) + return cres->ops->prepare_read(subreq, i_size); + if (subreq->start >= rreq->i_size) + return NETFS_FILL_WITH_ZEROES; + return NETFS_DOWNLOAD_FROM_SERVER; +} + +/* + * Work out what sort of subrequest the next one will be. + */ +static enum netfs_io_source +netfs_rreq_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + enum netfs_io_source source; + + _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + + if (source == NETFS_DOWNLOAD_FROM_SERVER) { + /* Call out to the netfs to let it shrink the request to fit + * its own I/O sizes and boundaries. If it shinks it here, it + * will be called again to make simultaneous calls; if it wants + * to make serial calls, it can indicate a short read and then + * we will call it again. + */ + if (subreq->len > rreq->i_size - subreq->start) + subreq->len = rreq->i_size - subreq->start; + + if (rreq->netfs_ops->clamp_length && + !rreq->netfs_ops->clamp_length(subreq)) { + source = NETFS_INVALID_READ; + goto out; + } + } + + if (WARN_ON(subreq->len == 0)) + source = NETFS_INVALID_READ; + +out: + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + return source; +} + +/* + * Slice off a piece of a read request and submit an I/O request for it. + */ +static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + unsigned int *_debug_index) +{ + struct netfs_io_subrequest *subreq; + enum netfs_io_source source; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + return false; + + subreq->debug_index = (*_debug_index)++; + subreq->start = rreq->start + rreq->submitted; + subreq->len = rreq->len - rreq->submitted; + + _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + + /* Call out to the cache to find out what it can do with the remaining + * subset. It tells us in subreq->flags what it decided should be done + * and adjusts subreq->len down if the subset crosses a cache boundary. + * + * Then when we hand the subset, it can choose to take a subset of that + * (the starts must coincide), in which case, we go around the loop + * again and ask it to download the next piece. + */ + source = netfs_rreq_prepare_read(rreq, subreq); + if (source == NETFS_INVALID_READ) + goto subreq_failed; + + atomic_inc(&rreq->nr_outstanding); + + rreq->submitted += subreq->len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + switch (source) { + case NETFS_FILL_WITH_ZEROES: + netfs_fill_with_zeroes(rreq, subreq); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_read_from_server(rreq, subreq); + break; + case NETFS_READ_FROM_CACHE: + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); + break; + default: + BUG(); + } + + return true; + +subreq_failed: + rreq->error = subreq->error; + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed); + return false; +} + +/* + * Begin the process of reading in a chunk of data, where that data may be + * stitched together from multiple sources, including multiple servers and the + * local cache. + */ +int netfs_begin_read(struct netfs_io_request *rreq, bool sync) +{ + unsigned int debug_index = 0; + int ret; + + _enter("R=%x %llx-%llx", + rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); + + if (rreq->len == 0) { + pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); + return -EIO; + } + + INIT_WORK(&rreq->work, netfs_rreq_work); + + if (sync) + netfs_get_request(rreq, netfs_rreq_trace_get_hold); + + /* Chop the read into slices according to what the cache and the netfs + * want and submit each one. + */ + atomic_set(&rreq->nr_outstanding, 1); + do { + if (!netfs_rreq_submit_slice(rreq, &debug_index)) + break; + + } while (rreq->submitted < rreq->len); + + if (sync) { + /* Keep nr_outstanding incremented so that the ref always belongs to + * us, and the service code isn't punted off to a random thread pool to + * process. + */ + for (;;) { + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + netfs_rreq_assess(rreq, false); + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + cond_resched(); + } + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); + } else { + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_assess(rreq, false); + ret = 0; + } + return ret; +} diff --git a/fs/netfs/main.c b/fs/netfs/main.c new file mode 100644 index 000000000000..068568702957 --- /dev/null +++ b/fs/netfs/main.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Miscellaneous bits for the netfs support library. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/module.h> +#include <linux/export.h> +#include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/netfs.h> + +MODULE_DESCRIPTION("Network fs support"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned netfs_debug; +module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c new file mode 100644 index 000000000000..e86107b30ba4 --- /dev/null +++ b/fs/netfs/objects.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Object lifetime handling and tracing. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include "internal.h" + +/* + * Allocate an I/O request and initialise it. + */ +struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + struct file *file, + loff_t start, size_t len, + enum netfs_io_origin origin) +{ + static atomic_t debug_ids; + struct inode *inode = file ? file_inode(file) : mapping->host; + struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_io_request *rreq; + int ret; + + rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + if (!rreq) + return ERR_PTR(-ENOMEM); + + rreq->start = start; + rreq->len = len; + rreq->origin = origin; + rreq->netfs_ops = ctx->ops; + rreq->mapping = mapping; + rreq->inode = inode; + rreq->i_size = i_size_read(inode); + rreq->debug_id = atomic_inc_return(&debug_ids); + INIT_LIST_HEAD(&rreq->subrequests); + refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (rreq->netfs_ops->init_request) { + ret = rreq->netfs_ops->init_request(rreq, file); + if (ret < 0) { + kfree(rreq); + return ERR_PTR(ret); + } + } + + netfs_stat(&netfs_n_rh_rreq); + return rreq; +} + +void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) +{ + int r; + + __refcount_inc(&rreq->ref, &r); + trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); +} + +void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + + while (!list_empty(&rreq->subrequests)) { + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, was_async, + netfs_sreq_trace_put_clear); + } +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_clear_subrequests(rreq, false); + if (rreq->netfs_priv) + rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); + trace_netfs_rreq(rreq, netfs_rreq_trace_free); + if (rreq->cache_resources.ops) + rreq->cache_resources.ops->end_operation(&rreq->cache_resources); + kfree(rreq); + netfs_stat_d(&netfs_n_rh_rreq); +} + +void netfs_put_request(struct netfs_io_request *rreq, bool was_async, + enum netfs_rreq_ref_trace what) +{ + unsigned int debug_id = rreq->debug_id; + bool dead; + int r; + + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } + } +} + +/* + * Allocate and partially initialise an I/O request structure. + */ +struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + if (subreq) { + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->ref, 2); + subreq->rreq = rreq; + netfs_get_request(rreq, netfs_rreq_trace_get_subreq); + netfs_stat(&netfs_n_rh_sreq); + } + + return subreq; +} + +void netfs_get_subrequest(struct netfs_io_subrequest *subreq, + enum netfs_sreq_ref_trace what) +{ + int r; + + __refcount_inc(&subreq->ref, &r); + trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, r + 1, + what); +} + +static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + trace_netfs_sreq(subreq, netfs_sreq_trace_free); + kfree(subreq); + netfs_stat_d(&netfs_n_rh_sreq); + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); +} + +void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, + enum netfs_sreq_ref_trace what) +{ + unsigned int debug_index = subreq->debug_index; + unsigned int debug_id = subreq->rreq->debug_id; + bool dead; + int r; + + dead = __refcount_dec_and_test(&subreq->ref, &r); + trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); + if (dead) + netfs_free_subrequest(subreq, was_async); +} diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c deleted file mode 100644 index 501da990c259..000000000000 --- a/fs/netfs/read_helper.c +++ /dev/null @@ -1,1205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Network filesystem high-level read support. - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/uio.h> -#include <linux/sched/mm.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/netfs.h> -#include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/netfs.h> - -MODULE_DESCRIPTION("Network fs support"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned netfs_debug; -module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); - -static void netfs_rreq_work(struct work_struct *); -static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool); - -static void netfs_put_subrequest(struct netfs_read_subrequest *subreq, - bool was_async) -{ - if (refcount_dec_and_test(&subreq->usage)) - __netfs_put_subrequest(subreq, was_async); -} - -static struct netfs_read_request *netfs_alloc_read_request( - const struct netfs_read_request_ops *ops, void *netfs_priv, - struct file *file) -{ - static atomic_t debug_ids; - struct netfs_read_request *rreq; - - rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL); - if (rreq) { - rreq->netfs_ops = ops; - rreq->netfs_priv = netfs_priv; - rreq->inode = file_inode(file); - rreq->i_size = i_size_read(rreq->inode); - rreq->debug_id = atomic_inc_return(&debug_ids); - INIT_LIST_HEAD(&rreq->subrequests); - INIT_WORK(&rreq->work, netfs_rreq_work); - refcount_set(&rreq->usage, 1); - __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (ops->init_rreq) - ops->init_rreq(rreq, file); - netfs_stat(&netfs_n_rh_rreq); - } - - return rreq; -} - -static void netfs_get_read_request(struct netfs_read_request *rreq) -{ - refcount_inc(&rreq->usage); -} - -static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq, - bool was_async) -{ - struct netfs_read_subrequest *subreq; - - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_read_subrequest, rreq_link); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async); - } -} - -static void netfs_free_read_request(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - netfs_rreq_clear_subreqs(rreq, false); - if (rreq->netfs_priv) - rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); - trace_netfs_rreq(rreq, netfs_rreq_trace_free); - if (rreq->cache_resources.ops) - rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); - netfs_stat_d(&netfs_n_rh_rreq); -} - -static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async) -{ - if (refcount_dec_and_test(&rreq->usage)) { - if (was_async) { - rreq->work.func = netfs_free_read_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_read_request(&rreq->work); - } - } -} - -/* - * Allocate and partially initialise an I/O request structure. - */ -static struct netfs_read_subrequest *netfs_alloc_subrequest( - struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->usage, 2); - subreq->rreq = rreq; - netfs_get_read_request(rreq); - netfs_stat(&netfs_n_rh_sreq); - } - - return subreq; -} - -static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq) -{ - refcount_inc(&subreq->usage); -} - -static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq, - bool was_async) -{ - struct netfs_read_request *rreq = subreq->rreq; - - trace_netfs_sreq(subreq, netfs_sreq_trace_free); - kfree(subreq); - netfs_stat_d(&netfs_n_rh_sreq); - netfs_put_read_request(rreq, was_async); -} - -/* - * Clear the unread part of an I/O request. - */ -static void netfs_clear_unread(struct netfs_read_subrequest *subreq) -{ - struct iov_iter iter; - - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); -} - -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_subrequest *subreq = priv; - - netfs_subreq_terminated(subreq, transferred_or_error, was_async); -} - -/* - * Issue a read against the cache. - * - Eats the caller's ref on subreq. - */ -static void netfs_read_from_cache(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq, - enum netfs_read_from_hole read_hole) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; - - netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, - netfs_cache_read_terminated, subreq); -} - -/* - * Fill a subrequest region with zeroes. - */ -static void netfs_fill_with_zeroes(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_zero); - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, 0, false); -} - -/* - * Ask the netfs to issue a read request to the server for us. - * - * The netfs is expected to read from subreq->pos + subreq->transferred to - * subreq->pos + subreq->len - 1. It may not backtrack and write data into the - * buffer prior to the transferred point as it might clobber dirty data - * obtained from the cache. - * - * Alternatively, the netfs is allowed to indicate one of two things: - * - * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and - * make progress. - * - * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be - * cleared. - */ -static void netfs_read_from_server(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_download); - rreq->netfs_ops->issue_op(subreq); -} - -/* - * Release those waiting. - */ -static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_rreq_clear_subreqs(rreq, was_async); - netfs_put_read_request(rreq, was_async); -} - -/* - * Deal with the completion of writing the data to the cache. We have to clear - * the PG_fscache bits on the folios involved and release the caller's ref. - * - * May be called in softirq mode and we inherit a ref from the caller. - */ -static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq, - bool was_async) -{ - struct netfs_read_subrequest *subreq; - struct folio *folio; - pgoff_t unlocked = 0; - bool have_unlocked = false; - - rcu_read_lock(); - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); - - xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { - /* We might have multiple writes from the same huge - * folio, but we mustn't unlock a folio more than once. - */ - if (have_unlocked && folio_index(folio) <= unlocked) - continue; - unlocked = folio_index(folio); - folio_end_fscache(folio); - have_unlocked = true; - } - } - - rcu_read_unlock(); - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_subrequest *subreq = priv; - struct netfs_read_request *rreq = subreq->rreq; - - if (IS_ERR_VALUE(transferred_or_error)) { - netfs_stat(&netfs_n_rh_write_failed); - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_copy_to_cache); - } else { - netfs_stat(&netfs_n_rh_write_done); - } - - trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); - - /* If we decrement nr_wr_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_wr_ops)) - netfs_rreq_unmark_after_write(rreq, was_async); - - netfs_put_subrequest(subreq, was_async); -} - -/* - * Perform any outstanding writes to the cache. We inherit a ref from the - * caller. - */ -static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct netfs_read_subrequest *subreq, *next, *p; - struct iov_iter iter; - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_write); - - /* We don't want terminating writes trying to wake us up whilst we're - * still going through the list. - */ - atomic_inc(&rreq->nr_wr_ops); - - list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) { - list_del_init(&subreq->rreq_link); - netfs_put_subrequest(subreq, false); - } - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - /* Amalgamate adjacent writes */ - while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - next = list_next_entry(subreq, rreq_link); - if (next->start != subreq->start + subreq->len) - break; - subreq->len += next->len; - list_del_init(&next->rreq_link); - netfs_put_subrequest(next, false); - } - - ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); - if (ret < 0) { - trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); - trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); - continue; - } - - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, - subreq->start, subreq->len); - - atomic_inc(&rreq->nr_wr_ops); - netfs_stat(&netfs_n_rh_write); - netfs_get_read_subrequest(subreq); - trace_netfs_sreq(subreq, netfs_sreq_trace_write); - cres->ops->write(cres, subreq->start, &iter, - netfs_rreq_copy_terminated, subreq); - } - - /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_wr_ops)) - netfs_rreq_unmark_after_write(rreq, false); -} - -static void netfs_rreq_write_to_cache_work(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - - netfs_rreq_do_write_to_cache(rreq); -} - -static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq) -{ - rreq->work.func = netfs_rreq_write_to_cache_work; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); -} - -/* - * Unlock the folios in a read operation. We need to set PG_fscache on any - * folios we're going to write back before we unlock them. - */ -static void netfs_rreq_unlock(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - struct folio *folio; - unsigned int iopos, account = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { - __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); - } - } - - /* Walk through the pagecache and the I/O request lists simultaneously. - * We may have a mixture of cached and uncached sections and we only - * really want to write out the uncached sections. This is slightly - * complicated by the possibility that we might have huge pages with a - * mixture inside. - */ - subreq = list_first_entry(&rreq->subrequests, - struct netfs_read_subrequest, rreq_link); - iopos = 0; - subreq_failed = (subreq->error < 0); - - trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); - - rcu_read_lock(); - xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); - bool pg_failed = false; - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) - folio_start_fscache(folio); - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - account += subreq->transferred; - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) { - flush_dcache_folio(folio); - folio_mark_uptodate(folio); - } - - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); - else - folio_unlock(folio); - } - } - rcu_read_unlock(); - - task_io_account_read(account); - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); -} - -/* - * Handle a short read. - */ -static void netfs_rreq_short_read(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); - __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); - - netfs_stat(&netfs_n_rh_short_read); - trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); - - netfs_get_read_subrequest(subreq); - atomic_inc(&rreq->nr_rd_ops); - if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); - else - netfs_read_from_server(rreq, subreq); -} - -/* - * Resubmit any short or failed operations. Returns true if we got the rreq - * ref back. - */ -static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - WARN_ON(in_interrupt()); - - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); - - /* We don't want terminating submissions trying to wake us up whilst - * we're still going through the list. - */ - atomic_inc(&rreq->nr_rd_ops); - - __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error) { - if (subreq->source != NETFS_READ_FROM_CACHE) - break; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->error = 0; - netfs_stat(&netfs_n_rh_download_instead); - trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); - netfs_get_read_subrequest(subreq); - atomic_inc(&rreq->nr_rd_ops); - netfs_read_from_server(rreq, subreq); - } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) { - netfs_rreq_short_read(rreq, subreq); - } - } - - /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_rd_ops)) - return true; - - wake_up_var(&rreq->nr_rd_ops); - return false; -} - -/* - * Check to see if the data read is still valid. - */ -static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - if (!rreq->netfs_ops->is_still_valid || - rreq->netfs_ops->is_still_valid(rreq)) - return; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->source == NETFS_READ_FROM_CACHE) { - subreq->error = -ESTALE; - __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } - } -} - -/* - * Assess the state of a read request and decide what to do next. - * - * Note that we could be in an ordinary kernel thread, on a workqueue or in - * softirq context at this point. We inherit a ref from the caller. - */ -static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); - -again: - netfs_rreq_is_still_valid(rreq); - - if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && - test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { - if (netfs_rreq_perform_resubmissions(rreq)) - goto again; - return; - } - - netfs_rreq_unlock(rreq); - - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); - - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags)) - return netfs_rreq_write_to_cache(rreq); - - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_work(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - netfs_rreq_assess(rreq, false); -} - -/* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. - */ -static void netfs_rreq_terminated(struct netfs_read_request *rreq, - bool was_async) -{ - if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && - was_async) { - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_rreq_assess(rreq, was_async); - } -} - -/** - * netfs_subreq_terminated - Note the termination of an I/O operation. - * @subreq: The I/O request that has terminated. - * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous - * - * This tells the read helper that a contributory I/O operation has terminated, - * one way or another, and that it should integrate the results. - * - * The caller indicates in @transferred_or_error the outcome of the operation, - * supplying a positive value to indicate the number of bytes transferred, 0 to - * indicate a failure to transfer anything that should be retried or a negative - * error code. The helper will look after reissuing I/O operations as - * appropriate and writing downloaded data to the cache. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. - */ -void netfs_subreq_terminated(struct netfs_read_subrequest *subreq, - ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_request *rreq = subreq->rreq; - int u; - - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); - - switch (subreq->source) { - case NETFS_READ_FROM_CACHE: - netfs_stat(&netfs_n_rh_read_done); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_stat(&netfs_n_rh_download_done); - break; - default: - break; - } - - if (IS_ERR_VALUE(transferred_or_error)) { - subreq->error = transferred_or_error; - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_read); - goto failed; - } - - if (WARN(transferred_or_error > subreq->len - subreq->transferred, - "Subreq overread: R%x[%x] %zd > %zu - %zu", - rreq->debug_id, subreq->debug_index, - transferred_or_error, subreq->len, subreq->transferred)) - transferred_or_error = subreq->len - subreq->transferred; - - subreq->error = 0; - subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) - goto incomplete; - -complete: - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) - set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - -out: - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ - u = atomic_dec_return(&rreq->nr_rd_ops); - if (u == 0) - netfs_rreq_terminated(rreq, was_async); - else if (u == 1) - wake_up_var(&rreq->nr_rd_ops); - - netfs_put_subrequest(subreq, was_async); - return; - -incomplete: - if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - goto complete; - } - - if (transferred_or_error == 0) { - if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; - goto failed; - } - } else { - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - - __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - goto out; - -failed: - if (subreq->source == NETFS_READ_FROM_CACHE) { - netfs_stat(&netfs_n_rh_read_failed); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } else { - netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; - } - goto out; -} -EXPORT_SYMBOL(netfs_subreq_terminated); - -static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq, - loff_t i_size) -{ - struct netfs_read_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops) - return cres->ops->prepare_read(subreq, i_size); - if (subreq->start >= rreq->i_size) - return NETFS_FILL_WITH_ZEROES; - return NETFS_DOWNLOAD_FROM_SERVER; -} - -/* - * Work out what sort of subrequest the next one will be. - */ -static enum netfs_read_source -netfs_rreq_prepare_read(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - enum netfs_read_source source; - - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; - - if (source == NETFS_DOWNLOAD_FROM_SERVER) { - /* Call out to the netfs to let it shrink the request to fit - * its own I/O sizes and boundaries. If it shinks it here, it - * will be called again to make simultaneous calls; if it wants - * to make serial calls, it can indicate a short read and then - * we will call it again. - */ - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; - - if (rreq->netfs_ops->clamp_length && - !rreq->netfs_ops->clamp_length(subreq)) { - source = NETFS_INVALID_READ; - goto out; - } - } - - if (WARN_ON(subreq->len == 0)) - source = NETFS_INVALID_READ; - -out: - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - return source; -} - -/* - * Slice off a piece of a read request and submit an I/O request for it. - */ -static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq, - unsigned int *_debug_index) -{ - struct netfs_read_subrequest *subreq; - enum netfs_read_source source; - - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - return false; - - subreq->debug_index = (*_debug_index)++; - subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; - - _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - /* Call out to the cache to find out what it can do with the remaining - * subset. It tells us in subreq->flags what it decided should be done - * and adjusts subreq->len down if the subset crosses a cache boundary. - * - * Then when we hand the subset, it can choose to take a subset of that - * (the starts must coincide), in which case, we go around the loop - * again and ask it to download the next piece. - */ - source = netfs_rreq_prepare_read(rreq, subreq); - if (source == NETFS_INVALID_READ) - goto subreq_failed; - - atomic_inc(&rreq->nr_rd_ops); - - rreq->submitted += subreq->len; - - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - switch (source) { - case NETFS_FILL_WITH_ZEROES: - netfs_fill_with_zeroes(rreq, subreq); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_read_from_server(rreq, subreq); - break; - case NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); - break; - default: - BUG(); - } - - return true; - -subreq_failed: - rreq->error = subreq->error; - netfs_put_subrequest(subreq, false); - return false; -} - -static void netfs_cache_expand_readahead(struct netfs_read_request *rreq, - loff_t *_start, size_t *_len, loff_t i_size) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops && cres->ops->expand_readahead) - cres->ops->expand_readahead(cres, _start, _len, i_size); -} - -static void netfs_rreq_expand(struct netfs_read_request *rreq, - struct readahead_control *ractl) -{ - /* Give the cache a chance to change the request parameters. The - * resultant request must contain the original region. - */ - netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); - - /* Give the netfs a chance to change the request parameters. The - * resultant request must contain the original region. - */ - if (rreq->netfs_ops->expand_readahead) - rreq->netfs_ops->expand_readahead(rreq); - - /* Expand the request if the cache wants it to start earlier. Note - * that the expansion may get further extended if the VM wishes to - * insert THPs and the preferred start and/or end wind up in the middle - * of THPs. - * - * If this is the case, however, the THP size should be an integer - * multiple of the cache granule size, so we get a whole number of - * granules to deal with. - */ - if (rreq->start != readahead_pos(ractl) || - rreq->len != readahead_length(ractl)) { - readahead_expand(ractl, rreq->start, rreq->len); - rreq->start = readahead_pos(ractl); - rreq->len = readahead_length(ractl); - - trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), - netfs_read_trace_expanded); - } -} - -/** - * netfs_readahead - Helper to manage a read request - * @ractl: The description of the readahead request - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Fulfil a readahead request by drawing data from the cache if possible, or - * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O - * requests from different sources will get munged together. If necessary, the - * readahead window can be expanded in either direction to a more convenient - * alighment for RPC efficiency or to make storage in the cache feasible. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. It may also be passed a private token, which will - * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). - * - * This is usable whether or not caching is enabled. - */ -void netfs_readahead(struct readahead_control *ractl, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - unsigned int debug_index = 0; - int ret; - - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); - - if (readahead_count(ractl) == 0) - goto cleanup; - - rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file); - if (!rreq) - goto cleanup; - rreq->mapping = ractl->mapping; - rreq->start = readahead_pos(ractl); - rreq->len = readahead_length(ractl); - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } - - netfs_stat(&netfs_n_rh_readahead); - trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), - netfs_read_trace_readahead); - - netfs_rreq_expand(rreq, ractl); - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Drop the refs on the folios here rather than in the cache or - * filesystem. The locks will be dropped in netfs_rreq_unlock(). - */ - while (readahead_folio(ractl)) - ; - - /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_rd_ops)) - netfs_rreq_assess(rreq, false); - return; - -cleanup_free: - netfs_put_read_request(rreq, false); - return; -cleanup: - if (netfs_priv) - ops->cleanup(ractl->mapping, netfs_priv); - return; -} -EXPORT_SYMBOL(netfs_readahead); - -/** - * netfs_readpage - Helper to manage a readpage request - * @file: The file to read from - * @folio: The folio to read - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Fulfil a readpage request by drawing data from the cache if possible, or the - * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests - * from different sources will get munged together. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. It may also be passed a private token, which will - * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). - * - * This is usable whether or not caching is enabled. - */ -int netfs_readpage(struct file *file, - struct folio *folio, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - unsigned int debug_index = 0; - int ret; - - _enter("%lx", folio_index(folio)); - - rreq = netfs_alloc_read_request(ops, netfs_priv, file); - if (!rreq) { - if (netfs_priv) - ops->cleanup(folio_file_mapping(folio), netfs_priv); - folio_unlock(folio); - return -ENOMEM; - } - rreq->mapping = folio_file_mapping(folio); - rreq->start = folio_file_pos(folio); - rreq->len = folio_size(folio); - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) { - folio_unlock(folio); - goto out; - } - } - - netfs_stat(&netfs_n_rh_readpage); - trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); - - netfs_get_read_request(rreq); - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Keep nr_rd_ops incremented so that the ref always belongs to us, and - * the service code isn't punted off to a random thread pool to - * process. - */ - do { - wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); - netfs_rreq_assess(rreq, false); - } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)); - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage); - ret = -EIO; - } -out: - netfs_put_read_request(rreq, false); - return ret; -} -EXPORT_SYMBOL(netfs_readpage); - -/* - * Prepare a folio for writing without reading first - * @folio: The folio being prepared - * @pos: starting position for the write - * @len: length of write - * - * In some cases, write_begin doesn't need to read at all: - * - full folio write - * - write that lies in a folio that is completely beyond EOF - * - write that covers the folio from start to EOF or beyond it - * - * If any of these criteria are met, then zero out the unwritten parts - * of the folio and return true. Otherwise, return false. - */ -static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len) -{ - struct inode *inode = folio_inode(folio); - loff_t i_size = i_size_read(inode); - size_t offset = offset_in_folio(folio, pos); - - /* Full folio write */ - if (offset == 0 && len >= folio_size(folio)) - return true; - - /* pos beyond last folio in the file */ - if (pos - offset >= i_size) - goto zero_out; - - /* Write that covers from the start of the folio to EOF or beyond */ - if (offset == 0 && (pos + len) >= i_size) - goto zero_out; - - return false; -zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio)); - return true; -} - -/** - * netfs_write_begin - Helper to prepare for writing - * @file: The file to read from - * @mapping: The mapping to read from - * @pos: File position at which the write will begin - * @len: The length of the write (may extend beyond the end of the folio chosen) - * @aop_flags: AOP_* flags - * @_folio: Where to put the resultant folio - * @_fsdata: Place for the netfs to store a cookie - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Pre-read data for a write-begin request by drawing data from the cache if - * possible, or the netfs if not. Space beyond the EOF is zero-filled. - * Multiple I/O requests from different sources will get munged together. If - * necessary, the readahead window can be expanded in either direction to a - * more convenient alighment for RPC efficiency or to make storage in the cache - * feasible. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. - * - * The check_write_begin() operation can be provided to check for and flush - * conflicting writes once the folio is grabbed and locked. It is passed a - * pointer to the fsdata cookie that gets returned to the VM to be passed to - * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. - * - * This is usable whether or not caching is enabled. - */ -int netfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int aop_flags, - struct folio **_folio, void **_fsdata, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - struct folio *folio; - struct inode *inode = file_inode(file); - unsigned int debug_index = 0, fgp_flags; - pgoff_t index = pos >> PAGE_SHIFT; - int ret; - - DEFINE_READAHEAD(ractl, file, NULL, mapping, index); - -retry: - fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - if (ops->check_write_begin) { - /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ops->check_write_begin(file, pos, len, folio, _fsdata); - if (ret < 0) { - trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; - goto error; - } - } - - if (folio_test_uptodate(folio)) - goto have_folio; - - /* If the page is beyond the EOF, we want to clear it - unless it's - * within the cache granule containing the EOF, in which case we need - * to preload the granule. - */ - if (!ops->is_cache_enabled(inode) && - netfs_skip_folio_read(folio, pos, len)) { - netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio_no_wait; - } - - ret = -ENOMEM; - rreq = netfs_alloc_read_request(ops, netfs_priv, file); - if (!rreq) - goto error; - rreq->mapping = folio_file_mapping(folio); - rreq->start = folio_file_pos(folio); - rreq->len = folio_size(folio); - rreq->no_unlock_folio = folio_index(folio); - __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - netfs_priv = NULL; - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } - - netfs_stat(&netfs_n_rh_write_begin); - trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); - - /* Expand the request to meet caching requirements and download - * preferences. - */ - ractl._nr_pages = folio_nr_pages(folio); - netfs_rreq_expand(rreq, &ractl); - netfs_get_read_request(rreq); - - /* We hold the folio locks, so we can drop the references */ - folio_get(folio); - while (readahead_folio(&ractl)) - ; - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Keep nr_rd_ops incremented so that the ref always belongs to us, and - * the service code isn't punted off to a random thread pool to - * process. - */ - for (;;) { - wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); - netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin); - ret = -EIO; - } - netfs_put_read_request(rreq, false); - if (ret < 0) - goto error; - -have_folio: - ret = folio_wait_fscache_killable(folio); - if (ret < 0) - goto error; -have_folio_no_wait: - if (netfs_priv) - ops->cleanup(mapping, netfs_priv); - *_folio = folio; - _leave(" = 0"); - return 0; - -error_put: - netfs_put_read_request(rreq, false); -error: - folio_unlock(folio); - folio_put(folio); - if (netfs_priv) - ops->cleanup(mapping, netfs_priv); - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(netfs_write_begin); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 9ae538c85378..5510a7a14a40 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/seq_file.h> -#include <linux/netfs.h> #include "internal.h" atomic_t netfs_n_rh_readahead; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index fe860c538747..79a8b451791f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio *bl_alloc_init_bio(unsigned int npg, - struct block_device *bdev, sector_t disk_sector, - bio_end_io_t end_io, struct parallel_io *par) -{ - struct bio *bio; - - npg = bio_max_segs(npg); - bio = bio_alloc(GFP_NOIO, npg); - if (bio) { - bio->bi_iter.bi_sector = disk_sector; - bio_set_dev(bio, bdev); - bio->bi_end_io = end_io; - bio->bi_private = par; - } - return bio; -} - static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) { return offset >= map->start && offset < map->start + map->len; @@ -171,11 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bl_alloc_init_bio(npg, map->bdev, - disk_addr >> SECTOR_SHIFT, end_io, par); - if (!bio) - return ERR_PTR(-ENOMEM); - bio_set_op_attrs(bio, rw, 0); + bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO); + bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; + bio->bi_end_io = end_io; + bio->bi_private = par; } if (bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(bio); diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index ef9db135c649..6c977288cc28 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -27,7 +27,6 @@ */ #include <linux/module.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include "blocklayout.h" diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 054cc1255fac..456af7d230cf 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -17,7 +17,6 @@ #include <linux/errno.h> #include <linux/mutex.h> #include <linux/freezer.h> -#include <linux/kthread.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/bc_xprt.h> @@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) int ret; struct nfs_net *nn = net_generic(net, nfs_net_id); - ret = svc_create_xprt(serv, "tcp", net, PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, - cred); + ret = svc_xprt_create(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, + cred); if (ret <= 0) goto out_err; nn->nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", nn->nfs_callback_tcpport, PF_INET, net->ns.inum); - ret = svc_create_xprt(serv, "tcp", net, PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, - cred); + ret = svc_xprt_create(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, + cred); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", @@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); - module_put_and_kthread_exit(0); return 0; } @@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp) finish_wait(&serv->sv_cb_waitq, &wq); } } + svc_exit_thread(rqstp); - module_put_and_kthread_exit(0); return 0; } @@ -189,7 +188,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_shutdown_net(serv, net); + svc_xprt_destroy_all(serv, net); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -232,33 +231,10 @@ err_bind: return ret; } -static const struct svc_serv_ops nfs40_cb_sv_ops = { - .svo_function = nfs4_callback_svc, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_module = THIS_MODULE, -}; -#if defined(CONFIG_NFS_V4_1) -static const struct svc_serv_ops nfs41_cb_sv_ops = { - .svo_function = nfs41_callback_svc, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_module = THIS_MODULE, -}; - -static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { - [0] = &nfs40_cb_sv_ops, - [1] = &nfs41_cb_sv_ops, -}; -#else -static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { - [0] = &nfs40_cb_sv_ops, - [1] = NULL, -}; -#endif - static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - const struct svc_serv_ops *sv_ops; + int (*threadfn)(void *data); struct svc_serv *serv; /* @@ -267,17 +243,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) if (cb_info->serv) return svc_get(cb_info->serv); - switch (minorversion) { - case 0: - sv_ops = nfs4_cb_sv_ops[0]; - break; - default: - sv_ops = nfs4_cb_sv_ops[1]; - } - - if (sv_ops == NULL) - return ERR_PTR(-ENOTSUPP); - /* * Sanity check: if there's no task, * we should be the first user ... @@ -286,7 +251,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + threadfn = nfs4_callback_svc; +#if defined(CONFIG_NFS_V4_1) + if (minorversion) + threadfn = nfs41_callback_svc; +#else + if (minorversion) + return ERR_PTR(-ENOTSUPP); +#endif + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + threadfn); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c343666d9a42..c8520284dda7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; + const struct pnfs_layoutdriver_type *ld = NULL; uint32_t i; __be32 res = 0; - struct nfs_client *clp = cps->clp; - struct nfs_server *server = NULL; - if (!clp) { + if (!cps->clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; } @@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, for (i = 0; i < args->ndevs; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - if (!server || - server->pnfs_curr_ld->id != dev->cbd_layout_type) { - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - if (server->pnfs_curr_ld && - server->pnfs_curr_ld->id == dev->cbd_layout_type) { - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - continue; + if (!ld || ld->id != dev->cbd_layout_type) { + pnfs_put_layoutdriver(ld); + ld = pnfs_find_layoutdriver(dev->cbd_layout_type); + if (!ld) + continue; } - - found: - nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id); } - + pnfs_put_layoutdriver(ld); out: kfree(args->devs); return res; @@ -710,7 +701,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy, struct nfs4_copy_state *copy, *tmp_copy; bool found = false; - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) return htonl(NFS4ERR_SERVERFAULT); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index f90de8043b0f..8dcb08e1a885 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -271,10 +271,6 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n == 0) goto out; - if (n > ULONG_MAX / sizeof(*args->devs)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d1f34229e11a..e828504cc396 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -857,7 +857,8 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } if (clp->rpc_ops->discover_trunking != NULL && - (server->caps & NFS_CAP_FS_LOCATIONS)) { + (server->caps & NFS_CAP_FS_LOCATIONS && + (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7c9eb679dbdb..5c97cad741a7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -439,7 +439,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, struct nfs_delegation *freeme = NULL; int status = 0; - delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT); if (delegation == NULL) return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, stateid); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 75cb1cbe4cde..c6b263b5faf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -39,6 +39,7 @@ #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> +#include <linux/hash.h> #include "delegation.h" #include "iostat.h" @@ -69,26 +70,26 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) +#define NFS_INIT_DTSIZE PAGE_SIZE + +static struct nfs_open_dir_context * +alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { - ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - ctx->dir_cookie = 0; - ctx->dup_cookie = 0; - ctx->page_index = 0; - ctx->eof = false; + ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); - list_add(&ctx->list, &nfsi->open_files); - clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); + memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } @@ -98,9 +99,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); - list_del(&ctx->list); + list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } /* @@ -142,6 +143,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { + u64 change_attr; u64 last_cookie; unsigned int size; unsigned char page_full : 1, @@ -155,11 +157,10 @@ struct nfs_readdir_descriptor { struct page *page; struct dir_context *ctx; pgoff_t page_index; + pgoff_t page_index_max; u64 dir_cookie; u64 last_cookie; - u64 dup_cookie; loff_t current_index; - loff_t prev_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; @@ -167,24 +168,47 @@ struct nfs_readdir_descriptor { unsigned long gencount; unsigned long attr_gencount; unsigned int cache_entry_index; - signed char duped; + unsigned int buffer_fills; + unsigned int dtsize; + bool clear_cache; bool plus; bool eob; bool eof; }; -static void nfs_readdir_array_init(struct nfs_cache_array *array) +static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) +{ + struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); + unsigned int maxsize = server->dtsize; + + if (sz > maxsize) + sz = maxsize; + if (sz < NFS_MIN_FILE_IO_SIZE) + sz = NFS_MIN_FILE_IO_SIZE; + desc->dtsize = sz; +} + +static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) { - memset(array, 0, sizeof(struct nfs_cache_array)); + nfs_set_dtsize(desc, desc->dtsize >> 1); } -static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) +static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize << 1); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, + u64 change_attr) { struct nfs_cache_array *array; array = kmap_atomic(page); - nfs_readdir_array_init(array); + array->change_attr = change_attr; array->last_cookie = last_cookie; + array->size = 0; + array->page_full = 0; + array->page_is_eof = 0; array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -192,25 +216,31 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) /* * we are freeing strings created by nfs_add_to_readdir_array() */ -static -void nfs_readdir_clear_array(struct page *page) +static void nfs_readdir_clear_array(struct page *page) { struct nfs_cache_array *array; - int i; + unsigned int i; array = kmap_atomic(page); for (i = 0; i < array->size; i++) kfree(array->array[i].name); - nfs_readdir_array_init(array); + array->size = 0; kunmap_atomic(array); } +static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie, + u64 change_attr) +{ + nfs_readdir_clear_array(page); + nfs_readdir_page_init_array(page, last_cookie, change_attr); +} + static struct page * nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct page *page = alloc_page(gfp_flags); if (page) - nfs_readdir_page_init_array(page, last_cookie); + nfs_readdir_page_init_array(page, last_cookie, 0); return page; } @@ -222,6 +252,11 @@ static void nfs_readdir_page_array_free(struct page *page) } } +static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) +{ + return array->size == 0 ? array->last_cookie : array->array[0].cookie; +} + static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->page_is_eof = 1; @@ -251,36 +286,40 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len) return ret; } +static size_t nfs_readdir_array_maxentries(void) +{ + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / + sizeof(struct nfs_cache_array_entry); +} + /* * Check that the next array entry lies entirely within the page bounds */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { - struct nfs_cache_array_entry *cache_entry; - if (array->page_full) return -ENOSPC; - cache_entry = &array->array[array->size + 1]; - if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + if (array->size == nfs_readdir_array_maxentries()) { array->page_full = 1; return -ENOSPC; } return 0; } -static -int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +static int nfs_readdir_page_array_append(struct page *page, + const struct nfs_entry *entry, + u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; - int ret; + int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); - if (!name) - return -ENOMEM; array = kmap_atomic(page); + if (!name) + goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); @@ -288,7 +327,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) } cache_entry = &array->array[array->size]; - cache_entry->cookie = entry->prev_cookie; + cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; @@ -300,23 +339,72 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: + *cookie = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) +/* + * Hash algorithm allowing content addressible access to sequences + * of directory cookies. Content is addressed by the value of the + * cookie index of the first readdir entry in a page. + * + * We select only the first 18 bits to avoid issues with excessive + * memory use for the page cache XArray. 18 bits should allow the caching + * of 262144 pages of sequences of readdir entries. Since each page holds + * 127 readdir entries for a typical 64-bit system, that works out to a + * cache of ~ 33 million entries per directory. + */ +static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) +{ + if (cookie == 0) + return 0; + return hash_64(cookie, 18); +} + +static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, + u64 change_attr) +{ + struct nfs_cache_array *array = kmap_atomic(page); + int ret = true; + + if (array->change_attr != change_attr) + ret = false; + if (nfs_readdir_array_index_cookie(array) != last_cookie) + ret = false; kunmap_atomic(array); return ret; } +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie, + u64 change_attr) +{ + if (PageUptodate(page)) { + if (nfs_readdir_page_validate(page, cookie, change_attr)) + return; + nfs_readdir_clear_array(page); + } + nfs_readdir_page_init_array(page, cookie, change_attr); + SetPageUptodate(page); +} + static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, - pgoff_t index, u64 last_cookie) + u64 cookie, u64 change_attr) { + pgoff_t index = nfs_readdir_page_cookie_hash(cookie); struct page *page; page = grab_cache_page(mapping, index); - if (page && !PageUptodate(page)) { - nfs_readdir_page_init_array(page, last_cookie); - if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) - nfs_zap_mapping(mapping->host, mapping); - SetPageUptodate(page); - } - + if (!page) + return NULL; + nfs_readdir_page_init_and_validate(page, cookie, change_attr); return page; } @@ -351,24 +439,19 @@ static void nfs_readdir_page_set_eof(struct page *page) kunmap_atomic(array); } -static void nfs_readdir_page_unlock_and_put(struct page *page) -{ - unlock_page(page); - put_page(page); -} - static struct page *nfs_readdir_page_get_next(struct address_space *mapping, - pgoff_t index, u64 cookie) + u64 cookie, u64 change_attr) { + pgoff_t index = nfs_readdir_page_cookie_hash(cookie); struct page *page; - page = nfs_readdir_page_get_locked(mapping, index, cookie); - if (page) { - if (nfs_readdir_page_last_cookie(page) == cookie) - return page; - nfs_readdir_page_unlock_and_put(page); - } - return NULL; + page = grab_cache_page_nowait(mapping, index); + if (!page) + return NULL; + nfs_readdir_page_init_and_validate(page, cookie, change_attr); + if (nfs_readdir_page_last_cookie(page) != cookie) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } static inline @@ -390,6 +473,25 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } +static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) +{ + if (array->page_full) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->cache_entry_index = 0; + desc->page_index++; + } else + desc->last_cookie = nfs_readdir_array_index_cookie(array); +} + +static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) +{ + desc->current_index = 0; + desc->last_cookie = 0; + desc->page_index = 0; +} + static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { @@ -401,6 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, if (diff >= array->size) { if (array->page_is_eof) goto out_eof; + nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } @@ -413,16 +516,6 @@ out_eof: return -EBADCOOKIE; } -static bool -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) -{ - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE | - NFS_INO_INVALID_DATA)) - return false; - smp_rmb(); - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); -} - static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { @@ -439,8 +532,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { - int i; - loff_t new_pos; + unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) @@ -448,33 +540,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { - struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - - new_pos = desc->current_index + i; - if (desc->attr_gencount != nfsi->attr_gencount || - !nfs_readdir_inode_mapping_valid(nfsi)) { - desc->duped = 0; - desc->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->prev_index) { - if (desc->duped > 0 - && desc->dup_cookie == desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %pD2 contains a readdir loop." - "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, array->array[i].name, desc->dir_cookie); - } - status = -ELOOP; - goto out; - } - desc->dup_cookie = desc->dir_cookie; - desc->duped = -1; - } if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else - desc->ctx->pos = new_pos; - desc->prev_index = new_pos; + desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } @@ -484,8 +553,8 @@ check_eof: status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; - } -out: + } else + nfs_readdir_seek_next_array(array, desc); return status; } @@ -501,11 +570,6 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) else status = nfs_readdir_search_for_cookie(array, desc); - if (status == -EAGAIN) { - desc->last_cookie = array->last_cookie; - desc->current_index += array->size; - desc->page_index++; - } kunmap_atomic(array); return status; } @@ -541,7 +605,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); desc->plus = arg.plus = false; goto again; } @@ -591,51 +654,68 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 1; } -static -bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) + +static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, + unsigned int cache_hits, + unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; - if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) - return true; - if (ctx->pos == 0) + if (ctx->pos == 0 || + cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* - * This function is called by the lookup and getattr code to request the + * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ -void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_hits); + rcu_read_unlock(); + } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. - * Do this by checking if there is an active file descriptor - * and calling nfs_advise_use_readdirplus, then forcing a - * cache flush. */ -void nfs_force_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) { - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); - set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_misses); + rcu_read_unlock(); } } +static void nfs_lookup_advise_force_readdirplus(struct inode *dir, + unsigned int flags) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return; + if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) + return; + nfs_readdir_record_entry_cache_miss(dir); +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) @@ -686,8 +766,12 @@ again: status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr); + trace_nfs_readdir_lookup_revalidate(d_inode(parent), + dentry, 0, status); goto out; } else { + trace_nfs_readdir_lookup_revalidate_failed( + d_inode(parent), dentry, 0); d_invalidate(dentry); dput(dentry); dentry = NULL; @@ -709,22 +793,38 @@ again: dentry = alias; } nfs_set_verifier(dentry, dir_verifier); + trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); } +static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct xdr_stream *stream) +{ + int ret; + + if (entry->fattr->label) + entry->fattr->label->len = NFS4_MAXLABELLEN; + ret = xdr_decode(desc, entry, stream); + if (ret || !desc->plus) + return ret; + nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); + return 0; +} + /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, - struct page **xdr_pages, - unsigned int buflen, - struct page **arrays, - size_t narrays) + struct page **xdr_pages, unsigned int buflen, + struct page **arrays, size_t narrays, + u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; struct page *scratch, *new, *page = *arrays; + u64 cookie; int status; scratch = alloc_page(GFP_KERNEL); @@ -735,54 +835,50 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, xdr_set_scratch_page(&stream, scratch); do { - if (entry->fattr->label) - entry->fattr->label->len = NFS4_MAXLABELLEN; - - status = xdr_decode(desc, entry, &stream); + status = nfs_readdir_entry_decode(desc, entry, &stream); if (status != 0) break; - if (desc->plus) - nfs_prime_dcache(file_dentry(desc->file), entry, - desc->dir_verifier); - - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_page_array_append(page, entry, &cookie); if (status != -ENOSPC) continue; if (page->mapping != mapping) { if (!--narrays) break; - new = nfs_readdir_page_array_alloc(entry->prev_cookie, - GFP_KERNEL); + new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = page = new; } else { - new = nfs_readdir_page_get_next(mapping, - page->index + 1, - entry->prev_cookie); + new = nfs_readdir_page_get_next(mapping, cookie, + change_attr); if (!new) break; if (page != *arrays) nfs_readdir_page_unlock_and_put(page); page = new; } - status = nfs_readdir_add_to_array(entry, page); + desc->page_index_max++; + status = nfs_readdir_page_array_append(page, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: - if (entry->eof) { - nfs_readdir_page_set_eof(page); - status = 0; - } - break; - case -ENOSPC: + if (!entry->eof) + break; + nfs_readdir_page_set_eof(page); + fallthrough; case -EAGAIN: status = 0; break; + case -ENOSPC: + status = 0; + if (!desc->plus) + break; + while (!nfs_readdir_entry_decode(desc, entry, &stream)) + ; } if (page != *arrays) @@ -828,12 +924,14 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct page **arrays, size_t narrays) { + u64 change_attr; struct page **pages; struct page *page = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); - size_t dtsize = NFS_SERVER(inode)->dtsize; + unsigned int dtsize = desc->dtsize; + unsigned int pglen; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); @@ -851,27 +949,21 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, if (!pages) goto out; - do { - unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, - pages, dtsize, - verf_res); - if (status < 0) - break; - - pglen = status; - if (pglen == 0) { - nfs_readdir_page_set_eof(page); - break; - } - - verf_arg = verf_res; + change_attr = inode_peek_iversion_raw(inode); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, + dtsize, verf_res); + if (status < 0) + goto free_pages; + pglen = status; + if (pglen != 0) status = nfs_readdir_page_filler(desc, entry, pages, pglen, - arrays, narrays); - } while (!status && nfs_readdir_page_needs_filling(page) && - page_mapping(page)); + arrays, narrays, change_attr); + else + nfs_readdir_page_set_eof(page); + desc->buffer_fills++; +free_pages: nfs_readdir_free_pages(pages, array_size); out: nfs_free_fattr(entry->fattr); @@ -896,9 +988,17 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) static struct page * nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { - return nfs_readdir_page_get_locked(desc->file->f_mapping, - desc->page_index, - desc->last_cookie); + struct address_space *mapping = desc->file->f_mapping; + u64 change_attr = inode_peek_iversion_raw(mapping->host); + u64 cookie = desc->last_cookie; + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); + if (!page) + return NULL; + if (desc->clear_cache && !nfs_readdir_page_needs_filling(page)) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } /* @@ -916,13 +1016,23 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { + /* Grow the dtsize if we had to go back for more pages */ + if (desc->page_index == desc->page_index_max) + nfs_grow_dtsize(desc); + desc->page_index_max = desc->page_index; + trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, + desc->last_cookie, + desc->page->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, &desc->page, 1); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); + trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); - desc->page_index = 0; + nfs_readdir_rewind_search(desc); + trace_nfs_readdir_invalidate_cache_range( + inode, 0, MAX_LFS_FILESIZE); return -EAGAIN; } return res; @@ -930,9 +1040,16 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) /* * Set the cookie verifier if the page cache was empty */ - if (desc->page_index == 0) + if (desc->last_cookie == 0 && + memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); + invalidate_inode_pages2_range(desc->file->f_mapping, 1, + -1); + trace_nfs_readdir_invalidate_cache_range( + inode, 1, MAX_LFS_FILESIZE); + } + desc->clear_cache = false; } res = nfs_readdir_search_array(desc); if (res == 0) @@ -941,34 +1058,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return res; } -static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) -{ - struct address_space *mapping = desc->file->f_mapping; - struct inode *dir = file_inode(desc->file); - unsigned int dtsize = NFS_SERVER(dir)->dtsize; - loff_t size = i_size_read(dir); - - /* - * Default to uncached readdir if the page cache is empty, and - * we're looking for a non-zero cookie in a large directory. - */ - return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; -} - /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (nfs_readdir_dont_search_cache(desc)) - return -EBADCOOKIE; - do { - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -982,7 +1077,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, { struct file *file = desc->file; struct nfs_cache_array *array; - unsigned int i = 0; + unsigned int i; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -995,16 +1090,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, break; } memcpy(desc->verf, verf, sizeof(desc->verf)); - if (i < (array->size-1)) - desc->dir_cookie = array->array[i+1].cookie; - else + if (i == array->size - 1) { desc->dir_cookie = array->last_cookie; + nfs_readdir_seek_next_array(array, desc); + } else { + desc->dir_cookie = array->array[i + 1].cookie; + desc->last_cookie = array->array[0].cookie; + } if (nfs_readdir_use_cookie(file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (desc->duped != 0) - desc->duped = 1; } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1046,9 +1142,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->page_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; - desc->duped = 0; + desc->page_index_max = 0; + + trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, + -1, desc->dtsize); status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); + if (status < 0) { + trace_nfs_readdir_uncached_done(file_inode(desc->file), status); + goto out_free; + } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; @@ -1056,15 +1159,44 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) } desc->page = NULL; - + /* + * Grow the dtsize if we have to go back for more pages, + * or shrink it if we're reading too many. + */ + if (!desc->eof) { + if (!desc->eob) + nfs_grow_dtsize(desc); + else if (desc->buffer_fills == 1 && + i < (desc->page_index_max >> 1)) + nfs_shrink_dtsize(desc); + } +out_free: for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_page_array_free(arrays[i]); out: + if (!nfs_readdir_use_cookie(desc->file)) + nfs_readdir_rewind_search(desc); + desc->page_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + +static bool nfs_readdir_handle_cache_misses(struct inode *inode, + struct nfs_readdir_descriptor *desc, + unsigned int cache_misses, + bool force_clear) +{ + if (desc->ctx->pos == 0 || !desc->plus) + return false; + if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) + return false; + trace_nfs_readdir_force_readdirplus(inode); + return true; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -1076,7 +1208,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; - pgoff_t page_index; + unsigned int cache_hits, cache_misses; + bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -1089,11 +1222,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { - res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) - goto out; - } + nfs_revalidate_mapping(inode, file->f_mapping); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL); @@ -1101,16 +1230,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out; desc->file = file; desc->ctx = ctx; - desc->plus = nfs_use_readdirplus(inode, ctx); + desc->page_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; - desc->dup_cookie = dir_ctx->dup_cookie; - desc->duped = dir_ctx->duped; - page_index = dir_ctx->page_index; + desc->page_index = dir_ctx->page_index; + desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; + nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); + cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); + force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { @@ -1118,9 +1250,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && - list_is_singular(&nfsi->open_files)) - invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); + desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); + force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, + force_clear); + desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); @@ -1139,9 +1272,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); - desc->page_index = 0; desc->plus = false; desc->eof = false; continue; @@ -1151,15 +1282,18 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); + if (desc->page_index == desc->page_index_max) + desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; - dir_ctx->dup_cookie = desc->dup_cookie; - dir_ctx->duped = desc->duped; + dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; + dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); out_free: @@ -1197,13 +1331,14 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) } if (offset != filp->f_pos) { filp->f_pos = offset; - if (nfs_readdir_use_cookie(filp)) - dir_ctx->dir_cookie = offset; - else + dir_ctx->page_index = 0; + if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; - if (offset == 0) - memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); - dir_ctx->duped = 0; + dir_ctx->last_cookie = 0; + } else { + dir_ctx->dir_cookie = offset; + dir_ctx->last_cookie = offset; + } dir_ctx->eof = false; } spin_unlock(&filp->f_lock); @@ -1419,7 +1554,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (flags & LOOKUP_REVAL) goto out_force; out: - return (inode->i_nlink == 0) ? -ESTALE : 0; + if (inode->i_nlink > 0 || + (inode->i_nlink == 0 && + test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) + return 0; + else + return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; @@ -1469,9 +1609,7 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, { switch (error) { case 1: - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); - return 1; + break; case 0: /* * We can't d_drop the root of a disconnected tree: @@ -1480,13 +1618,10 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, * inodes on unmount and further oopses. */ if (inode && IS_ROOT(dentry)) - return 1; - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; + error = 1; + break; } - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); + trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error); return error; } @@ -1511,15 +1646,17 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int -nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, - struct inode *inode) +static int nfs_lookup_revalidate_dentry(struct inode *dir, + struct dentry *dentry, + struct inode *inode, unsigned int flags) { struct nfs_fh *fhandle; struct nfs_fattr *fattr; unsigned long dir_verifier; int ret; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + ret = -ENOMEM; fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); @@ -1540,6 +1677,10 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, } goto out; } + + /* Request help from readdirplus */ + nfs_lookup_advise_force_readdirplus(dir, flags); + ret = 0; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out; @@ -1549,8 +1690,6 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, nfs_setsecurity(inode, fattr); nfs_set_verifier(dentry, dir_verifier); - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); ret = 1; out: nfs_free_fattr(fattr); @@ -1607,7 +1746,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, nfs_mark_dir_for_revalidate(dir); goto out_bad; } - nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1617,10 +1755,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (NFS_STALE(inode)) goto out_bad; - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = nfs_lookup_revalidate_dentry(dir, dentry, inode); - trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - return error; + return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: @@ -1814,7 +1949,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in goto out; /* Notify readdir to use READDIRPLUS */ - nfs_force_use_readdirplus(dir); + nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: res = d_splice_alias(inode, dentry); @@ -1853,16 +1988,6 @@ const struct dentry_operations nfs4_dentry_operations = { }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -static fmode_t flags_to_mode(int flags) -{ - fmode_t res = (__force fmode_t)flags & FMODE_EXEC; - if ((flags & O_ACCMODE) != O_WRONLY) - res |= FMODE_READ; - if ((flags & O_ACCMODE) != O_RDONLY) - res |= FMODE_WRITE; - return res; -} - static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); @@ -2077,7 +2202,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode); + return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); @@ -2330,7 +2455,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (d_count(dentry) > 1) { + if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, + &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); @@ -2989,11 +3115,8 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) /* * Determine which access bits we want to ask for... */ - cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; - if (nfs_server_capable(inode, NFS_CAP_XATTR)) { - cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE | - NFS_ACCESS_XALIST; - } + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | + nfs_access_xattr_mask(NFS_SERVER(inode)); if (S_ISDIR(inode->i_mode)) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index eabfdab543c8..11c566d8769f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -173,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -425,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -440,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -482,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -790,7 +794,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -798,7 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -876,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -892,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -906,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -937,16 +947,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 76d76acbc594..150b7fa8f0a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -44,11 +44,6 @@ static const struct vm_operations_struct nfs_file_vm_ops; -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - int nfs_check_flags(int flags) { if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) @@ -162,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -406,17 +401,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * - Called if either PG_private or PG_fscache is set on the page * - Caller holds page lock */ -static void nfs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +static void nfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", - page, offset, length); + dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", + folio->index, offset, length); - if (offset != 0 || length < PAGE_SIZE) + if (offset != 0 || length < folio_size(folio)) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page_file_mapping(page)->host, page); - wait_on_page_fscache(page); + nfs_wb_folio_cancel(folio->mapping->host, folio); + folio_wait_fscache(folio); } /* @@ -472,15 +467,15 @@ static void nfs_check_dirty_writeback(struct page *page, * - Caller holds page lock * - Return 0 if successful, -error otherwise */ -static int nfs_launder_page(struct page *page) +static int nfs_launder_folio(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio->mapping->host; - dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", - inode->i_ino, (long long)page_offset(page)); + dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", + inode->i_ino, folio_pos(folio)); - wait_on_page_fscache(page); - return nfs_wb_page(inode, page); + folio_wait_fscache(folio); + return nfs_wb_page(inode, &folio->page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -488,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; spin_lock(&inode->i_lock); blocks = inode->i_blocks; @@ -502,31 +498,39 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, *span = sis->pages; + + if (cl->rpc_ops->enable_swap) + cl->rpc_ops->enable_swap(inode); + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; rpc_clnt_swap_deactivate(clnt); + if (cl->rpc_ops->disable_swap) + cl->rpc_ops->disable_swap(file_inode(file)); } const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, - .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .readahead = nfs_readahead, + .dirty_folio = filemap_dirty_folio, .writepage = nfs_writepage, .writepages = nfs_writepages, .write_begin = nfs_write_begin, .write_end = nfs_write_end, - .invalidatepage = nfs_invalidate_page, + .invalidate_folio = nfs_invalidate_folio, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, #endif - .launder_page = nfs_launder_page, + .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, .swap_activate = nfs_swap_activate, @@ -619,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); @@ -642,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); - result = generic_perform_write(file, from, iocb->ki_pos); + result = generic_perform_write(iocb, from); current->backing_dev_info = NULL; } nfs_end_io_write(inode); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 9c96e3e5ed35..76deddab0a8f 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1075,7 +1075,7 @@ filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ? fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - new = pnfs_alloc_commit_array(size, GFP_NOIO); + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); array = pnfs_add_commit_array(fl_cinfo, new, lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a553d59afa8b..604be402ae13 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -663,7 +663,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_KERNEL); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void @@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_NOIO); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void @@ -806,13 +806,10 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, bool strict_iomode) { pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_READ, - strict_iomode, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, IOMODE_READ, + strict_iomode, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -894,13 +891,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, retry: ff_layout_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -953,13 +947,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -1258,7 +1249,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, - GFP_NOIO); + nfs_io_gfp_mask()); switch (status) { case NFS4ERR_DELAY: @@ -1973,7 +1964,8 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO); + new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); array = pnfs_add_commit_array(fl_cinfo, new, lseg); @@ -2152,10 +2144,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) struct nfs4_flexfile_layoutreturn_args *ff_args; struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); - ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL); + ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask()); if (!ff_args) goto out_nomem; - ff_args->pages[0] = alloc_page(GFP_KERNEL); + ff_args->pages[0] = alloc_page(nfs_io_gfp_mask()); if (!ff_args->pages[0]) goto out_nomem_free; @@ -2192,8 +2184,8 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) if (list_empty(&head)) return; - errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, - sizeof(*errors), GFP_NOFS); + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors), + nfs_io_gfp_mask()); if (errors != NULL) { const struct nfs4_ff_layout_ds_err *pos; size_t n = 0; @@ -2444,7 +2436,8 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), + nfs_io_gfp_mask()); if (!args->devinfo) return -ENOMEM; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index ea17fa1f31ec..e2d59bb5e6bb 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -80,6 +80,7 @@ enum nfs_param { Opt_source, Opt_tcp, Opt_timeo, + Opt_trunkdiscovery, Opt_udp, Opt_v, Opt_vers, @@ -180,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_string("source", Opt_source), fsparam_flag ("tcp", Opt_tcp), fsparam_u32 ("timeo", Opt_timeo), + fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), fsparam_flag ("udp", Opt_udp), fsparam_flag ("v2", Opt_v), fsparam_flag ("v3", Opt_v), @@ -529,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, else ctx->flags &= ~NFS_MOUNT_NOCTO; break; + case Opt_trunkdiscovery: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; + else + ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; + break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index cfe901650ab0..f73c09a9cf0a 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -19,8 +19,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" - -#define NFSDBG_FACILITY NFSDBG_FSCACHE +#include "nfstrace.h" #define NFS_MAX_KEY_LEN 1000 @@ -128,8 +127,6 @@ int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int u vcookie = fscache_acquire_volume(key, NULL, /* preferred_cache */ NULL, 0 /* coherency_data */); - dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, vcookie); if (IS_ERR(vcookie)) { if (vcookie != ERR_PTR(-EBUSY)) { kfree(key); @@ -152,9 +149,6 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) { struct nfs_server *nfss = NFS_SB(sb); - dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; kfree(nfss->fscache_uniq); @@ -173,7 +167,7 @@ void nfs_fscache_init_inode(struct inode *inode) if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, 0, @@ -181,7 +175,7 @@ void nfs_fscache_init_inode(struct inode *inode) nfsi->fh.size, &auxdata, /* aux_data */ sizeof(auxdata), - i_size_read(&nfsi->vfs_inode)); + i_size_read(inode)); } /* @@ -192,8 +186,6 @@ void nfs_fscache_clear_inode(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } @@ -220,7 +212,6 @@ void nfs_fscache_clear_inode(struct inode *inode) void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); bool open_for_write = inode_is_open_for_write(inode); @@ -229,8 +220,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) fscache_use_cookie(cookie, open_for_write); if (open_for_write) { - dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_invalidate(cookie, &auxdata, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); } @@ -240,23 +230,14 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file); void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_unuse_cookie(cookie, &auxdata, NULL); } } -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - /* * Fallback page reading interface. */ @@ -319,58 +300,50 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, /* * Retrieve a page from fscache */ -int __nfs_readpage_from_fscache(struct inode *inode, struct page *page) +int __nfs_fscache_read_page(struct inode *inode, struct page *page) { int ret; - dfprintk(FSCACHE, - "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - nfs_i_fscache(inode), page, page->index, page->flags, inode); - + trace_nfs_fscache_read_page(inode, page); if (PageChecked(page)) { - dfprintk(FSCACHE, "NFS: readpage_from_fscache: PageChecked\n"); ClearPageChecked(page); - return 1; + ret = 1; + goto out; } ret = fscache_fallback_read_page(inode, page); if (ret < 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - dfprintk(FSCACHE, - "NFS: readpage_from_fscache failed %d\n", ret); SetPageChecked(page); - return ret; + goto out; } /* Read completed synchronously */ - dfprintk(FSCACHE, "NFS: readpage_from_fscache: read successful\n"); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); SetPageUptodate(page); - return 0; + ret = 0; +out: + trace_nfs_fscache_read_page_exit(inode, page, ret); + return ret; } /* * Store a newly fetched page in fscache. We can be certain there's no page * stored in the cache as yet otherwise we would've read it from there. */ -void __nfs_readpage_to_fscache(struct inode *inode, struct page *page) +void __nfs_fscache_write_page(struct inode *inode, struct page *page) { int ret; - dfprintk(FSCACHE, - "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n", - nfs_i_fscache(inode), page, page->index, page->flags); + trace_nfs_fscache_write_page(inode, page); ret = fscache_fallback_write_page(inode, page, true); - dfprintk(FSCACHE, - "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", - page, page->index, page->flags, ret); - if (ret != 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); } else { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); } + trace_nfs_fscache_write_page_exit(inode, page, ret); } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 25a5c0f82392..4e980cc04779 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -45,10 +45,8 @@ extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); extern void nfs_fscache_release_file(struct inode *, struct file *); -extern int __nfs_readpage_from_fscache(struct inode *, struct page *); -extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr, - unsigned long bytes); -extern void __nfs_readpage_to_fscache(struct inode *, struct page *); +extern int __nfs_fscache_read_page(struct inode *, struct page *); +extern void __nfs_fscache_write_page(struct inode *, struct page *); static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { @@ -66,11 +64,10 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) /* * Retrieve a page from an inode data storage object. */ -static inline int nfs_readpage_from_fscache(struct inode *inode, - struct page *page) +static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) { - if (NFS_I(inode)->fscache) - return __nfs_readpage_from_fscache(inode, page); + if (nfs_i_fscache(inode)) + return __nfs_fscache_read_page(inode, page); return -ENOBUFS; } @@ -78,24 +75,24 @@ static inline int nfs_readpage_from_fscache(struct inode *inode, * Store a page newly fetched from the server in an inode data storage object * in the cache. */ -static inline void nfs_readpage_to_fscache(struct inode *inode, +static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) { - if (NFS_I(inode)->fscache) - __nfs_readpage_to_fscache(inode, page); + if (nfs_i_fscache(inode)) + __nfs_fscache_write_page(inode, page); } static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, - struct nfs_inode *nfsi) + struct inode *inode) { memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + auxdata->mtime_sec = inode->i_mtime.tv_sec; + auxdata->mtime_nsec = inode->i_mtime.tv_nsec; + auxdata->ctime_sec = inode->i_ctime.tv_sec; + auxdata->ctime_nsec = inode->i_ctime.tv_nsec; - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(inode); } /* @@ -107,9 +104,9 @@ static inline void nfs_fscache_invalidate(struct inode *inode, int flags) struct nfs_inode *nfsi = NFS_I(inode); if (nfsi->fscache) { - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_invalidate(nfsi->fscache, &auxdata, - i_size_read(&nfsi->vfs_inode), flags); + i_size_read(inode), flags); } } @@ -136,15 +133,11 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { return 1; /* True: may release page */ } -static inline int nfs_readpage_from_fscache(struct inode *inode, - struct page *page) +static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page) {} - - +static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {} static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d96baa4450e3..b4e46b0ffa2d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -203,14 +203,13 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) NFS_INO_INVALID_OTHER | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); - } else if (flags & NFS_INO_REVAL_PAGECACHE) - flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; + } if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode, 0); - flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + flags &= ~NFS_INO_REVAL_FORCED; nfsi->cache_validity |= flags; @@ -236,19 +235,17 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); - } else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } @@ -564,8 +561,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); - if (nfs_server_capable(inode, NFS_CAP_XATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && @@ -785,26 +780,32 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) +/* + * Don't request help from readdirplus if the file is being written to, + * or if attribute caching is turned off + */ +static bool nfs_getattr_readdirplus_enable(const struct inode *inode) { - struct dentry *parent; + return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && + !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ; +} - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_force_use_readdirplus(d_inode(parent)); - dput(parent); +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) +{ + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_miss(d_inode(parent)); + dput(parent); + } } static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { - struct dentry *parent; - - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_advise_use_readdirplus(d_inode(parent)); - dput(parent); + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_hit(d_inode(parent)); + dput(parent); + } } static u32 nfs_get_valid_attrmask(struct inode *inode) @@ -840,6 +841,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; bool do_update = false; + bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode); trace_nfs_getattr_enter(inode); @@ -848,7 +850,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_INO | STATX_SIZE | STATX_BLOCKS; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { - nfs_readdirplus_parent_cache_hit(path->dentry); + if (readdirplus_enabled) + nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_revalidate; } @@ -898,15 +901,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (do_update) { - /* Update the attribute cache */ - if (!(server->flags & NFS_MOUNT_NOAC)) + if (readdirplus_enabled) nfs_readdirplus_parent_cache_miss(path->dentry); - else - nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); if (err) goto out; - } else + } else if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ @@ -952,7 +952,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) res = __nfs_find_lock_context(ctx); rcu_read_unlock(); if (res == NULL) { - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); @@ -1030,7 +1030,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, { struct nfs_open_context *ctx; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (!ctx) return ERR_PTR(-ENOMEM); nfs_sb_active(dentry->d_sb); @@ -1180,7 +1180,6 @@ int nfs_open(struct inode *inode, struct file *filp) nfs_fscache_open_file(inode, filp); return 0; } -EXPORT_SYMBOL_GPL(nfs_open); /* * This function is called whenever some part of NFS notices that @@ -1583,7 +1582,7 @@ struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; - fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + fattr = kmalloc(sizeof(*fattr), GFP_KERNEL); if (fattr != NULL) { nfs_fattr_init(fattr); fattr->label = NULL; @@ -1599,7 +1598,7 @@ struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server) if (!fattr) return NULL; - fattr->label = nfs4_label_alloc(server, GFP_NOFS); + fattr->label = nfs4_label_alloc(server, GFP_KERNEL); if (IS_ERR(fattr->label)) { kfree(fattr); return NULL; @@ -1613,7 +1612,7 @@ struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; - fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); if (fh != NULL) fh->size = 0; return fh; @@ -2238,7 +2237,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2de7c56a1fbe..7eefa16ed381 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) return true; } +static inline fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. @@ -366,8 +376,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ -extern void nfs_advise_use_readdirplus(struct inode *dir); -extern void nfs_force_use_readdirplus(struct inode *dir); +extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); +extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -388,6 +398,20 @@ int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); +#ifdef CONFIG_NFS_V4_2 +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + if (!(server->caps & NFS_CAP_XATTR)) + return 0; + return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; +} +#else +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + return 0; +} +#endif + /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); @@ -573,6 +597,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 7fba7711e6b3..05c3b4b2b3dd 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,13 +949,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in * RFC 1094. This implementation assumes that it's an XDR uint32. */ - entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) return -EAGAIN; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9274c9c5efea..3b0b650c9c5a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1261,6 +1261,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, static void encode_readdirplus3args(struct xdr_stream *xdr, const struct nfs3_readdirargs *args) { + uint32_t dircount = args->count; + uint32_t maxcount = args->count; __be32 *p; encode_nfs_fh3(xdr, args->fh); @@ -1273,9 +1275,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr, * readdirplus: need dircount + buffer size. * We just make sure we make dircount big enough */ - *p++ = cpu_to_be32(args->count >> 3); - - *p = cpu_to_be32(args->count); + *p++ = cpu_to_be32(dircount); + *p = cpu_to_be32(maxcount); } static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, @@ -1967,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { struct user_namespace *userns = rpc_userns(entry->server->client); - struct nfs_entry old = *entry; __be32 *p; int error; u64 new_cookie; @@ -1987,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) - return error; + return -EAGAIN; entry->d_type = DT_UNKNOWN; @@ -2003,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) - return error; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2018,24 +2018,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); - if (unlikely(error)) { - if (error == -E2BIG) - goto out_truncated; - return error; - } + if (unlikely(error)) + return -EAGAIN; } else zero_nfs_fh3(entry->fh); } - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; - -out_truncated: - dprintk("NFS: directory entry contains invalid file handle\n"); - *entry = old; - return -EAGAIN; } /* @@ -2228,6 +2219,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + result->xattr_support = 0; return 0; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 32129446beca..068c45b3bc1a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -175,28 +175,27 @@ static int handle_async_copy(struct nfs42_copy_res *res, nfs4_stateid *src_stateid, bool *restart) { - struct nfs4_copy_state *copy, *tmp_copy; + struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; int status = NFS4_OK; - bool found_pending = false; struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); struct nfs_open_context *src_ctx = nfs_file_open_context(src); - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) return -ENOMEM; spin_lock(&dst_server->nfs_client->cl_lock); - list_for_each_entry(tmp_copy, + list_for_each_entry(iter, &dst_server->nfs_client->pending_cb_stateids, copies) { - if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, + if (memcmp(&res->write_res.stateid, &iter->stateid, NFS4_STATEID_SIZE)) continue; - found_pending = true; - list_del(&tmp_copy->copies); + tmp_copy = iter; + list_del(&iter->copies); break; } - if (found_pending) { + if (tmp_copy) { spin_unlock(&dst_server->nfs_client->cl_lock); kfree(copy); copy = tmp_copy; @@ -254,7 +253,7 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst, struct nfs_commitres cres; int status = -ENOMEM; - cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); if (!cres.verf) goto out; @@ -357,7 +356,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, res->commit_res.verf = NULL; if (args->sync) { res->commit_res.verf = - kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); if (!res->commit_res.verf) return -ENOMEM; } @@ -552,7 +551,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst, if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL)) return -EOPNOTSUPP; - data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_NOFS); + data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -591,8 +590,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, ctx = get_nfs_open_context(nfs_file_open_context(src)); l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) - return PTR_ERR(l_ctx); + if (IS_ERR(l_ctx)) { + status = PTR_ERR(l_ctx); + goto out; + } status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, FMODE_READ); @@ -600,7 +601,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status) { if (status == -EAGAIN) status = -NFS4ERR_BAD_STATEID; - return status; + goto out; } status = nfs4_call_sync(src_server->client, src_server, &msg, @@ -609,6 +610,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status == -ENOTSUPP) src_server->caps &= ~NFS_CAP_COPY_NOTIFY; +out: put_nfs_open_context(nfs_file_open_context(src)); return status; } @@ -626,7 +628,7 @@ int nfs42_proc_copy_notify(struct file *src, struct file *dst, if (!(src_server->caps & NFS_CAP_COPY_NOTIFY)) return -EOPNOTSUPP; - args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS); + args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL); if (args == NULL) return -ENOMEM; @@ -1014,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, return -EOPNOTSUPP; if (n > NFS42_LAYOUTERROR_MAX) return -EINVAL; - data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS); + data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask()); if (!data) return -ENOMEM; for (i = 0; i < n; i++) { diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 1c4d2a05b401..e7b34f7e0614 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -199,7 +199,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value, flags = NFS4_XATTR_ENTRY_EXTVAL; } - buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS); + buf = kmalloc(alloclen, GFP_KERNEL); if (buf == NULL) return NULL; entry = (struct nfs4_xattr_entry *)buf; @@ -213,7 +213,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value, if (flags & NFS4_XATTR_ENTRY_EXTVAL) { - valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS); + valp = kvmalloc(len, GFP_KERNEL); if (valp == NULL) { kfree(buf); return NULL; @@ -289,8 +289,7 @@ nfs4_xattr_alloc_cache(void) { struct nfs4_xattr_cache *cache; - cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, - GFP_KERNEL_ACCOUNT | GFP_NOFS); + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL); if (cache == NULL) return NULL; @@ -998,7 +997,7 @@ int __init nfs4_xattr_cache_init(void) nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", sizeof(struct nfs4_xattr_cache), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), nfs4_xattr_cache_init_once); if (nfs4_xattr_cache_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 84f39b6f1b1e..79df6e83881b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -42,6 +42,7 @@ enum nfs4_client_state { NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, NFS4CLNT_RUN_MANAGER, + NFS4CLNT_MANAGER_AVAILABLE, NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e79ae4cbc395..7b861e4f0533 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; + fmode_t f_mode; struct iattr attr; int err; @@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; + f_mode = filp->f_mode; if ((openflags & O_ACCMODE) == 3) - return nfs_open(inode, filp); + f_mode |= flags_to_mode(openflags); /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); @@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -165,7 +167,7 @@ retry: if (sync) return -EOPNOTSUPP; cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), - GFP_NOFS); + GFP_KERNEL); if (unlikely(cn_resp == NULL)) return -ENOMEM; @@ -180,8 +182,8 @@ retry: ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count, nss, cnrs, sync); out: - if (!nfs42_files_from_same_server(file_in, file_out)) - kfree(cn_resp); + kfree(cn_resp); + if (ret == -EAGAIN) goto retry; return ret; @@ -339,7 +341,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, res = ERR_PTR(-ENOMEM); len = strlen(SSC_READ_NAME_BODY) + 16; - read_name = kzalloc(len, GFP_NOFS); + read_name = kzalloc(len, GFP_KERNEL); if (read_name == NULL) goto out; snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0e0db6c27619..16106f805ffa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1392,13 +1392,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_FH: p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE | - NFS4_ACCESS_EXECUTE; -#ifdef CONFIG_NFS_V4_2 - if (!(server->caps & NFS_CAP_XATTR)) - break; - p->o_arg.access |= NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | - NFS4_ACCESS_XALIST; -#endif + NFS4_ACCESS_EXECUTE | + nfs_access_xattr_mask(server); } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); @@ -3050,6 +3045,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) + set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -5904,7 +5901,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu buflen = server->rsize; npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; - pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS); + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -6609,7 +6606,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, }; int status = 0; - data = kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6797,7 +6794,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; - p = kzalloc(sizeof(*p), GFP_NOFS); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); @@ -7202,8 +7199,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), - fl->fl_u.nfs4_fl.owner, - recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; if (IS_SETLKW(cmd)) @@ -7626,7 +7622,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) if (server->nfs_client->cl_mvops->minor_version != 0) return; - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return; data->lsp = lsp; @@ -8012,6 +8008,18 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, .rpc_resp = &res, .rpc_cred = cred, }; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; int status; nfs_fattr_init(&locations->fattr); @@ -8019,8 +8027,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, locations->nlocations = 0; nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); - status = nfs4_call_sync_sequence(clnt, server, &msg, - &args.seq_args, &res.seq_res); + status = nfs4_call_sync_custom(&task_setup_data); if (status == NFS4_OK && res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) status = -NFS4ERR_LEASE_MOVED; @@ -8333,6 +8340,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DEADSESSION: nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + return; } if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && res->dir != NFS4_CDFS4_BOTH) { @@ -9291,7 +9299,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, goto out_err; ret = ERR_PTR(-ENOMEM); - calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); if (calldata == NULL) goto out_put_clp; nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged); @@ -9607,6 +9615,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return ERR_CAST(task); status = rpc_wait_for_completion_task(task); if (status != 0) @@ -10222,7 +10232,7 @@ static int nfs41_free_stateid(struct nfs_server *server, &task_setup.rpc_client, &msg); dprintk("NFS call free_stateid %p\n", stateid); - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->server = server; @@ -10461,6 +10471,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) return error + error2 + error3; } +static void nfs4_enable_swap(struct inode *inode) +{ + /* The state manager thread must always be running. + * It will notice the client is a swapper, and stay put. + */ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); +} + +static void nfs4_disable_swap(struct inode *inode) +{ + /* The state manager thread will now exit once it is + * woken. + */ + wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state); +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -10538,6 +10566,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .create_server = nfs4_create_server, .clone_server = nfs_clone_server, .discover_trunking = nfs4_discover_trunking, + .enable_swap = nfs4_enable_swap, + .disable_swap = nfs4_disable_swap, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f5a62c0d999b..9e1c987c81e7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/jiffies.h> +#include <linux/sched/mm.h> #include <linux/sunrpc/clnt.h> @@ -666,7 +667,7 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kzalloc(sizeof(*state), GFP_NOFS); + state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) return NULL; refcount_set(&state->count, 1); @@ -820,7 +821,7 @@ static void __nfs4_close(struct nfs4_state *state, void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(state, fmode, GFP_NOFS, 0); + __nfs4_close(state, fmode, GFP_KERNEL, 0); } void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) @@ -869,14 +870,15 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; - lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT); if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); + lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, + 0, 0, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -1205,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { + wake_up_var(&clp->cl_state); return; + } + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1224,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2560,9 +2570,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. + */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { trace_nfs4_state_mgr(clp); @@ -2657,6 +2675,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); @@ -2669,11 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); } - /* Did we race with an attempt to give us more work? */ - if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) - return; - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - return; + return; + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2686,6 +2702,7 @@ out_error: clp->cl_hostname, -status); ssleep(1); out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } @@ -2693,10 +2710,31 @@ out_drain: static int nfs4_run_state_manager(void *ptr) { struct nfs_client *clp = ptr; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; allow_signal(SIGKILL); +again: + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); + if (atomic_read(&cl->cl_swapper)) { + wait_var_event_interruptible(&clp->cl_state, + test_bit(NFS4CLNT_RUN_MANAGER, + &clp->cl_state)); + if (atomic_read(&cl->cl_swapper) && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + goto again; + /* Either no longer a swapper, or were signalled */ + } + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + goto again; + nfs_put_client(clp); - module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8e70b92df4cc..86a5f6516928 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1605,7 +1605,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; - uint32_t dircount = readdir->count >> 1; + uint32_t dircount = readdir->count; + uint32_t maxcount = readdir->count; __be32 *p, verf[2]; uint32_t attrlen = 0; unsigned int i; @@ -1618,7 +1619,6 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) @@ -1634,7 +1634,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg encode_nfs4_verifier(xdr, &readdir->verifier); p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); - *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(maxcount); *p++ = cpu_to_be32(attrlen); for (i = 0; i < attrlen; i++) *p++ = cpu_to_be32(attrs[i]); @@ -7508,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 317ce27bdc4b..012bd7339862 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -21,7 +21,6 @@ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ - { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ @@ -37,7 +36,6 @@ #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ - { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ @@ -162,6 +160,9 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); +DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done); TRACE_EVENT(nfs_access_exit, TP_PROTO( @@ -273,6 +274,122 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); +DECLARE_EVENT_CLASS(nfs_inode_range_event, + TP_PROTO( + const struct inode *inode, + loff_t range_start, + loff_t range_end + ), + + TP_ARGS(inode, range_start, range_end), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, range_start) + __field(loff_t, range_end) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->fileid = nfsi->fileid; + __entry->version = inode_peek_iversion_raw(inode); + __entry->range_start = range_start; + __entry->range_end = range_end; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "range=[%lld, %lld]", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->range_start, __entry->range_end + ) +); + +#define DEFINE_NFS_INODE_RANGE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_range_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t range_start, \ + loff_t range_end \ + ), \ + TP_ARGS(inode, range_start, range_end)) + +DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range); + +DECLARE_EVENT_CLASS(nfs_readdir_event, + TP_PROTO( + const struct file *file, + const __be32 *verifier, + u64 cookie, + pgoff_t page_index, + unsigned int dtsize + ), + + TP_ARGS(file, verifier, cookie, page_index, dtsize), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __array(char, verifier, NFS4_VERIFIER_SIZE) + __field(u64, cookie) + __field(pgoff_t, index) + __field(unsigned int, dtsize) + ), + + TP_fast_assign( + const struct inode *dir = file_inode(file); + const struct nfs_inode *nfsi = NFS_I(dir); + + __entry->dev = dir->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(dir); + if (cookie != 0) + memcpy(__entry->verifier, verifier, + NFS4_VERIFIER_SIZE); + else + memset(__entry->verifier, 0, + NFS4_VERIFIER_SIZE); + __entry->cookie = cookie; + __entry->index = page_index; + __entry->dtsize = dtsize; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "cookie=%s:0x%llx cache_index=%lu dtsize=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->version, show_nfs4_verifier(__entry->verifier), + (unsigned long long)__entry->cookie, __entry->index, + __entry->dtsize + ) +); + +#define DEFINE_NFS_READDIR_EVENT(name) \ + DEFINE_EVENT(nfs_readdir_event, name, \ + TP_PROTO( \ + const struct file *file, \ + const __be32 *verifier, \ + u64 cookie, \ + pgoff_t page_index, \ + unsigned int dtsize \ + ), \ + TP_ARGS(file, verifier, cookie, page_index, dtsize)) + +DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill); +DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached); + DECLARE_EVENT_CLASS(nfs_lookup_event, TP_PROTO( const struct inode *dir, @@ -366,6 +483,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup); +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate); TRACE_EVENT(nfs_atomic_open_enter, TP_PROTO( @@ -889,11 +1009,11 @@ TRACE_EVENT(nfs_aop_readpage_done, TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, - struct page *page, + loff_t pos, unsigned int nr_pages ), - TP_ARGS(inode, page, nr_pages), + TP_ARGS(inode, pos, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) @@ -911,7 +1031,7 @@ TRACE_EVENT(nfs_aop_readahead, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = pos; __entry->nr_pages = nr_pages; ), @@ -1095,6 +1215,97 @@ TRACE_EVENT(nfs_readpage_short, ) ); +DECLARE_EVENT_CLASS(nfs_fscache_page_event, + TP_PROTO( + const struct inode *inode, + struct page *page + ), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset + ) +); +DECLARE_EVENT_CLASS(nfs_fscache_page_event_done, + TP_PROTO( + const struct inode *inode, + struct page *page, + int error + ), + + TP_ARGS(inode, page, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->error = error; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld error=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->error + ) +); +#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_fscache_page_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct page *page \ + ), \ + TP_ARGS(inode, page)) +#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_fscache_page_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct page *page, \ + int error \ + ), \ + TP_ARGS(inode, page, error)) +DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page); +DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit); +DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page); +DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit); + TRACE_EVENT(nfs_pgio_error, TP_PROTO( const struct nfs_pgio_header *hdr, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ad7f83dc9a2d..9157dd19b8b4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) } } -static inline struct nfs_page * -nfs_page_alloc(void) +static inline struct nfs_page *nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = + kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask()); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -892,7 +892,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; - gfp_t gfp_flags = GFP_KERNEL; + gfp_t gfp_flags = nfs_io_gfp_mask(); pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); pg_array->npages = pagecount; @@ -979,7 +979,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; if (mirror_count == 1) return desc->pg_mirrors_static; - ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL); + ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask()); if (ret != NULL) { for (i = 0; i < mirror_count; i++) nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); @@ -1218,6 +1218,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); + mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7c9090a28e5c..856c962273c7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -92,6 +92,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { @@ -1233,7 +1244,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, int status = 0; *pcred = NULL; - lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask()); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); @@ -2206,7 +2217,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) struct pnfs_layout_hdr *lo; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL); + lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask()); if (!lo) goto out_unlock; if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) @@ -2249,8 +2260,8 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) { pnfs_clear_first_layoutget(lo); nfs_layoutget_end(lo); @@ -2275,8 +2286,8 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) return; data->lgp = lgp; @@ -2691,13 +2702,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r else rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - rd_size, - IOMODE_READ, - false, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), rd_size, + IOMODE_READ, false, + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2718,13 +2727,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_layout(pgio); pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - wb_size, - IOMODE_RW, - false, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), wb_size, IOMODE_RW, + false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -3183,7 +3189,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = -ENOMEM; /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), nfs_io_gfp_mask()); if (!data) goto clear_layoutcommitting; @@ -3250,7 +3256,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; - thp = kzalloc(sizeof(*thp), GFP_NOFS); + thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask()); if (!thp) { dprintk("%s mdsthreshold allocation failed\n", __func__); return NULL; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f4d7548d67b2..07f11489e4e9 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -234,6 +234,8 @@ struct pnfs_devicelist { extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); +extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ extern size_t max_response_pages(struct nfs_server *server); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 316f68f96e57..657c242a18ff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 73dcaa99fa9b..e3570c656b0f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, info->maxfilesize = 0x7FFFFFFF; info->lease_time = 0; info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + info->xattr_support = 0; return 0; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb00229c1a50..5e7657374bc3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error) struct address_space *mapping = page_file_mapping(page); if (PageUptodate(page)) - nfs_readpage_to_fscache(inode, page); + nfs_fscache_write_page(inode, page); else if (!PageError(page) && !PagePrivate(page)) generic_error_remove_page(mapping, page); unlock_page(page); @@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - - task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); trace_nfs_initiate_read(hdr); } @@ -290,9 +286,8 @@ static void nfs_readpage_result(struct rpc_task *task, } static int -readpage_async_filler(void *data, struct page *page) +readpage_async_filler(struct nfs_readdesc *desc, struct page *page) { - struct nfs_readdesc *desc = data; struct inode *inode = page_file_mapping(page)->host; unsigned int rsize = NFS_SERVER(inode)->rsize; struct nfs_page *new; @@ -306,7 +301,7 @@ readpage_async_filler(void *data, struct page *page) aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); if (!IS_SYNC(page->mapping->host)) { - error = nfs_readpage_from_fscache(page->mapping->host, page); + error = nfs_fscache_read_page(page->mapping->host, page); if (error == 0) goto out_unlock; } @@ -397,14 +392,16 @@ out_unlock: return ret; } -int nfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void nfs_readahead(struct readahead_control *ractl) { + unsigned int nr_pages = readahead_count(ractl); + struct file *file = ractl->file; struct nfs_readdesc desc; - struct inode *inode = mapping->host; + struct inode *inode = ractl->mapping->host; + struct page *page; int ret; - trace_nfs_aop_readahead(inode, lru_to_page(pages), nr_pages); + trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); ret = -ESTALE; @@ -422,14 +419,18 @@ int nfs_readpages(struct file *file, struct address_space *mapping, nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + while ((page = readahead_page(ractl)) != NULL) { + ret = readpage_async_filler(&desc, page); + put_page(page); + if (ret) + break; + } nfs_pageio_complete_read(&desc.pgio); put_nfs_open_context(desc.ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); - return ret; } int __init nfs_init_readpagecache(void) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 5fa11e1aca4c..6f325e10056c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -347,6 +347,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); + task_setup_data.task = &data->task; task_setup_data.callback_data = data; data->cred = get_current_cred(); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 987a187bd39a..f00d45cf80ef 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); - if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -104,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL); + struct nfs_pgio_header *p; - memset(p, 0, sizeof(*p)); + p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask()); + if (!p) { + p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT); + if (!p) + return NULL; + memset(p, 0, sizeof(*p)); + } p->rw_mode = FMODE_WRITE; return p; } @@ -306,7 +302,7 @@ static void nfs_set_pageerror(struct address_space *mapping) /* Force file size revalidation */ spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | - NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); spin_unlock(&inode->i_lock); } @@ -316,7 +312,10 @@ static void nfs_mapping_set_error(struct page *page, int error) struct address_space *mapping = page_file_mapping(page); SetPageError(page); - mapping_set_error(mapping, error); + filemap_set_wb_err(mapping, error); + if (mapping->host) + errseq_set(&mapping->host->i_sb->s_wb_err, + error == -ENOSPC ? -ENOSPC : -EIO); nfs_set_pageerror(mapping); } @@ -417,7 +416,7 @@ static void nfs_set_page_writeback(struct page *page) if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + nfss->write_congested = 1; } static void nfs_end_page_writeback(struct nfs_page *req) @@ -433,7 +432,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + nfss->write_congested = 0; } /* @@ -672,6 +671,10 @@ static int nfs_writepage_locked(struct page *page, struct inode *inode = page_file_mapping(page)->host; int err; + if (wbc->sync_mode == WB_SYNC_NONE && + NFS_SERVER(inode)->write_congested) + return AOP_WRITEPAGE_ACTIVATE; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); @@ -719,6 +722,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int priority = 0; int err; + if (wbc->sync_mode == WB_SYNC_NONE && + NFS_SERVER(inode)->write_congested) + return 0; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || @@ -1409,6 +1416,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, { int priority = flush_task_priority(how); + if (IS_SWAPFILE(hdr->inode)) + task_setup_data->flags |= RPC_TASK_SWAPPER; task_setup_data->priority = priority; rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client); trace_nfs_initiate_write(hdr); @@ -1821,7 +1830,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); @@ -1893,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); + nfss->write_congested = 0; nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -2049,21 +2062,21 @@ out: } EXPORT_SYMBOL_GPL(nfs_wb_all); -int nfs_wb_page_cancel(struct inode *inode, struct page *page) +int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) { struct nfs_page *req; int ret = 0; - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(page); + req = nfs_lock_and_join_requests(&folio->page); if (IS_ERR(req)) { ret = PTR_ERR(req); } else if (req) { - /* all requests from this page have been cancelled by + /* all requests from this folio have been cancelled by * nfs_lock_and_join_requests, so just remove the head * request from the inode / page_private pointer and * release it */ diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 3d1d17256a91..f6a2fd3015e7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -35,18 +35,9 @@ config NFSD_V2_ACL bool depends on NFSD -config NFSD_V3 - bool "NFS server support for NFS version 3" - depends on NFSD - help - This option enables support in your system's NFS server for - version 3 of the NFS protocol (RFC 1813). - - If unsure, say Y. - config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" - depends on NFSD_V3 + depends on NFSD select NFSD_V2_ACL help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that @@ -70,7 +61,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "NFS server support for NFS version 4" depends on NFSD && PROC_FS - select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS select CRYPTO diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 3f0983e93a99..805c06d5f1b4 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -12,9 +12,8 @@ nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o \ - stats.o filecache.o + stats.o filecache.o nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o -nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e5c0982a381d..b6d01d51a746 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -4,7 +4,6 @@ */ #include <linux/exportfs.h> #include <linux/iomap.h> -#include <linux/genhd.h> #include <linux/slab.h> #include <linux/pr.h> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8bc807c5fea4..2c1b027774d4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -7,6 +7,7 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/pagemap.h> #include <linux/sched.h> #include <linux/list_lru.h> #include <linux/fsnotify_backend.h> @@ -236,6 +237,13 @@ nfsd_file_check_write_error(struct nfsd_file *nf) } static void +nfsd_file_flush(struct nfsd_file *nf) +{ + if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static void nfsd_file_do_unhash(struct nfsd_file *nf) { lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); @@ -294,19 +302,15 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { - bool is_hashed; - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) { + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { + nfsd_file_flush(nf); nfsd_file_put_noref(nf); - return; + } else { + nfsd_file_put_noref(nf); + if (nf->nf_file) + nfsd_file_schedule_laundrette(); } - - filemap_flush(nf->nf_file->f_mapping); - is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; - nfsd_file_put_noref(nf); - if (is_hashed) - nfsd_file_schedule_laundrette(); if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } @@ -327,6 +331,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); nfsd_file_put_noref(nf); } } @@ -340,6 +345,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; if (nfsd_file_free(nf)) @@ -632,7 +638,7 @@ nfsd_file_cache_init(void) if (!nfsd_filecache_wq) goto out; - nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE, sizeof(*nfsd_file_hashtbl), GFP_KERNEL); if (!nfsd_file_hashtbl) { pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); @@ -700,7 +706,7 @@ out_err: nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kfree(nfsd_file_hashtbl); + kvfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; @@ -811,7 +817,7 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kfree(nfsd_file_hashtbl); + kvfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 2e2f1d5e9f62..070f90ed09b6 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -117,7 +117,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, da->netaddr.addr_len = snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, - "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + "%s.%d.%d", addr, port >> 8, port & 0xff); da->tightly_coupled = false; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 367551bddfc6..b5760801d377 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -249,34 +249,34 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) int w; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; if (dentry == NULL || d_really_is_negative(dentry)) - return 1; + return true; inode = d_inode(dentry); if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->mask) < 0) - return 0; + return false; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!*(rqstp->rq_next_page++)) - return 1; + return true; w -= PAGE_SIZE; } if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) - return 0; + return false; if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) - return 0; + return false; - return 1; + return true; } /* ACCESS */ @@ -286,17 +286,17 @@ nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_accessres *resp = rqstp->rq_resp; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; switch (resp->status) { case nfs_ok: if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->access) < 0) - return 0; + return false; break; } - return 1; + return true; } /* diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6d1b5bb051c5..2c05692a9abf 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -422,7 +422,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); if (!new) return nfserr_jukebox; - memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + memcpy(&new->lo_seg, seg, sizeof(new->lo_seg)); new->lo_state = ls; spin_lock(&fp->fi_lock); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 32063733443d..234e852fcdfa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4711,6 +4711,14 @@ nfsd_break_deleg_cb(struct file_lock *fl) return ret; } +/** + * nfsd_breaker_owns_lease - Check if lease conflict was resolved + * @fl: Lock state to check + * + * Return values: + * %true: Lease conflict was resolved + * %false: Lease conflict was not resolved. + */ static bool nfsd_breaker_owns_lease(struct file_lock *fl) { struct nfs4_delegation *dl = fl->fl_owner; @@ -4718,11 +4726,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) struct nfs4_client *clp; if (!i_am_nfsd()) - return NULL; + return false; rqst = kthread_data(current); /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */ if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4) - return NULL; + return false; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } @@ -6526,7 +6534,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) } static fl_owner_t -nfsd4_fl_get_owner(fl_owner_t owner) +nfsd4_lm_get_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; @@ -6535,7 +6543,7 @@ nfsd4_fl_get_owner(fl_owner_t owner) } static void -nfsd4_fl_put_owner(fl_owner_t owner) +nfsd4_lm_put_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; @@ -6570,8 +6578,8 @@ nfsd4_lm_notify(struct file_lock *fl) static const struct lock_manager_operations nfsd_posix_mng_ops = { .lm_notify = nfsd4_lm_notify, - .lm_get_owner = nfsd4_fl_get_owner, - .lm_put_owner = nfsd4_fl_put_owner, + .lm_get_owner = nfsd4_lm_get_owner, + .lm_put_owner = nfsd4_lm_put_owner, }; static inline void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 714a3a3bd50c..da92e7d2ab6a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2854,6 +2854,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; + if (!(stat.result_mask & STATX_BTIME)) + /* underlying FS does not offer btime so we can't share it */ + bmval1 &= ~FATTR4_WORD1_TIME_CREATE; if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | @@ -3254,6 +3257,13 @@ out_acl: p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); *p++ = cpu_to_be32(stat.mtime.tv_nsec); } + if (bmval1 & FATTR4_WORD1_TIME_CREATE) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec); + *p++ = cpu_to_be32(stat.btime.tv_nsec); + } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { struct kstat parent_stat; u64 ino = stat.ino; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index a4a69ab6ab28..0b3f12aa37ff 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static u32 -nfsd_cache_hash(__be32 xid, struct nfsd_net *nn) -{ - return hash_32((__force u32)xid, nn->maskbits); -} - static struct svc_cacherep * nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, struct nfsd_net *nn) @@ -241,6 +235,14 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) list_move_tail(&rp->c_lru, &b->lru_head); } +static noinline struct nfsd_drc_bucket * +nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) +{ + unsigned int hash = hash_32((__force u32)xid, nn->maskbits); + + return &nn->drc_hashtbl[hash]; +} + static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, unsigned int max) { @@ -419,12 +421,10 @@ out: */ int nfsd_cache_lookup(struct svc_rqst *rqstp) { - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_net *nn; struct svc_cacherep *rp, *found; - __be32 xid = rqstp->rq_xid; __wsum csum; - u32 hash = nfsd_cache_hash(xid, nn); - struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash]; + struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; int rtn = RC_DOIT; @@ -440,17 +440,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ + nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_reply_cache_alloc(rqstp, csum, nn); if (!rp) goto out; + b = nfsd_cache_bucket_find(rqstp->rq_xid, nn); spin_lock(&b->cache_lock); found = nfsd_cache_insert(b, rp, nn); - if (found != rp) { - nfsd_reply_cache_free_locked(NULL, rp, nn); - rp = found; + if (found != rp) goto found_entry; - } nfsd_stats_rc_misses_inc(); rqstp->rq_cacherep = rp; @@ -468,8 +467,10 @@ out: found_entry: /* We found a matching entry which is either in progress or done. */ + nfsd_reply_cache_free_locked(NULL, rp, nn); nfsd_stats_rc_hits_inc(); rtn = RC_DROPIT; + rp = found; /* Request being processed */ if (rp->c_state == RC_INPROG) @@ -528,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; - u32 hash; struct nfsd_drc_bucket *b; int len; size_t bufsize = 0; @@ -536,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) if (!rp) return; - hash = nfsd_cache_hash(rp->c_key.k_xid, nn); - b = &nn->drc_hashtbl[hash]; + b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn); len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); len >>= 2; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 68b020f2002b..16920e4512bd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -772,13 +772,13 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err != 0) return err; - err = svc_create_xprt(nn->nfsd_serv, transport, net, - PF_INET, port, SVC_SOCK_ANONYMOUS, cred); + err = svc_xprt_create(nn->nfsd_serv, transport, net, + PF_INET, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0) goto out_err; - err = svc_create_xprt(nn->nfsd_serv, transport, net, - PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); + err = svc_xprt_create(nn->nfsd_serv, transport, net, + PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; @@ -790,7 +790,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr out_close: xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); if (xprt != NULL) { - svc_close_xprt(xprt); + svc_xprt_close(xprt); svc_xprt_put(xprt); } out_err: diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 3e5008b475ff..4fc1fd639527 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -364,7 +364,7 @@ void nfsd_lockd_shutdown(void); | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ - | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) #define NFSD4_SUPPORTED_ATTRS_WORD2 0 diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 145208bcb9bd..c29baa03dfaf 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -611,8 +611,6 @@ out_negative: return nfserr_serverfault; } -#ifdef CONFIG_NFSD_V3 - /** * fh_fill_pre_attrs - Fill in pre-op attributes * @fhp: file handle to be updated @@ -673,8 +671,6 @@ void fh_fill_post_attrs(struct svc_fh *fhp) nfsd4_change_attribute(&fhp->fh_post_attr, inode); } -#endif /* CONFIG_NFSD_V3 */ - /* * Release a file handle. */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 434930d8a946..fb9d358a267e 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -90,7 +90,6 @@ typedef struct svc_fh { * operation */ int fh_flags; /* FH flags */ -#ifdef CONFIG_NFSD_V3 bool fh_post_saved; /* post-op attrs saved */ bool fh_pre_saved; /* pre-op attrs saved */ @@ -107,7 +106,6 @@ typedef struct svc_fh { /* Post-op attributes saved in fh_unlock */ struct kstat fh_post_attr; /* full attrs after operation */ u64 fh_post_change; /* nfsv4 change; see above */ -#endif /* CONFIG_NFSD_V3 */ } svc_fh; #define NFSD4_FH_FOREIGN (1<<0) #define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) @@ -283,8 +281,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) } #endif -#ifdef CONFIG_NFSD_V3 - /** * fh_clear_pre_post_attrs - Reset pre/post attributes * @fhp: file handle to be updated @@ -327,22 +323,6 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat, extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); -#else /* !CONFIG_NFSD_V3 */ - -static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) -{ -} - -static inline void fh_fill_pre_attrs(struct svc_fh *fhp) -{ -} - -static inline void fh_fill_post_attrs(struct svc_fh *fhp) -{ -} - -#endif /* !CONFIG_NFSD_V3 */ - /* * Lock a file handle/inode diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 18b8eb43a19b..fcdab8a8a41f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) unsigned long cnt = argp->len; unsigned int nvecs; - dprintk("nfsd: WRITE %s %d bytes at %d\n", + dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b8c682b62d29..4bb5baa17040 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -117,9 +117,7 @@ static struct svc_stat nfsd_acl_svcstats = { static const struct svc_version *nfsd_version[] = { [2] = &nfsd_version2, -#if defined(CONFIG_NFSD_V3) [3] = &nfsd_version3, -#endif #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif @@ -293,13 +291,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred) if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS, cred); + error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; - error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS, cred); + error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; @@ -612,13 +610,6 @@ static int nfsd_get_default_max_blksize(void) return ret; } -static const struct svc_serv_ops nfsd_thread_sv_ops = { - .svo_shutdown = nfsd_last_thread, - .svo_function = nfsd, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_module = THIS_MODULE, -}; - void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -657,8 +648,7 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - &nfsd_thread_sv_ops); + serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; @@ -724,7 +714,8 @@ void nfsd_put(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { - svc_shutdown_net(nn->nfsd_serv, net); + svc_xprt_destroy_all(nn->nfsd_serv, net); + nfsd_last_thread(nn->nfsd_serv, net); svc_destroy(&nn->nfsd_serv->sv_refcnt); spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; @@ -1019,8 +1010,6 @@ out: msleep(20); } - /* Release module */ - module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 5889db66409d..242fa123e0e9 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -13,22 +13,6 @@ #include "export.h" #include "nfsfh.h" -#define NFSD_TRACE_PROC_ARG_FIELDS \ - __field(unsigned int, netns_ino) \ - __field(u32, xid) \ - __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ - __array(unsigned char, client, sizeof(struct sockaddr_in6)) - -#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \ - do { \ - __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ - __entry->xid = be32_to_cpu(rqstp->rq_xid); \ - memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ - rqstp->rq_xprt->xpt_locallen); \ - memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ - rqstp->rq_xprt->xpt_remotelen); \ - } while (0); - #define NFSD_TRACE_PROC_RES_FIELDS \ __field(unsigned int, netns_ino) \ __field(u32, xid) \ @@ -53,16 +37,22 @@ DECLARE_EVENT_CLASS(nfsd_xdr_err_class, ), TP_ARGS(rqstp), TP_STRUCT__entry( - NFSD_TRACE_PROC_ARG_FIELDS - + __field(unsigned int, netns_ino) + __field(u32, xid) __field(u32, vers) __field(u32, proc) + __sockaddr(server, rqstp->rq_xprt->xpt_locallen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) ), TP_fast_assign( - NFSD_TRACE_PROC_ARG_ASSIGNMENTS + const struct svc_xprt *xprt = rqstp->rq_xprt; + __entry->netns_ino = xprt->xpt_net->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->vers = rqstp->rq_vers; __entry->proc = rqstp->rq_proc; + __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen); + __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen); ), TP_printk("xid=0x%08x vers=%u proc=%u", __entry->xid, __entry->vers, __entry->proc @@ -613,20 +603,21 @@ TRACE_EVENT(nfsd_clid_cred_mismatch, __field(u32, cl_id) __field(unsigned long, cl_flavor) __field(unsigned long, new_flavor) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; __entry->cl_flavor = clp->cl_cred.cr_flavor; __entry->new_flavor = rqstp->rq_cred.cr_flavor; - memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); ), TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc", __entry->cl_boot, __entry->cl_id, show_nfsd_authflavor(__entry->cl_flavor), - show_nfsd_authflavor(__entry->new_flavor), __entry->addr + show_nfsd_authflavor(__entry->new_flavor), + __get_sockaddr(addr) ) ) @@ -642,7 +633,7 @@ TRACE_EVENT(nfsd_clid_verf_mismatch, __field(u32, cl_id) __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE) __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -651,14 +642,14 @@ TRACE_EVENT(nfsd_clid_verf_mismatch, NFS4_VERIFIER_SIZE); memcpy(__entry->new_verifier, (void *)verf, NFS4_VERIFIER_SIZE); - memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); ), TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc", __entry->cl_boot, __entry->cl_id, __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE), __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE), - __entry->addr + __get_sockaddr(addr) ) ); @@ -908,18 +899,17 @@ TRACE_EVENT(nfsd_cb_args, __field(u32, cl_id) __field(u32, prog) __field(u32, ident) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, conn->cb_addrlen) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; __entry->prog = conn->cb_prog; __entry->ident = conn->cb_ident; - memcpy(__entry->addr, &conn->cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen); ), TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->prog, __entry->ident) ); @@ -951,17 +941,17 @@ DECLARE_EVENT_CLASS(nfsd_cb_class, __field(unsigned long, state) __field(u32, cl_boot) __field(u32, cl_id) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) ), TP_fast_assign( __entry->state = clp->cl_cb_state; __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x state=%s", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, show_cb_state(__entry->state)) ); @@ -1001,7 +991,7 @@ TRACE_EVENT(nfsd_cb_setup, __field(u32, cl_boot) __field(u32, cl_id) __field(unsigned long, authflavor) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) __array(unsigned char, netid, 8) ), TP_fast_assign( @@ -1009,11 +999,11 @@ TRACE_EVENT(nfsd_cb_setup, __entry->cl_id = clp->cl_clientid.cl_id; strlcpy(__entry->netid, netid, sizeof(__entry->netid)); __entry->authflavor = authflavor; - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->netid, show_nfsd_authflavor(__entry->authflavor)) ); @@ -1027,30 +1017,32 @@ TRACE_EVENT(nfsd_cb_setup_err, __field(long, error) __field(u32, cl_boot) __field(u32, cl_id) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) ), TP_fast_assign( __entry->error = error; __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x error=%ld", - __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error) + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->error) ); -TRACE_EVENT(nfsd_cb_recall, +TRACE_EVENT_CONDITION(nfsd_cb_recall, TP_PROTO( const struct nfs4_stid *stid ), TP_ARGS(stid), + TP_CONDITION(stid->sc_client), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) __field(u32, si_id) __field(u32, si_generation) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen) ), TP_fast_assign( const stateid_t *stp = &stid->sc_stateid; @@ -1060,14 +1052,11 @@ TRACE_EVENT(nfsd_cb_recall, __entry->cl_id = stp->si_opaque.so_clid.cl_id; __entry->si_id = stp->si_opaque.so_id; __entry->si_generation = stp->si_generation; - if (clp) - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); - else - memset(__entry->addr, 0, sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation) ); @@ -1081,7 +1070,7 @@ TRACE_EVENT(nfsd_cb_notify_lock, __field(u32, cl_boot) __field(u32, cl_id) __field(u32, fh_hash) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen) ), TP_fast_assign( const struct nfs4_client *clp = lo->lo_owner.so_client; @@ -1089,11 +1078,11 @@ TRACE_EVENT(nfsd_cb_notify_lock, __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh); - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->fh_hash) ); @@ -1114,7 +1103,7 @@ TRACE_EVENT(nfsd_cb_offload, __field(u32, fh_hash) __field(int, status) __field(u64, count) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) ), TP_fast_assign( __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; @@ -1124,11 +1113,11 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash = knfsd_fh_hash(fh); __entry->status = be32_to_cpu(status); __entry->count = count; - memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, - sizeof(struct sockaddr_in6)); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) ), TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d", - __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation, __entry->fh_hash, __entry->count, __entry->status) ); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 91600e71be19..c22ad0532e8e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -26,15 +26,14 @@ #include <linux/xattr.h> #include <linux/jhash.h> #include <linux/ima.h> +#include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> -#ifdef CONFIG_NFSD_V3 #include "xdr3.h" -#endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 #include "../internal.h" @@ -608,7 +607,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif /* defined(CONFIG_NFSD_V4) */ -#ifdef CONFIG_NFSD_V3 /* * Check server access rights to a file system object */ @@ -720,7 +718,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor out: return error; } -#endif /* CONFIG_NFSD_V3 */ int nfsd_open_break_lease(struct inode *inode, int access) { @@ -1113,7 +1110,6 @@ out: return err; } -#ifdef CONFIG_NFSD_V3 /** * nfsd_commit - Commit pending writes to stable storage * @rqstp: RPC request being processed @@ -1190,7 +1186,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, out: return err; } -#endif /* CONFIG_NFSD_V3 */ static __be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, @@ -1380,8 +1375,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, rdev, resfhp); } -#ifdef CONFIG_NFSD_V3 - /* * NFSv3 and NFSv4 version of nfsd_create */ @@ -1547,7 +1540,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserrno(host_err); goto out; } -#endif /* CONFIG_NFSD_V3 */ /* * Read a symlink. On entry, *lenp must contain the maximum path length that diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2c43d10e3cab..ccb87b2864f6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -68,7 +68,6 @@ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); -#ifdef CONFIG_NFSD_V3 __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, @@ -76,7 +75,6 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, u64 offset, u32 count, __be32 *verf); -#endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, void **bufp, int *lenp); diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 528fb299430e..852f71580bd0 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -32,7 +32,7 @@ struct nfsd_readargs { struct nfsd_writeargs { svc_fh fh; __u32 offset; - int len; + __u32 len; struct xdr_buf payload; }; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 66bdaa2cf496..ca611ac09f7c 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -20,6 +20,23 @@ #include "page.h" #include "btnode.h" + +/** + * nilfs_init_btnc_inode - initialize B-tree node cache inode + * @btnc_inode: inode to be initialized + * + * nilfs_init_btnc_inode() sets up an inode for B-tree node cache. + */ +void nilfs_init_btnc_inode(struct inode *btnc_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(btnc_inode); + + btnc_inode->i_mode = S_IFREG; + ii->i_flags = 0; + memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); +} + void nilfs_btnode_cache_clear(struct address_space *btnc) { invalidate_mapping_pages(btnc, 0, -1); @@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc) struct buffer_head * nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) { - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct buffer_head *bh; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); @@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct page *page; int err; @@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *obh, *nbh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; int err; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 11663650add7..bd5544e63a01 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; +void nilfs_init_btnc_inode(struct inode *btnc_inode); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3594eabe1419..f544c22fff78 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path) static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); @@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp, const struct nilfs_btree_readahead_info *ra) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh, *ra_bh; sector_t submit_ptr = 0; int ret; @@ -1741,6 +1743,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, dat = nilfs_bmap_get_dat(btree); } + ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode); + if (ret < 0) + return ret; + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; @@ -1913,7 +1919,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, @@ -1939,7 +1945,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } @@ -1958,7 +1964,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); } @@ -2134,7 +2140,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { - struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct pagevec pvec; struct buffer_head *bh, *head; @@ -2188,12 +2195,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } @@ -2398,6 +2405,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap) if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; + else + ret = nilfs_attach_btree_node_cache( + &NILFS_BMAP_I(bmap)->vfs_inode); + return ret; } diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index dc51d3b7a7bf..3b55e239705f 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, di = NILFS_DAT_I(dat); lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); nilfs_palloc_setup_cache(dat, &di->palloc_cache); - nilfs_mdt_setup_shadow_map(dat, &di->shadow); + err = nilfs_mdt_setup_shadow_map(dat, &di->shadow); + if (err) + goto failed; err = nilfs_read_inode_common(dat, raw_inode); if (err) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index a8f5315f01e3..04fdd420eae7 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { + struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode; int ret; - ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn, REQ_OP_READ, 0, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ @@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); - return 0; + return nilfs_attach_btree_node_cache(inode); } /** @@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e3d807d5b83a..6045cea21f52 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -29,12 +29,16 @@ * @cno: checkpoint number * @root: pointer on NILFS root object (mounted checkpoint) * @for_gc: inode for GC flag + * @for_btnc: inode for B-tree node cache flag + * @for_shadow: inode for shadowed page cache flag */ struct nilfs_iget_args { u64 ino; __u64 cno; struct nilfs_root *root; - int for_gc; + bool for_gc; + bool for_btnc; + bool for_shadow; }; static int nilfs_iget_test(struct inode *inode, void *opaque); @@ -199,23 +203,22 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) return 0; } -static int nilfs_set_page_dirty(struct page *page) +static bool nilfs_dirty_folio(struct address_space *mapping, + struct folio *folio) { - struct inode *inode = page->mapping->host; - int ret = __set_page_dirty_nobuffers(page); + struct inode *inode = mapping->host; + struct buffer_head *head; + unsigned int nr_dirty = 0; + bool ret = filemap_dirty_folio(mapping, folio); - if (page_has_buffers(page)) { - unsigned int nr_dirty = 0; - struct buffer_head *bh, *head; + /* + * The page may not be locked, eg if called from try_to_unmap_one() + */ + spin_lock(&mapping->private_lock); + head = folio_buffers(folio); + if (head) { + struct buffer_head *bh = head; - /* - * This page is locked by callers, and no other thread - * concurrently marks its buffers dirty since they are - * only dirtied through routines in fs/buffer.c in - * which call sites of mark_buffer_dirty are protected - * by page lock. - */ - bh = head = page_buffers(page); do { /* Do not mark hole blocks dirty */ if (buffer_dirty(bh) || !buffer_mapped(bh)) @@ -224,14 +227,13 @@ static int nilfs_set_page_dirty(struct page *page) set_buffer_dirty(bh); nr_dirty++; } while (bh = bh->b_this_page, bh != head); - - if (nr_dirty) - nilfs_set_file_dirty(inode, nr_dirty); } else if (ret) { - unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); + } + spin_unlock(&mapping->private_lock); + if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); - } return ret; } @@ -299,12 +301,12 @@ const struct address_space_operations nilfs_aops = { .writepage = nilfs_writepage, .readpage = nilfs_readpage, .writepages = nilfs_writepages, - .set_page_dirty = nilfs_set_page_dirty, + .dirty_folio = nilfs_dirty_folio, .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ - .invalidatepage = block_invalidatepage, + .invalidate_folio = block_invalidate_folio, .direct_IO = nilfs_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, }; @@ -314,7 +316,8 @@ static int nilfs_insert_inode_locked(struct inode *inode, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); @@ -527,6 +530,19 @@ static int nilfs_iget_test(struct inode *inode, void *opaque) return 0; ii = NILFS_I(inode); + if (test_bit(NILFS_I_BTNC, &ii->i_state)) { + if (!args->for_btnc) + return 0; + } else if (args->for_btnc) { + return 0; + } + if (test_bit(NILFS_I_SHADOW, &ii->i_state)) { + if (!args->for_shadow) + return 0; + } else if (args->for_shadow) { + return 0; + } + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) return !args->for_gc; @@ -538,15 +554,17 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) struct nilfs_iget_args *args = opaque; inode->i_ino = args->ino; - if (args->for_gc) { + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = args->root; + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + + if (args->for_gc) NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); - NILFS_I(inode)->i_cno = args->cno; - NILFS_I(inode)->i_root = NULL; - } else { - if (args->root && args->ino == NILFS_ROOT_INO) - nilfs_get_root(args->root); - NILFS_I(inode)->i_root = args->root; - } + if (args->for_btnc) + NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC); + if (args->for_shadow) + NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW); return 0; } @@ -554,7 +572,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return ilookup5(sb, ino, nilfs_iget_test, &args); @@ -564,7 +583,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); @@ -595,7 +615,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno) { struct nilfs_iget_args args = { - .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + .ino = ino, .root = NULL, .cno = cno, .for_gc = true, + .for_btnc = false, .for_shadow = false }; struct inode *inode; int err; @@ -615,6 +636,113 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, return inode; } +/** + * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode + * @inode: inode object + * + * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode, + * or does nothing if the inode already has it. This function allocates + * an additional inode to maintain page cache of B-tree nodes one-on-one. + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode; + struct nilfs_iget_args args; + + if (ii->i_assoc_inode) + return 0; + + args.ino = inode->i_ino; + args.root = ii->i_root; + args.cno = ii->i_cno; + args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0; + args.for_btnc = true; + args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0; + + btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!btnc_inode)) + return -ENOMEM; + if (btnc_inode->i_state & I_NEW) { + nilfs_init_btnc_inode(btnc_inode); + unlock_new_inode(btnc_inode); + } + NILFS_I(btnc_inode)->i_assoc_inode = inode; + NILFS_I(btnc_inode)->i_bmap = ii->i_bmap; + ii->i_assoc_inode = btnc_inode; + + return 0; +} + +/** + * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode + * @inode: inode object + * + * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its + * holder inode bound to @inode, or does nothing if @inode doesn't have it. + */ +void nilfs_detach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode = ii->i_assoc_inode; + + if (btnc_inode) { + NILFS_I(btnc_inode)->i_assoc_inode = NULL; + ii->i_assoc_inode = NULL; + iput(btnc_inode); + } +} + +/** + * nilfs_iget_for_shadow - obtain inode for shadow mapping + * @inode: inode object that uses shadow mapping + * + * nilfs_iget_for_shadow() allocates a pair of inodes that holds page + * caches for shadow mapping. The page cache for data pages is set up + * in one inode and the one for b-tree node pages is set up in the + * other inode, which is attached to the former inode. + * + * Return Value: On success, a pointer to the inode for data pages is + * returned. On errors, one of the following negative error code is returned + * in a pointer type. + * + * %-ENOMEM - Insufficient memory available. + */ +struct inode *nilfs_iget_for_shadow(struct inode *inode) +{ + struct nilfs_iget_args args = { + .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = true + }; + struct inode *s_inode; + int err; + + s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!s_inode)) + return ERR_PTR(-ENOMEM); + if (!(s_inode->i_state & I_NEW)) + return inode; + + NILFS_I(s_inode)->i_flags = 0; + memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + + err = nilfs_attach_btree_node_cache(s_inode); + if (unlikely(err)) { + iget_failed(s_inode); + return ERR_PTR(err); + } + unlock_new_inode(s_inode); + return s_inode; +} + void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode, int has_bmap) { @@ -762,7 +890,8 @@ static void nilfs_clear_inode(struct inode *inode) if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + if (!test_bit(NILFS_I_BTNC, &ii->i_state)) + nilfs_detach_btree_node_cache(inode); if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) nilfs_put_root(ii->i_root); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 4b3d33cf0041..d29a0f2b9c16 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -434,7 +434,8 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) static const struct address_space_operations def_mdt_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .writepage = nilfs_mdt_write_page, }; @@ -470,9 +471,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) void nilfs_mdt_clear(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mdi->mi_shadow; if (mdi->mi_palloc_cache) nilfs_palloc_destroy_cache(inode); + + if (shadow) { + struct inode *s_inode = shadow->inode; + + shadow->inode = NULL; + iput(s_inode); + mdi->mi_shadow = NULL; + } } /** @@ -506,12 +516,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct inode *s_inode; INIT_LIST_HEAD(&shadow->frozen_buffers); - address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode); - address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode); + + s_inode = nilfs_iget_for_shadow(inode); + if (IS_ERR(s_inode)) + return PTR_ERR(s_inode); + + shadow->inode = s_inode; mi->mi_shadow = shadow; return 0; } @@ -525,14 +538,15 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode) struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *s_inode = shadow->inode; int ret; - ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping); if (ret) goto out; - ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, - &ii->i_btnode_cache); + ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping, + ii->i_assoc_inode->i_mapping); if (ret) goto out; @@ -548,7 +562,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits; - page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); if (!page) return -ENOMEM; @@ -580,7 +594,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n; - page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; @@ -621,10 +635,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) nilfs_palloc_clear_cache(inode); nilfs_clear_dirty_pages(inode->i_mapping, true); - nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping); - nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); - nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true); + nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, + NILFS_I(shadow->inode)->i_assoc_inode->i_mapping); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); @@ -639,10 +654,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode; down_write(&mi->mi_sem); nilfs_release_frozen_buffers(shadow); - truncate_inode_pages(&shadow->frozen_data, 0); - truncate_inode_pages(&shadow->frozen_btnodes, 0); + truncate_inode_pages(shadow->inode->i_mapping, 0); + truncate_inode_pages(shadow_btnc_inode->i_mapping, 0); up_write(&mi->mi_sem); } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 8f86080a436d..9e23bab3ff12 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -18,14 +18,12 @@ /** * struct nilfs_shadow_map - shadow mapping of meta data file * @bmap_store: shadow copy of bmap state - * @frozen_data: shadowed dirty data pages - * @frozen_btnodes: shadowed dirty b-tree nodes' pages + * @inode: holder of page caches used in shadow mapping * @frozen_buffers: list of frozen buffers */ struct nilfs_shadow_map { struct nilfs_bmap_store bmap_store; - struct address_space frozen_data; - struct address_space frozen_btnodes; + struct inode *inode; struct list_head frozen_buffers; }; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a7b81755c350..1344f7d475d3 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -28,7 +28,7 @@ * @i_xattr: <TODO> * @i_dir_start_lookup: page index of last successful search * @i_cno: checkpoint number for GC inode - * @i_btnode_cache: cached pages of b-tree nodes + * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer) * @i_dirty: list for connecting dirty files * @xattr_sem: semaphore for extended attributes processing * @i_bh: buffer contains disk inode @@ -43,7 +43,7 @@ struct nilfs_inode_info { __u64 i_xattr; /* sector_t ??? */ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ - struct address_space i_btnode_cache; + struct inode *i_assoc_inode; struct list_head i_dirty; /* List for connecting dirty files */ #ifdef CONFIG_NILFS_XATTR @@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap) return container_of(bmap, struct nilfs_inode_info, i_bmap_data); } -static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) -{ - struct nilfs_inode_info *ii = - container_of(btnc, struct nilfs_inode_info, i_btnode_cache); - return &ii->vfs_inode; -} - /* * Dynamic state flags of NILFS on-memory inode (i_state) */ @@ -98,6 +91,8 @@ enum { NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_BTNC, /* inode for btree node cache */ + NILFS_I_SHADOW, /* inode for shadowed page cache */ }; /* @@ -267,6 +262,9 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); +int nilfs_attach_btree_node_cache(struct inode *inode); +void nilfs_detach_btree_node_cache(struct inode *inode); +struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 063dd16d75b5..a8e88cc38e16 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -436,22 +436,12 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) -{ - mapping->host = inode; - mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; - mapping->a_ops = &empty_aops; -} - /* * NILFS2 needs clear_page_dirty() in the following two cases: * - * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears - * page dirty flags when it copies back pages from the shadow cache - * (gcdat->{i_mapping,i_btnode_cache}) to its original cache - * (dat->{i_mapping,i_btnode_cache}). + * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty + * flag of pages when it copies back pages from shadow cache to the + * original cache. * * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 569263b23c0c..21ddcdd4d63e 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -43,7 +43,6 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, unsigned int); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 43287b0d3e9b..1362ccb64ec7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -337,26 +337,12 @@ static void nilfs_end_bio_write(struct bio *bio) } static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi, int mode, - int mode_flags) + struct nilfs_write_info *wi) { struct bio *bio = wi->bio; - int err; - - if (segbuf->sb_nbio > 0 && - bdi_write_congested(segbuf->sb_super->s_bdi)) { - wait_for_completion(&segbuf->sb_bio_event); - segbuf->sb_nbio--; - if (unlikely(atomic_read(&segbuf->sb_err))) { - bio_put(bio); - err = -EIO; - goto failed; - } - } bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_set_op_attrs(bio, mode, mode_flags); submit_bio(bio); segbuf->sb_nbio++; @@ -365,33 +351,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end; return 0; - - failed: - wi->bio = NULL; - return err; -} - -/** - * nilfs_alloc_seg_bio - allocate a new bio for writing log - * @nilfs: nilfs object - * @start: start block number of the bio - * @nr_vecs: request size of page vector. - * - * Return Value: On success, pointer to the struct bio is returned. - * On error, NULL is returned. - */ -static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, - int nr_vecs) -{ - struct bio *bio; - - bio = bio_alloc(GFP_NOIO, nr_vecs); - if (likely(bio)) { - bio_set_dev(bio, nilfs->ns_bdev); - bio->bi_iter.bi_sector = - start << (nilfs->ns_blocksize_bits - 9); - } - return bio; } static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, @@ -407,17 +366,17 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi, - struct buffer_head *bh, int mode) + struct buffer_head *bh) { int len, err; BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, - wi->nr_vecs); - if (unlikely(!wi->bio)) - return -ENOMEM; + wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, + REQ_OP_WRITE, GFP_NOIO); + wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << + (wi->nilfs->ns_blocksize_bits - 9); } len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -426,7 +385,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, return 0; } /* bio is FULL */ - err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0); + err = nilfs_segbuf_submit_bio(segbuf, wi); /* never submit current bh */ if (likely(!err)) goto repeat; @@ -456,13 +415,13 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } @@ -472,8 +431,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE, - REQ_SYNC); + wi.bio->bi_opf |= REQ_SYNC; + res = nilfs_segbuf_submit_bio(segbuf, &wi); } failed_bio: diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 85a853334771..0afe0832c754 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -733,15 +733,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, struct list_head *listp) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct address_space *mapping = &ii->i_btnode_cache; + struct inode *btnc_inode = ii->i_assoc_inode; struct pagevec pvec; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; + if (!btnc_inode) + return; + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, + while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); @@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) continue; list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 63e5fa74016c..ba108f915391 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -151,13 +151,14 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; - ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS); if (!ii) return NULL; ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); + ii->i_assoc_inode = NULL; + ii->i_bmap = &ii->i_bmap_data; return &ii->vfs_inode; } @@ -1377,8 +1378,6 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - address_space_init_once(&ii->i_btnode_cache); - ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2ff6bd85ba8f..9b32b76a9c30 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1003,17 +1003,18 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags, __u32 umask, int *destroy) { - __u32 oldmask = 0; + __u32 oldmask, newmask; /* umask bits cannot be removed by user */ mask &= ~umask; spin_lock(&fsn_mark->lock); + oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; fsn_mark->mask &= ~mask; } else { fsn_mark->ignored_mask &= ~mask; } + newmask = fsnotify_calc_mask(fsn_mark); /* * We need to keep the mark around even if remaining mask cannot * result in any events (e.g. mask == FAN_ONDIR) to support incremenal @@ -1023,7 +1024,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); spin_unlock(&fsn_mark->lock); - return mask & oldmask; + return oldmask & ~newmask; } static int fanotify_remove_mark(struct fsnotify_group *group, @@ -1080,24 +1081,42 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, flags, umask); } +static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, unsigned int flags, + __u32 *removed) +{ + fsn_mark->ignored_mask |= mask; + + /* + * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to + * the removal of the FS_MODIFY bit in calculated mask if it was set + * because of an ignored mask that is now going to survive FS_MODIFY. + */ + if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + if (!(fsn_mark->mask & FS_MODIFY)) + *removed = FS_MODIFY; + } +} + static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, - unsigned int flags) + __u32 mask, unsigned int flags, + __u32 *removed) { - __u32 oldmask = -1; + __u32 oldmask, newmask; spin_lock(&fsn_mark->lock); + oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; fsn_mark->mask |= mask; } else { - fsn_mark->ignored_mask |= mask; - if (flags & FAN_MARK_IGNORED_SURV_MODIFY) - fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed); } + newmask = fsnotify_calc_mask(fsn_mark); spin_unlock(&fsn_mark->lock); - return mask & ~oldmask; + return newmask & ~oldmask; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, @@ -1155,7 +1174,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; - __u32 added; + __u32 added, removed = 0; int ret = 0; mutex_lock(&group->mark_mutex); @@ -1178,8 +1197,8 @@ static int fanotify_add_mark(struct fsnotify_group *group, goto out; } - added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - if (added & ~fsnotify_conn_mask(fsn_mark->connector)) + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed); + if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector))) fsnotify_recalc_mask(fsn_mark->connector); out: diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ab81a0776ece..70a8516b78bc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - if (iput_inode) - iput(iput_inode); + iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); @@ -85,8 +84,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) } spin_unlock(&sb->s_inode_list_lock); - if (iput_inode) - iput(iput_inode); + iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) @@ -531,11 +529,13 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * if this is a modify event we may need to clear the ignored masks - * otherwise return if none of the marks care about this type of event. + * If this is a modify event we may need to clear some ignored masks. + * In that case, the object with ignored masks will have the FS_MODIFY + * event in its mask. + * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (!(mask & FS_MODIFY) && !(test_mask & marks_mask)) + if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 9007d6affff3..4853184f7dde 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -127,7 +127,7 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) return; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) - new_mask |= mark->mask; + new_mask |= fsnotify_calc_mask(mark); } *fsnotify_conn_mask_p(conn) = new_mask; } @@ -692,7 +692,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, if (ret) goto err; - if (mark->mask) + if (mark->mask || mark->ignored_mask) fsnotify_recalc_mask(mark->connector); return ret; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index bb0a43860ad2..90e3dad8ee45 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -593,12 +593,12 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) iblock = initialized_size >> blocksize_bits; /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ @@ -653,7 +653,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // block_dirty_folio(); // put_page() // We don't need to wait on the writes. // Update iblock. @@ -1350,12 +1350,13 @@ retry_writepage: /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) { + struct folio *folio = page_folio(page); /* * The page may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ - block_invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + block_invalidate_folio(folio, 0, folio_size(folio)); + folio_unlock(folio); ntfs_debug("Write outside i_size - truncated?"); return 0; } @@ -1653,7 +1654,7 @@ const struct address_space_operations ntfs_normal_aops = { .readpage = ntfs_readpage, #ifdef NTFS_RW .writepage = ntfs_writepage, - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .bmap = ntfs_bmap, .migratepage = buffer_migrate_page, @@ -1668,7 +1669,7 @@ const struct address_space_operations ntfs_compressed_aops = { .readpage = ntfs_readpage, #ifdef NTFS_RW .writepage = ntfs_writepage, - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, @@ -1683,9 +1684,7 @@ const struct address_space_operations ntfs_mst_aops = { .readpage = ntfs_readpage, /* Fill page with data. */ #ifdef NTFS_RW .writepage = ntfs_writepage, /* Write dirty page to disk. */ - .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty - without touching the buffers - belonging to the page. */ + .dirty_folio = filemap_dirty_folio, #endif /* NTFS_RW */ .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, @@ -1747,7 +1746,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); spin_unlock(&mapping->private_lock); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { bh = buffers_to_free->b_this_page; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca..efe0602b4e51 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) ntfs_inode *ni; ntfs_debug("Entering."); - ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS); + ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); if (likely(ni != NULL)) { ni->state = 0; return VFS_I(ni); @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 4de9acb16968..3de5700a9b83 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, return err; } -static inline struct bio *ntfs_alloc_bio(u32 nr_vecs) -{ - struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); - - if (!bio && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); - } - return bio; -} - /* * ntfs_bio_pages - Read/write pages from/to disk. */ @@ -1496,19 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; new_bio: - new = ntfs_alloc_bio(nr_pages - page_idx); - if (!new) { - err = -ENOMEM; - goto out; - } + new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = lbo >> 9; - bio->bi_opf = op; while (len) { off = vbo & (PAGE_SIZE - 1); @@ -1599,18 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) lbo = (u64)lcn << cluster_bits; len = (u64)clen << cluster_bits; new_bio: - new = ntfs_alloc_bio(BIO_MAX_VECS); - if (!new) { - err = -ENOMEM; - break; - } + new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE; bio->bi_iter.bi_sector = lbo >> 9; for (;;) { @@ -1626,11 +1603,10 @@ new_bio: } } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen)); - if (bio) { - if (!err) - err = submit_bio_wait(bio); - bio_put(bio); - } + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + blk_finish_plug(&plug); out: unlock_page(fill); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index a87ab3ad3cd3..9eab11e3b034 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -1950,7 +1950,7 @@ const struct address_space_operations ntfs_aops = { .write_end = ntfs_write_end, .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, }; const struct address_space_operations ntfs_aops_cmpr = { diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 29813200c7af..278dcf502410 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep; static struct inode *ntfs_alloc_inode(struct super_block *sb) { - struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS); + struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS); if (!ni) return NULL; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bf9357123bc5..49f41074baad 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5981,7 +5981,7 @@ bail: return status; } -/* Expects you to already be holding tl_inode->i_mutex */ +/* Expects you to already be holding tl_inode->i_rwsem */ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 498da317580a..4b9af65cb61b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, down_write(&oi->ip_alloc_sem); - /* Delete orphan before acquire i_mutex. */ + /* Delete orphan before acquire i_rwsem. */ if (dwc->dw_orphaned) { BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); @@ -2453,7 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations ocfs2_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, .readpage = ocfs2_readpage, .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, @@ -2461,7 +2461,7 @@ const struct address_space_operations ocfs2_aops = { .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, .direct_IO = ocfs2_direct_IO, - .invalidatepage = block_invalidatepage, + .invalidate_folio = block_invalidate_folio, .releasepage = ocfs2_releasepage, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a17be1618bf7..ea0e70c0fce0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... */ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - bio_set_op_attrs(bio, op, op_flags); vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 625c92521416..27fee68f860a 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - /* this runs under the parent dir's i_mutex; there can be only + /* this runs under the parent dir's i_rwsem; there can be only * one caller in here at a time */ if (o2nm_single_cluster) return ERR_PTR(-ENOSPC); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f2cc1ff29e6d..81c3d65d68fe 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1957,7 +1957,7 @@ bail_nolock: } /* - * NOTE: this should always be called with parent dir i_mutex taken. + * NOTE: this should always be called with parent dir i_rwsem taken. */ int ocfs2_find_files_on_disk(const char *name, int namelen, @@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name * - * Callers should have i_mutex + a cluster lock on dir + * Callers should have i_rwsem + a cluster lock on dir */ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index fa0a14f199eb..e360543ad7e7 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -280,7 +280,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) { struct dlmfs_inode_private *ip; - ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS); if (!ip) return NULL; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fc5f780fa235..01b7407a8893 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode, /* * Don't use ocfs2_mark_inode_dirty() here as we don't always - * have i_mutex to guard against concurrent changes to other + * have i_rwsem to guard against concurrent changes to other * inode fields. */ inode->i_atime = current_time(inode); @@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, @@ -1068,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode, /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on - * i_mutex to block other extend/truncate calls while we're + * i_rwsem to block other extend/truncate calls while we're * here. We even have to hold it for sparse files because there * might be some tail zeroing. */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c2411c2afcf..5739dc301569 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -713,7 +713,7 @@ bail: /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir - * i_mutex held, we'll deadlock here. Instead we detect this + * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5f6bacbeef6b..c4426d12a2ad 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -606,7 +606,7 @@ out: /* * make sure we've got at least bits_wanted contiguous bits in the - * local alloc. You lose them when you drop i_mutex. + * local alloc. You lose them when you drop i_rwsem. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows. @@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * We must double check state and allocator bits because - * another process may have changed them while holding i_mutex. + * another process may have changed them while holding i_rwsem. */ spin_lock(&osb->osb_lock); if (!ocfs2_la_state_enabled(osb) || @@ -1029,7 +1029,7 @@ enum ocfs2_la_event { /* * Given an event, calculate the size of our next local alloc window. * - * This should always be called under i_mutex of the local alloc inode + * This should always be called under i_rwsem of the local alloc inode * so that local alloc disabling doesn't race with processes trying to * use the allocator. * diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea..c75fd54b9185 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -476,7 +476,7 @@ leave: ocfs2_free_alloc_context(meta_ac); /* - * We should call iput after the i_mutex of the bitmap been + * We should call iput after the i_rwsem of the bitmap been * unlocked in ocfs2_free_alloc_context, or the * ocfs2_delete_inode will mutex_lock again. */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bb62cc2e0211..337527571461 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -355,7 +355,7 @@ struct ocfs2_super struct delayed_work la_enable_wq; /* - * Must hold local alloc i_mutex and osb->osb_lock to change + * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; @@ -430,7 +430,7 @@ struct ocfs2_super atomic_t osb_tl_disable; /* * How many clusters in our truncate log. - * It must be protected by osb_tl_inode->i_mutex. + * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index f033de733adb..0b6f551a342a 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -36,7 +36,7 @@ * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected * by dqio_sem or dquot->dq_lock. - * - any modification of global quota file holds inode cluster lock, i_mutex, + * - any modification of global quota file holds inode cluster lock, i_rwsem, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. * - an allocation of new blocks for local quota file is protected by @@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) /* Read information header from global quota file */ int ocfs2_global_read_info(struct super_block *sb, int type) { - struct inode *gqinode = NULL; unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; @@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type) u64 pcount; int status; + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + /* Read global header */ - gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], OCFS2_INVALID_SLOT); - if (!gqinode) { + if (!oinfo->dqi_gqinode) { mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", type); status = -EINVAL; goto out_err; } - oinfo->dqi_gi.dqi_sb = sb; - oinfo->dqi_gi.dqi_type = type; - oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); - oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; - oinfo->dqi_gqi_bh = NULL; - oinfo->dqi_gqi_count = 0; - oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); if (status < 0) { mlog_errno(status); goto out_err; } - status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk, &pcount, NULL); if (status < 0) goto out_unlock; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 0e4b16d4c037..b1a8b046f4c2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); - oinfo->dqi_gqinode = NULL; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 85a47621e0c0..a75e2b7d67f5 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2772dec9dcea..477cdf94122e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -548,7 +548,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) { struct ocfs2_inode_info *oi; - oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS); if (!oi) return NULL; @@ -1105,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - root = d_make_root(inode); - if (!root) { - status = -ENOMEM; - mlog_errno(status); - goto read_super_error; - } - - sb->s_root = root; - - ocfs2_complete_mount_recovery(osb); - osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); if (!osb->osb_dev_kset) { @@ -1133,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + root = d_make_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c..95d0611c5fc7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7205,7 +7205,7 @@ out: * Used for reflink a non-preserve-security file. * * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. + * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 89725b15a64b..3f297b541713 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -372,7 +372,8 @@ const struct inode_operations omfs_file_inops = { }; const struct address_space_operations omfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = omfs_readpage, .readahead = omfs_readahead, .writepage = omfs_writepage, diff --git a/fs/open.c b/fs/open.c index 9ff2f621b760..1315253e0247 100644 --- a/fs/open.c +++ b/fs/open.c @@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; - f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index f825176ff4ed..f0b7f4d51a17 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -335,7 +335,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) { struct op_inode_info *oi; - oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); + oi = alloc_inode_sb(sb, op_inode_cachep, GFP_KERNEL); if (!oi) return NULL; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index e5e3e500ed46..79c1025d18ea 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -46,7 +46,7 @@ static int orangefs_writepage_locked(struct page *page, else wlen = PAGE_SIZE; } - /* Should've been handled in orangefs_invalidatepage. */ + /* Should've been handled in orangefs_invalidate_folio. */ WARN_ON(off == len || off + wlen > len); bv.bv_page = page; @@ -243,7 +243,7 @@ static int orangefs_writepages(struct address_space *mapping, return ret; } -static int orangefs_launder_page(struct page *); +static int orangefs_launder_folio(struct folio *); static void orangefs_readahead(struct readahead_control *rac) { @@ -290,14 +290,15 @@ static void orangefs_readahead(struct readahead_control *rac) static int orangefs_readpage(struct file *file, struct page *page) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct iov_iter iter; struct bio_vec bv; ssize_t ret; loff_t off; /* offset into this page */ - if (PageDirty(page)) - orangefs_launder_page(page); + if (folio_test_dirty(folio)) + orangefs_launder_folio(folio); off = page_offset(page); bv.bv_page = page; @@ -330,6 +331,7 @@ static int orangefs_write_begin(struct file *file, void **fsdata) { struct orangefs_write_range *wr; + struct folio *folio; struct page *page; pgoff_t index; int ret; @@ -341,27 +343,28 @@ static int orangefs_write_begin(struct file *file, return -ENOMEM; *pagep = page; + folio = page_folio(page); - if (PageDirty(page) && !PagePrivate(page)) { + if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* * Should be impossible. If it happens, launder the page * since we don't know what's dirty. This will WARN in * orangefs_writepage_locked. */ - ret = orangefs_launder_page(page); + ret = orangefs_launder_folio(folio); if (ret) return ret; } - if (PagePrivate(page)) { + if (folio_test_private(folio)) { struct orangefs_write_range *wr; - wr = (struct orangefs_write_range *)page_private(page); + wr = folio_get_private(folio); if (wr->pos + wr->len == pos && uid_eq(wr->uid, current_fsuid()) && gid_eq(wr->gid, current_fsgid())) { wr->len += len; goto okay; } else { - ret = orangefs_launder_page(page); + ret = orangefs_launder_folio(folio); if (ret) return ret; } @@ -375,7 +378,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - attach_page_private(page, wr); + folio_attach_private(folio, wr); okay: return 0; } @@ -415,47 +418,45 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, return copied; } -static void orangefs_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +static void orangefs_invalidate_folio(struct folio *folio, + size_t offset, size_t length) { - struct orangefs_write_range *wr; - wr = (struct orangefs_write_range *)page_private(page); + struct orangefs_write_range *wr = folio_get_private(folio); if (offset == 0 && length == PAGE_SIZE) { - kfree(detach_page_private(page)); + kfree(folio_detach_private(folio)); return; /* write range entirely within invalidate range (or equal) */ - } else if (page_offset(page) + offset <= wr->pos && - wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree(detach_page_private(page)); + } else if (folio_pos(folio) + offset <= wr->pos && + wr->pos + wr->len <= folio_pos(folio) + offset + length) { + kfree(folio_detach_private(folio)); /* XXX is this right? only caller in fs */ - cancel_dirty_page(page); + folio_cancel_dirty(folio); return; /* invalidate range chops off end of write range */ - } else if (wr->pos < page_offset(page) + offset && - wr->pos + wr->len <= page_offset(page) + offset + length && - page_offset(page) + offset < wr->pos + wr->len) { + } else if (wr->pos < folio_pos(folio) + offset && + wr->pos + wr->len <= folio_pos(folio) + offset + length && + folio_pos(folio) + offset < wr->pos + wr->len) { size_t x; - x = wr->pos + wr->len - (page_offset(page) + offset); + x = wr->pos + wr->len - (folio_pos(folio) + offset); WARN_ON(x > wr->len); wr->len -= x; wr->uid = current_fsuid(); wr->gid = current_fsgid(); /* invalidate range chops off beginning of write range */ - } else if (page_offset(page) + offset <= wr->pos && - page_offset(page) + offset + length < wr->pos + wr->len && - wr->pos < page_offset(page) + offset + length) { + } else if (folio_pos(folio) + offset <= wr->pos && + folio_pos(folio) + offset + length < wr->pos + wr->len && + wr->pos < folio_pos(folio) + offset + length) { size_t x; - x = page_offset(page) + offset + length - wr->pos; + x = folio_pos(folio) + offset + length - wr->pos; WARN_ON(x > wr->len); wr->pos += x; wr->len -= x; wr->uid = current_fsuid(); wr->gid = current_fsgid(); /* invalidate range entirely within write range (punch hole) */ - } else if (wr->pos < page_offset(page) + offset && - page_offset(page) + offset + length < wr->pos + wr->len) { + } else if (wr->pos < folio_pos(folio) + offset && + folio_pos(folio) + offset + length < wr->pos + wr->len) { /* XXX what do we do here... should not WARN_ON */ WARN_ON(1); /* punch hole */ @@ -467,11 +468,11 @@ static void orangefs_invalidatepage(struct page *page, /* non-overlapping ranges */ } else { /* WARN if they do overlap */ - if (!((page_offset(page) + offset + length <= wr->pos) ^ - (wr->pos + wr->len <= page_offset(page) + offset))) { + if (!((folio_pos(folio) + offset + length <= wr->pos) ^ + (wr->pos + wr->len <= folio_pos(folio) + offset))) { WARN_ON(1); - printk("invalidate range offset %llu length %u\n", - page_offset(page) + offset, length); + printk("invalidate range offset %llu length %zu\n", + folio_pos(folio) + offset, length); printk("write range offset %llu length %zu\n", wr->pos, wr->len); } @@ -483,7 +484,7 @@ static void orangefs_invalidatepage(struct page *page, * Thus the following runs if wr was modified above. */ - orangefs_launder_page(page); + orangefs_launder_folio(folio); } static int orangefs_releasepage(struct page *page, gfp_t foo) @@ -496,17 +497,17 @@ static void orangefs_freepage(struct page *page) kfree(detach_page_private(page)); } -static int orangefs_launder_page(struct page *page) +static int orangefs_launder_folio(struct folio *folio) { int r = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, }; - wait_on_page_writeback(page); - if (clear_page_dirty_for_io(page)) { - r = orangefs_writepage_locked(page, &wbc); - end_page_writeback(page); + folio_wait_writeback(folio); + if (folio_clear_dirty_for_io(folio)) { + r = orangefs_writepage_locked(&folio->page, &wbc); + folio_end_writeback(folio); } return r; } @@ -633,19 +634,19 @@ static const struct address_space_operations orangefs_address_operations = { .readahead = orangefs_readahead, .readpage = orangefs_readpage, .writepages = orangefs_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .write_begin = orangefs_write_begin, .write_end = orangefs_write_end, - .invalidatepage = orangefs_invalidatepage, + .invalidate_folio = orangefs_invalidate_folio, .releasepage = orangefs_releasepage, .freepage = orangefs_freepage, - .launder_page = orangefs_launder_page, + .launder_folio = orangefs_launder_folio, .direct_IO = orangefs_direct_IO, }; vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); unsigned long *bitlock = &orangefs_inode->bitlock; @@ -659,27 +660,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) goto out; } - lock_page(page); - if (PageDirty(page) && !PagePrivate(page)) { + folio_lock(folio); + if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* - * Should be impossible. If it happens, launder the page + * Should be impossible. If it happens, launder the folio * since we don't know what's dirty. This will WARN in * orangefs_writepage_locked. */ - if (orangefs_launder_page(page)) { + if (orangefs_launder_folio(folio)) { ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } } - if (PagePrivate(page)) { - wr = (struct orangefs_write_range *)page_private(page); + if (folio_test_private(folio)) { + wr = folio_get_private(folio); if (uid_eq(wr->uid, current_fsuid()) && gid_eq(wr->gid, current_fsgid())) { - wr->pos = page_offset(page); + wr->pos = page_offset(vmf->page); wr->len = PAGE_SIZE; goto okay; } else { - if (orangefs_launder_page(page)) { + if (orangefs_launder_folio(folio)) { ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } @@ -690,27 +691,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } - wr->pos = page_offset(page); + wr->pos = page_offset(vmf->page); wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - attach_page_private(page, wr); + folio_attach_private(folio, wr); okay: file_update_time(vmf->vma->vm_file); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE; goto out; } /* - * We mark the page dirty already here so that when freeze is in + * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will - * see the dirty page and writeprotect it again. + * see the dirty folio and writeprotect it again. */ - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; out: sb_end_pagefault(inode->i_sb); diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index d90d8addbfc2..5254256a224d 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -107,7 +107,7 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) { struct orangefs_inode_s *orangefs_inode; - orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL); + orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); if (!orangefs_inode) return NULL; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7bb0a47cb615..001cdbb8f015 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -174,7 +174,7 @@ static struct kmem_cache *ovl_inode_cachep; static struct inode *ovl_alloc_inode(struct super_block *sb) { - struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL); + struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL); if (!oi) return NULL; diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b6..e140ea150bbb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; @@ -606,7 +607,7 @@ out: static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, head, tail, mask; + unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: @@ -828,13 +829,11 @@ out_free_uid: void free_pipe_info(struct pipe_inode_info *pipe) { - int i; + unsigned int i; #ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); - put_watch_queue(pipe->watch_queue); - } #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); @@ -844,6 +843,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (buf->ops) pipe_buf_release(pipe, buf); } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + put_watch_queue(pipe->watch_queue); +#endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 80acb6885cf9..962d32468eb4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -759,9 +759,14 @@ static void posix_acl_fix_xattr_userns( } void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, @@ -769,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, } void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, diff --git a/fs/proc/array.c b/fs/proc/array.c index fd8b0c12b2cb..eb815759842c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -88,7 +88,6 @@ #include <linux/pid_namespace.h> #include <linux/prctl.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index d654ce7150fd..c1031843cc6a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,7 +74,6 @@ #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> @@ -1764,25 +1763,25 @@ out: static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)__get_free_page(GFP_KERNEL); + char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; - pathname = d_path(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; - len = tmp + PAGE_SIZE - 1 - pathname; + len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: - free_page((unsigned long)tmp); + kfree(tmp); return len; } diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 6d8d4bf20837..2e244ada1f97 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -32,6 +32,8 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); + if (!key) + return -ENOMEM; xbc_for_each_key_value(leaf, val) { ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f84355c5a36d..73aeb4e6d32e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -66,7 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; - ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde..a2873a617ae8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -10,6 +10,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memremap.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6e97ed775074..f46060eb91b5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - const char *anon_name; + struct anon_vma_name *anon_name; if (!mm) { name = "[vdso]"; @@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = vma_anon_name(vma); + anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name); + seq_printf(m, "[anon:%s]", anon_name->name); } } @@ -1597,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped - * Bits 57-60 zero + * Bit 57 pte is uffd-wp write-protected + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 702754dd1daf..6f1b8ddc6f7a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DECLARE_RWSEM(vmcore_cb_rwsem); +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ @@ -70,8 +71,8 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); - list_del(&cb->next); + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is * very unusual (e.g., forced driver removal), but we cannot stop @@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn) struct vmcore_cb *cb; bool ret = true; - lockdep_assert_held_read(&vmcore_cb_rwsem); - - list_for_each_entry(cb, &vmcore_cb_list, next) { + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { if (unlikely(!cb->pfn_is_ram)) continue; ret = cb->pfn_is_ram(cb, pfn); @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - down_read(&vmcore_cb_rwsem); + spin_lock(&vmcore_cb_lock); vmcore_opened = true; - up_read(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); return 0; } @@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, unsigned long pfn, offset; size_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset, userbuf); } if (tmp < 0) { - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return tmp; } @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count, ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); - up_read(&vmcore_cb_rwsem); return read; } @@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = { /** * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @sizez: size of buffer + * @size: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). @@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - int ret; + int ret, idx; /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. */ - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return ret; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f243cb5e6a4f..e26162f102ff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -143,21 +143,22 @@ static void pstore_timer_kick(void) mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } -/* - * Should pstore_dump() wait for a concurrent pstore_dump()? If - * not, the current pstore_dump() will report a failure to dump - * and return. - */ -static bool pstore_cannot_wait(enum kmsg_dump_reason reason) +static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) { - /* In NMI path, pstore shouldn't block regardless of reason. */ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked. */ + /* + * Emergency restart shouldn't be blocked by spinning on + * pstore_info::buf_lock. + */ case KMSG_DUMP_EMERG: return true; default: @@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; + unsigned long flags = 0; int ret; why = kmsg_dump_reason_str(reason); - if (down_trylock(&psinfo->buf_lock)) { - /* Failed to acquire lock: give up if we cannot wait. */ - if (pstore_cannot_wait(reason)) { - pr_err("dump skipped in %s path: may corrupt error record\n", - in_nmi() ? "NMI" : why); - return; - } - if (down_interruptible(&psinfo->buf_lock)) { - pr_err("could not grab semaphore?!\n"); + if (pstore_cannot_block_path(reason)) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) { + pr_err("dump skipped in %s path because of concurrent dump\n", + in_nmi() ? "NMI" : why); return; } + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); } kmsg_dump_rewind(&iter); @@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - - up(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); - sema_init(&psinfo->buf_lock, 1); + spin_lock_init(&psinfo->buf_lock); if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index fe5305028c6e..a89e33719fcf 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, if (prz->corrected_bytes || prz->bad_blocks) ret = snprintf(str, len, "" - "\n%d Corrected bytes, %d unrecoverable blocks\n", + "\nECC: %d Corrected bytes, %d unrecoverable blocks\n", prz->corrected_bytes, prz->bad_blocks); else - ret = snprintf(str, len, "\nNo errors detected\n"); + ret = snprintf(str, len, "\nECC: No errors detected\n"); return ret; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3fb7fc819b4f..a635bb6615e9 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -338,7 +338,7 @@ static struct kmem_cache *qnx4_inode_cachep; static struct inode *qnx4_alloc_inode(struct super_block *sb) { struct qnx4_inode_info *ei; - ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 61191f7bdf62..9d8e7e9788a1 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -597,7 +597,7 @@ static struct kmem_cache *qnx6_inode_cachep; static struct inode *qnx6_alloc_inode(struct super_block *sb) { struct qnx6_inode_info *ei; - ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/read_write.c b/fs/read_write.c index 0074afa7ecb3..e643aec2b0ef 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -385,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } +EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { @@ -1617,35 +1618,41 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +/* Like generic_write_checks(), but takes size of write instead of iter. */ +int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - loff_t count; - int ret; if (IS_SWAPFILE(inode)) return -ETXTBSY; - if (!iov_iter_count(from)) + if (!*count) return 0; - /* FIXME: this is for backwards compatibility with 2.4 */ if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - count = iov_iter_count(from); - ret = generic_write_check_limits(file, iocb->ki_pos, &count); + return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); +} +EXPORT_SYMBOL(generic_write_checks_count); + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + loff_t count = iov_iter_count(from); + int ret; + + ret = generic_write_checks_count(iocb, &count); if (ret) return ret; diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 8fd54ed8f844..33c8b0dd07a2 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,10 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS - tristate "Reiserfs support" + tristate "Reiserfs support (deprecated)" select CRC32 help - Stores not just filenames but the files themselves in a balanced - tree. Uses journalling. + Reiserfs is deprecated and scheduled to be removed from the kernel + in 2025. If you are still using it, please migrate to another + filesystem or tell us your usecase for reiserfs. + + Reiserfs stores not just filenames but the files themselves in a + balanced tree. Uses journalling. Balanced trees are more efficient than traditional file system architectural foundations. diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f49b72ccac4c..36c59b25486c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2763,13 +2763,6 @@ static int reiserfs_write_begin(struct file *file, int old_ref = 0; inode = mapping->host; - *fsdata = NULL; - if (flags & AOP_FLAG_CONT_EXPAND && - (pos & (inode->i_sb->s_blocksize - 1)) == 0) { - pos ++; - *fsdata = (void *)(unsigned long)flags; - } - index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -2896,9 +2889,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, unsigned start; bool locked = false; - if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) - pos ++; - reiserfs_wait_on_write_block(inode->i_sb); if (reiserfs_transaction_running(inode->i_sb)) th = current->journal_info; @@ -3094,7 +3084,7 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) * decide if this buffer needs to stay around for data logging or ordered * write purposes */ -static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) +static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh) { int ret = 1; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); @@ -3147,26 +3137,26 @@ free_jh: return ret; } -/* clm -- taken from fs/buffer.c:block_invalidate_page */ -static void reiserfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +/* clm -- taken from fs/buffer.c:block_invalidate_folio */ +static void reiserfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct buffer_head *head, *bh, *next; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned int curr_off = 0; unsigned int stop = offset + length; - int partial_page = (offset || length < PAGE_SIZE); + int partial_page = (offset || length < folio_size(folio)); int ret = 1; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); if (!partial_page) - ClearPageChecked(page); + folio_clear_checked(folio); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) goto out; - head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; @@ -3179,7 +3169,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset, * is this block fully invalidated? */ if (offset <= curr_off) { - if (invalidatepage_can_drop(inode, bh)) + if (invalidate_folio_can_drop(inode, bh)) reiserfs_unmap_buffer(bh); else ret = 0; @@ -3194,21 +3184,21 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset, * so real IO is not possible anymore. */ if (!partial_page && ret) { - ret = try_to_release_page(page, 0); + ret = filemap_release_folio(folio, 0); /* maybe should BUG_ON(!ret); - neilb */ } out: return; } -static int reiserfs_set_page_dirty(struct page *page) +static bool reiserfs_dirty_folio(struct address_space *mapping, + struct folio *folio) { - struct inode *inode = page->mapping->host; - if (reiserfs_file_data_log(inode)) { - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + if (reiserfs_file_data_log(mapping->host)) { + folio_set_checked(folio); + return filemap_dirty_folio(mapping, folio); } - return __set_page_dirty_buffers(page); + return block_dirty_folio(mapping, folio); } /* @@ -3316,7 +3306,11 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* fill in hole pointers in the expanding truncate case. */ if (attr->ia_size > inode->i_size) { - error = generic_cont_expand_simple(inode, attr->ia_size); + loff_t pos = attr->ia_size; + + if ((pos & (inode->i_sb->s_blocksize - 1)) == 0) + pos++; + error = generic_cont_expand_simple(inode, pos); if (REISERFS_I(inode)->i_prealloc_count > 0) { int err; struct reiserfs_transaction_handle th; @@ -3430,10 +3424,10 @@ const struct address_space_operations reiserfs_address_space_operations = { .readpage = reiserfs_readpage, .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, - .invalidatepage = reiserfs_invalidatepage, + .invalidate_folio = reiserfs_invalidate_folio, .write_begin = reiserfs_write_begin, .write_end = reiserfs_write_end, .bmap = reiserfs_aop_bmap, .direct_IO = reiserfs_direct_IO, - .set_page_dirty = reiserfs_set_page_dirty, + .dirty_folio = reiserfs_dirty_folio, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a3e21160b634..b5b6f6201bed 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -858,8 +858,8 @@ loop_next: ret = -EIO; } /* - * ugly interaction with invalidatepage here. - * reiserfs_invalidate_page will pin any buffer that has a + * ugly interaction with invalidate_folio here. + * reiserfs_invalidate_folio will pin any buffer that has a * valid journal head from an older transaction. If someone * else sets our buffer dirty after we write it in the first * loop, and then someone truncates the page away, nobody diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 82e09901462e..cfb7c44c7366 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -639,7 +639,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); @@ -1652,6 +1652,8 @@ static int read_super_block(struct super_block *s, int offset) return 1; } + reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and " + "scheduled to be removed from the kernel in 2025"); SB_BUFFER_WITH_SB(s) = bh; SB_DISK_SUPER_BLOCK(s) = rs; diff --git a/fs/remap_range.c b/fs/remap_range.c index 231159682907..e112b5424cdb 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -146,11 +146,11 @@ static int generic_remap_check_len(struct inode *inode_in, } /* Read a page's worth of file data into the page cache. */ -static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos) +static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) { struct folio *folio; - folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); + folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); if (IS_ERR(folio)) return folio; if (!folio_test_uptodate(folio)) { @@ -187,8 +187,8 @@ static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) * Compare extents of two files to see if they are the same. * Caller must have locked both inodes to prevent write races. */ -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t dstoff, +static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff, + struct file *dest, loff_t dstoff, loff_t len, bool *is_same) { bool same = true; @@ -224,8 +224,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, * someone is invalidating pages on us and we lose. */ if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || - src_folio->mapping != src->i_mapping || - dst_folio->mapping != dest->i_mapping) { + src_folio->mapping != src->f_mapping || + dst_folio->mapping != dest->f_mapping) { same = false; goto unlock; } @@ -333,8 +333,8 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); if (ret) return ret; if (!is_same) @@ -362,11 +362,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); - /* - * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on - * the same mount. Practically, they only need to be on the same file - * system. - */ if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; @@ -458,7 +453,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EXDEV; - if (src_file->f_path.mnt != dst_file->f_path.mnt) + if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb) goto out_drop_write; ret = -EISDIR; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 259f684d9236..9e6bbb4219de 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -375,7 +375,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; - inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } diff --git a/fs/seq_file.c b/fs/seq_file.c index f8e1f4ee87ff..7ab8a58c29b6 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -554,9 +554,9 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) } EXPORT_SYMBOL(seq_dentry); -static void *single_start(struct seq_file *p, loff_t *pos) +void *single_start(struct seq_file *p, loff_t *pos) { - return NULL + (*pos == 0); + return *pos ? NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 38b8fc514860..0507aecfc669 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -61,6 +61,40 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* + * Size of the session key (crypto key encrypted with the password + */ +#define SMB2_NTLMV2_SESSKEY_SIZE 16 +#define SMB2_SIGNATURE_SIZE 16 +#define SMB2_HMACSHA256_SIZE 32 +#define SMB2_CMACAES_SIZE 16 +#define SMB3_GCM128_CRYPTKEY_SIZE 16 +#define SMB3_GCM256_CRYPTKEY_SIZE 32 + +/* + * Size of the smb3 encryption/decryption keys + * This size is big enough to store any cipher key types. + */ +#define SMB3_ENC_DEC_KEY_SIZE 32 + +/* + * Size of the smb3 signing key + */ +#define SMB3_SIGN_KEY_SIZE 16 + +#define CIFS_CLIENT_CHALLENGE_SIZE 8 + +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + +/* + * The default wsize is 1M for SMB2 (and for some CIFS cases). + * find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_SIZE == 4k, this means we can + * fill a single wsize request with a single call. + */ +#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) + +/* * SMB2 Header Definition * * "MBZ" : Must be Zero @@ -88,6 +122,15 @@ #define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) #define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ +/* + * Definitions for SMB2 Protocol Data Units (network frames) + * + * See MS-SMB2.PDF specification for protocol details. + * The Naming convention is the lower case version of the SMB2 + * command code name for the struct. Note that structures must be packed. + * + */ + /* See MS-SMB2 section 2.2.1 */ struct smb2_hdr { __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ @@ -115,6 +158,18 @@ struct smb2_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; +#define SMB2_ERROR_STRUCTURE_SIZE2 9 +#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2) + +struct smb2_err_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __u8 ErrorContextCount; + __u8 Reserved; + __le32 ByteCount; /* even if zero, at least one byte follows */ + __u8 ErrorData[1]; /* variable length */ +} __packed; + #define SMB3_AES_CCM_NONCE 11 #define SMB3_AES_GCM_NONCE 12 @@ -608,8 +663,8 @@ struct smb2_close_req { __le16 StructureSize; /* Must be 24 */ __le16 Flags; __le32 Reserved; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ } __packed; /* @@ -653,8 +708,8 @@ struct smb2_read_req { __u8 Flags; /* MBZ unless SMB3.02 or later */ __le32 Length; __le64 Offset; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; __le32 MinimumCount; __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; @@ -692,8 +747,8 @@ struct smb2_write_req { __le16 DataOffset; /* offset from start of SMB2 header to write data */ __le32 Length; __le64 Offset; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ __le32 Channel; /* MBZ unless SMB3.02 or later */ __le32 RemainingBytes; __le16 WriteChannelInfoOffset; @@ -722,8 +777,8 @@ struct smb2_flush_req { __le16 StructureSize; /* Must be 24 */ __le16 Reserved1; __le32 Reserved2; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } __packed; struct smb2_flush_rsp { @@ -732,6 +787,123 @@ struct smb2_flush_rsp { __le16 Reserved; } __packed; +#define SMB2_LOCKFLAG_SHARED 0x0001 +#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002 +#define SMB2_LOCKFLAG_UNLOCK 0x0004 +#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 +#define SMB2_LOCKFLAG_MASK 0x0007 + +struct smb2_lock_element { + __le64 Offset; + __le64 Length; + __le32 Flags; + __le32 Reserved; +} __packed; + +struct smb2_lock_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 48 */ + __le16 LockCount; + /* + * The least significant four bits are the index, the other 28 bits are + * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 + */ + __le32 LockSequenceNumber; + __u64 PersistentFileId; + __u64 VolatileFileId; + /* Followed by at least one */ + struct smb2_lock_element locks[1]; +} __packed; + +struct smb2_lock_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_echo_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +struct smb2_echo_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +/* + * Valid FileInformation classes for query directory + * + * Note that these are a subset of the (file) QUERY_INFO levels defined + * later in this file (but since QUERY_DIRECTORY uses equivalent numbers + * we do not redefine them here) + * + * FileDirectoryInfomation 0x01 + * FileFullDirectoryInformation 0x02 + * FileIdFullDirectoryInformation 0x26 + * FileBothDirectoryInformation 0x03 + * FileIdBothDirectoryInformation 0x25 + * FileNamesInformation 0x0C + * FileIdExtdDirectoryInformation 0x3C + */ + +/* search (query_directory) Flags field */ +#define SMB2_RESTART_SCANS 0x01 +#define SMB2_RETURN_SINGLE_ENTRY 0x02 +#define SMB2_INDEX_SPECIFIED 0x04 +#define SMB2_REOPEN 0x10 + +struct smb2_query_directory_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 FileInformationClass; + __u8 Flags; + __le32 FileIndex; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le16 FileNameOffset; + __le16 FileNameLength; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_directory_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* + * Maximum number of iovs we need for a set-info request. + * The largest one is rename/hardlink + * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info + * [1] : path + * [2] : compound padding + */ +#define SMB2_SET_INFO_IOV_SIZE 3 + +struct smb2_set_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 BufferLength; + __le16 BufferOffset; + __u16 Reserved; + __le32 AdditionalInformation; + __u64 PersistentFileId; + __u64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 2 */ +} __packed; /* * SMB2_NOTIFY See MS-SMB2 section 2.2.35 @@ -769,8 +941,8 @@ struct smb2_change_notify_req { __le16 StructureSize; __le16 Flags; __le32 OutputBufferLength; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ __le32 CompletionFilter; __u32 Reserved; } __packed; @@ -978,12 +1150,455 @@ struct smb2_create_rsp { __le64 EndofFile; __le32 FileAttributes; __le32 Reserved2; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; __le32 CreateContextsOffset; __le32 CreateContextsLength; __u8 Buffer[1]; } __packed; +struct create_posix { + struct create_context ccontext; + __u8 Name[16]; + __le32 Mode; + __u32 Reserved; +} __packed; + +#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) + +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) + +#define SMB2_LEASE_KEY_SIZE 16 + +struct lease_context { + __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; +} __packed; + +struct lease_context_v2 { + __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE]; + __le16 Epoch; + __le16 Reserved; +} __packed; + +struct create_lease { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context lcontext; +} __packed; + +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + +/* See MS-SMB2 2.2.31 and 2.2.32 */ +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __le16 Reserved; /* offset from start of SMB2 header to write data */ + __le32 CtlCode; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[]; +} __packed; + +/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + +/* Reparse structures - see MS-FSCC 2.1.2 */ + +/* struct fsctl_reparse_info_req is empty, only response structs (see below) */ +struct reparse_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 DataBuffer[]; /* Variable Length */ +} __packed; + +struct reparse_guid_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 ReparseGuid[16]; + __u8 DataBuffer[]; /* Variable Length */ +} __packed; + +struct reparse_mount_point_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __u8 PathBuffer[]; /* Variable Length */ +} __packed; + +#define SYMLINK_FLAG_RELATIVE 0x00000001 + +struct reparse_symlink_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + __u8 PathBuffer[]; /* Variable Length */ +} __packed; + +/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ + +struct validate_negotiate_info_req { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ +} __packed; + +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + +/* Possible InfoType values */ +#define SMB2_O_INFO_FILE 0x01 +#define SMB2_O_INFO_FILESYSTEM 0x02 +#define SMB2_O_INFO_SECURITY 0x03 +#define SMB2_O_INFO_QUOTA 0x04 + +/* SMB2 Query Info see MS-SMB2 (2.2.37) or MS-DTYP */ + +/* List of QUERY INFO levels (those also valid for QUERY_DIR are noted below */ +#define FILE_DIRECTORY_INFORMATION 1 /* also for QUERY_DIR */ +#define FILE_FULL_DIRECTORY_INFORMATION 2 /* also for QUERY_DIR */ +#define FILE_BOTH_DIRECTORY_INFORMATION 3 /* also for QUERY_DIR */ +#define FILE_BASIC_INFORMATION 4 +#define FILE_STANDARD_INFORMATION 5 +#define FILE_INTERNAL_INFORMATION 6 +#define FILE_EA_INFORMATION 7 +#define FILE_ACCESS_INFORMATION 8 +#define FILE_NAME_INFORMATION 9 +#define FILE_RENAME_INFORMATION 10 +#define FILE_LINK_INFORMATION 11 +#define FILE_NAMES_INFORMATION 12 /* also for QUERY_DIR */ +#define FILE_DISPOSITION_INFORMATION 13 +#define FILE_POSITION_INFORMATION 14 +#define FILE_FULL_EA_INFORMATION 15 +#define FILE_MODE_INFORMATION 16 +#define FILE_ALIGNMENT_INFORMATION 17 +#define FILE_ALL_INFORMATION 18 +#define FILE_ALLOCATION_INFORMATION 19 +#define FILE_END_OF_FILE_INFORMATION 20 +#define FILE_ALTERNATE_NAME_INFORMATION 21 +#define FILE_STREAM_INFORMATION 22 +#define FILE_PIPE_INFORMATION 23 +#define FILE_PIPE_LOCAL_INFORMATION 24 +#define FILE_PIPE_REMOTE_INFORMATION 25 +#define FILE_MAILSLOT_QUERY_INFORMATION 26 +#define FILE_MAILSLOT_SET_INFORMATION 27 +#define FILE_COMPRESSION_INFORMATION 28 +#define FILE_OBJECT_ID_INFORMATION 29 +/* Number 30 not defined in documents */ +#define FILE_MOVE_CLUSTER_INFORMATION 31 +#define FILE_QUOTA_INFORMATION 32 +#define FILE_REPARSE_POINT_INFORMATION 33 +#define FILE_NETWORK_OPEN_INFORMATION 34 +#define FILE_ATTRIBUTE_TAG_INFORMATION 35 +#define FILE_TRACKING_INFORMATION 36 +#define FILEID_BOTH_DIRECTORY_INFORMATION 37 /* also for QUERY_DIR */ +#define FILEID_FULL_DIRECTORY_INFORMATION 38 /* also for QUERY_DIR */ +#define FILE_VALID_DATA_LENGTH_INFORMATION 39 +#define FILE_SHORT_NAME_INFORMATION 40 +#define FILE_SFIO_RESERVE_INFORMATION 44 +#define FILE_SFIO_VOLUME_INFORMATION 45 +#define FILE_HARD_LINK_INFORMATION 46 +#define FILE_NORMALIZED_NAME_INFORMATION 48 +#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 +#define FILE_STANDARD_LINK_INFORMATION 54 +#define FILE_ID_INFORMATION 59 +#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */ +/* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */ +#define SMB_FIND_FILE_POSIX_INFO 0x064 + +/* Security info type additionalinfo flags. */ +#define OWNER_SECINFO 0x00000001 +#define GROUP_SECINFO 0x00000002 +#define DACL_SECINFO 0x00000004 +#define SACL_SECINFO 0x00000008 +#define LABEL_SECINFO 0x00000010 +#define ATTRIBUTE_SECINFO 0x00000020 +#define SCOPE_SECINFO 0x00000040 +#define BACKUP_SECINFO 0x00010000 +#define UNPROTECTED_SACL_SECINFO 0x10000000 +#define UNPROTECTED_DACL_SECINFO 0x20000000 +#define PROTECTED_SACL_SECINFO 0x40000000 +#define PROTECTED_DACL_SECINFO 0x80000000 + +/* Flags used for FileFullEAinfo */ +#define SL_RESTART_SCAN 0x00000001 +#define SL_RETURN_SINGLE_ENTRY 0x00000002 +#define SL_INDEX_SPECIFIED 0x00000004 + +struct smb2_query_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 41 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 OutputBufferLength; + __le16 InputBufferOffset; + __u16 Reserved; + __le32 InputBufferLength; + __le32 AdditionalInformation; + __le32 Flags; + __u64 PersistentFileId; + __u64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* + * PDU query infolevel structure definitions + */ + +struct file_allocated_range_buffer { + __le64 file_offset; + __le64 length; +} __packed; + +struct smb2_file_internal_info { + __le64 IndexNumber; +} __packed; /* level 6 Query */ + +struct smb2_file_rename_info { /* encoding of request for level 10 */ + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[]; /* New name to be assigned */ + /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ +} __packed; /* level 10 Set */ + +struct smb2_file_link_info { /* encoding of request for level 11 */ + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[]; /* Name to be assigned to new link */ +} __packed; /* level 11 Set */ + +/* + * This level 18, although with struct with same name is different from cifs + * level 0x107. Level 0x107 has an extra u64 between AccessFlags and + * CurrentByteOffset. + */ +struct smb2_file_all_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ + __le64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 18 Query */ + +struct smb2_file_eof_info { /* encoding of request for level 10 */ + __le64 EndOfFile; /* new end of file value */ +} __packed; /* level 20 Set */ + +/* Level 100 query info */ +struct smb311_posix_qinfo { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 DosAttributes; + __le64 Inode; + __le32 DeviceId; + __le32 Zero; + /* beginning of POSIX Create Context Response */ + __le32 HardLinks; + __le32 ReparseTag; + __le32 Mode; + u8 Sids[]; + /* + * var sized owner SID + * var sized group SID + * le32 filenamelength + * u8 filename[] + */ +} __packed; + +/* File System Information Classes */ +#define FS_VOLUME_INFORMATION 1 /* Query */ +#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_SIZE_INFORMATION 3 /* Query */ +#define FS_DEVICE_INFORMATION 4 /* Query */ +#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ +#define FS_CONTROL_INFORMATION 6 /* Query, Set */ +#define FS_FULL_SIZE_INFORMATION 7 /* Query */ +#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ +#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */ + +struct smb2_fs_full_size_info { + __le64 TotalAllocationUnits; + __le64 CallerAvailableAllocationUnits; + __le64 ActualAvailableAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; + +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FSEffPhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + +/* File System Control Information */ +struct smb2_fs_control_info { + __le64 FreeSpaceStartFiltering; + __le64 FreeSpaceThreshold; + __le64 FreeSpaceStopFiltering; + __le64 DefaultQuotaThreshold; + __le64 DefaultQuotaLimit; + __le32 FileSystemControlFlags; + __le32 Padding; +} __packed; + +/* volume info struct - see MS-FSCC 2.5.9 */ +#define MAX_VOL_LABEL_LEN 32 +struct smb3_fs_vol_info { + __le64 VolumeCreationTime; + __u32 VolumeSerialNumber; + __le32 VolumeLabelLength; /* includes trailing null */ + __u8 SupportsObjects; /* True if eg like NTFS, supports objects */ + __u8 Reserved; + __u8 VolumeLabel[]; /* variable len */ +} __packed; + +/* See MS-SMB2 2.2.23 through 2.2.25 */ +struct smb2_oplock_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) + +struct smb2_lease_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 44 */ + __le16 Epoch; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 CurrentLeaseState; + __le32 NewLeaseState; + __le32 BreakReason; + __le32 AccessMaskHint; + __le32 ShareMaskHint; +} __packed; + +struct smb2_lease_ack { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 LeaseState; + __le64 LeaseDuration; +} __packed; +#define OP_BREAK_STRUCT_SIZE_20 24 +#define OP_BREAK_STRUCT_SIZE_21 36 #endif /* _COMMON_SMB2PDU_H */ diff --git a/fs/splice.c b/fs/splice.c index 5dbce4dcc1a7..047b79db8eb5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -46,45 +46,45 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->page); struct address_space *mapping; - lock_page(page); + folio_lock(folio); - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (mapping) { - WARN_ON(!PageUptodate(page)); + WARN_ON(!folio_test_uptodate(folio)); /* * At least for ext2 with nobh option, we need to wait on - * writeback completing on this page, since we'll remove it + * writeback completing on this folio, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the - * page, allowing the disk blocks to be reused by someone else + * folio, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ - if (remove_mapping(mapping, page)) { + if (remove_mapping(mapping, folio)) { buf->flags |= PIPE_BUF_FLAG_LRU; return true; } } /* - * Raced with truncate or failed to remove page from current + * Raced with truncate or failed to remove folio from current * address space, unlock and return failure. */ out_unlock: - unlock_page(page); + folio_unlock(folio); return false; } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2db8bcf7ff85..622c844f6d11 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) - bio = bio_alloc(GFP_NOIO, page_count); - else + if (page_count <= BIO_MAX_VECS) { + bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); + } else { bio = bio_kmalloc(GFP_NOIO, page_count); + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = REQ_OP_READ; + } if (!bio) return -ENOMEM; - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = READ; bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index b1b556dbce12..4f74abbc1a54 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -584,7 +584,7 @@ static void __exit exit_squashfs_fs(void) static struct inode *squashfs_alloc_inode(struct super_block *sb) { struct squashfs_inode_info *ei = - kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL); return ei ? &ei->vfs_inode : NULL; } diff --git a/fs/stat.c b/fs/stat.c index 28d2020ba1f4..5c2c94464e8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -184,6 +184,20 @@ int vfs_fstat(int fd, struct kstat *stat) return error; } +int getname_statx_lookup_flags(int flags) +{ + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (!(flags & AT_NO_AUTOMOUNT)) + lookup_flags |= LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + return lookup_flags; +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -199,26 +213,19 @@ int vfs_fstat(int fd, struct kstat *stat) * * 0 will be returned on success, and a -ve error code if unsuccessful. */ -static int vfs_statx(int dfd, const char __user *filename, int flags, +static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; - unsigned lookup_flags = 0; + unsigned int lookup_flags = getname_statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | AT_STATX_SYNC_TYPE)) return -EINVAL; - if (!(flags & AT_SYMLINK_NOFOLLOW)) - lookup_flags |= LOOKUP_FOLLOW; - if (!(flags & AT_NO_AUTOMOUNT)) - lookup_flags |= LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - retry: - error = user_path_at(dfd, filename, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; @@ -240,8 +247,15 @@ out: int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); + int ret; + int statx_flags = flags | AT_NO_AUTOMOUNT; + struct filename *name; + + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); + ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); + putname(name); + + return ret; } #ifdef __ARCH_WANT_OLD_STAT @@ -334,9 +348,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -345,7 +356,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -353,7 +366,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -363,7 +376,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -602,7 +615,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } -int do_statx(int dfd, const char __user *filename, unsigned flags, +int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; @@ -636,7 +649,14 @@ SYSCALL_DEFINE5(statx, unsigned int, mask, struct statx __user *, buffer) { - return do_statx(dfd, filename, flags, mask, buffer); + int ret; + struct filename *name; + + name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); + ret = do_statx(dfd, name, flags, mask, buffer); + putname(name); + + return ret; } #ifdef CONFIG_COMPAT @@ -644,11 +664,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -658,7 +680,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 42dcf96881b6..a12ac0356c69 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -703,19 +703,6 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) ktype = get_ktype(kobj); if (ktype) { - struct attribute **kattr; - - /* - * Change owner of the default attributes associated with the - * ktype of @kobj. - */ - for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) { - error = sysfs_file_change_owner(kobj, (*kattr)->name, - kuid, kgid); - if (error) - return error; - } - /* * Change owner of the default groups associated with the * ktype of @kobj. diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e747c135c1d1..98467bb76737 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -103,7 +103,7 @@ int __init sysfs_init(void) if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); - sysfs_root_kn = sysfs_root->kn; + sysfs_root_kn = kernfs_root_to_node(sysfs_root); err = register_filesystem(&sysfs_fs_type); if (err) { diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index be47263b8605..9e8d4a6fb2f3 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -306,7 +306,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) { struct sysv_inode_info *si; - si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); + si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL); if (!si) return NULL; return &si->vfs_inode; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 749385015a8d..409ab5e17803 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -495,7 +495,8 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations sysv_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = sysv_readpage, .writepage = sysv_writepage, .write_begin = sysv_write_begin, diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bafc02bf8220..de7252715b12 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; - set_gid(tracefs_mount->mnt_root, gid); break; case Opt_mode: if (match_octal(&args[0], &option)) @@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb) inode->i_mode |= opts->mode; inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); return 0; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index dbe72f664abf..86151889548e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,20 +349,97 @@ out_budg: return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -392,42 +469,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -441,12 +506,6 @@ out_budg: return err; } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -660,32 +719,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -949,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* @@ -1264,17 +1298,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. + * + * wht_req: new whiteout inode for RENAME_WHITEOUT. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1331,20 +1367,44 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ + iput(whiteout); + goto out_release; + } + + /* Add the old_dentry size to the old_dir size. */ + old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1416,29 +1476,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; - iput(whiteout); - goto out_release; - } - - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. + */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1449,6 +1491,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1457,11 +1504,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1482,11 +1534,11 @@ out_cancel: inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5cfa28cd00cd..0383fbdc95ff 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1287,25 +1287,25 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } -static void ubifs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void ubifs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; - ubifs_assert(c, PagePrivate(page)); - if (offset || length < PAGE_SIZE) - /* Partial page remains dirty */ + ubifs_assert(c, folio_test_private(folio)); + if (offset || length < folio_size(folio)) + /* Partial folio remains dirty */ return; - if (PageChecked(page)) + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); - ClearPageChecked(page); + folio_detach_private(folio); + folio_clear_checked(folio); } int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -1445,18 +1445,18 @@ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } -static int ubifs_set_page_dirty(struct page *page) +static bool ubifs_dirty_folio(struct address_space *mapping, + struct folio *folio) { - int ret; - struct inode *inode = page->mapping->host; - struct ubifs_info *c = inode->i_sb->s_fs_info; + bool ret; + struct ubifs_info *c = mapping->host->i_sb->s_fs_info; - ret = __set_page_dirty_nobuffers(page); + ret = filemap_dirty_folio(mapping, folio); /* * An attempt to dirty a page without budgeting for it - should not * happen. */ - ubifs_assert(c, ret == 0); + ubifs_assert(c, ret == false); return ret; } @@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -1646,8 +1646,8 @@ const struct address_space_operations ubifs_file_address_operations = { .writepage = ubifs_writepage, .write_begin = ubifs_write_begin, .write_end = ubifs_write_end, - .invalidatepage = ubifs_invalidatepage, - .set_page_dirty = ubifs_set_page_dirty, + .invalidate_folio = ubifs_invalidate_folio, + .dirty_folio = ubifs_dirty_folio, #ifdef CONFIG_MIGRATION .migratepage = ubifs_migrate_page, #endif diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 789a7813f3fa..1607a3c76681 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n-1)<<c->max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index c6a863487780..71bcebe45f9c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 8ea680dba61e..75dab0ae3939 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ out_free: * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout doesn't have non-zero size, no need to update + * synced_i_size for whiteout_ui. + */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index aa7a1381c457..bad67455215f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -268,7 +268,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) { struct ubifs_inode *ui; - ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS); if (!ui) return NULL; diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 7acc5a74e5fa..06ad8fa1fcfb 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -42,6 +42,7 @@ static struct attribute *ubifs_attrs[] = { ATTR_LIST(errors_crc), NULL, }; +ATTRIBUTE_GROUPS(ubifs); static ssize_t ubifs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -74,7 +75,7 @@ static const struct sysfs_ops ubifs_attr_ops = { }; static struct kobj_type ubifs_sb_ktype = { - .default_attrs = ubifs_attrs, + .default_groups = ubifs_groups, .sysfs_ops = &ubifs_attr_ops, .release = ubifs_sb_release, }; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f55828c0a300..008fa46ef61e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -381,7 +381,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. If an diff --git a/fs/udf/file.c b/fs/udf/file.c index 1baff8ddb754..0f6bf2504437 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,7 +125,8 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin } const struct address_space_operations udf_adinicb_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ea8f6cd01f50..ca4fa710e562 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -235,7 +235,8 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations udf_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = udf_readpage, .readahead = udf_readahead, .writepage = udf_writepage, diff --git a/fs/udf/super.c b/fs/udf/super.c index f26b5e0b84b6..4042d9739fb7 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -136,7 +136,7 @@ static struct kmem_cache *udf_inode_cachep; static struct inode *udf_alloc_inode(struct super_block *sb) { struct udf_inode_info *ei; - ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -2474,7 +2474,6 @@ static unsigned int udf_count_free_table(struct super_block *sb, unsigned int accum = 0; uint32_t elen; struct kernel_lb_addr eloc; - int8_t etype; struct extent_position epos; mutex_lock(&UDF_SB(sb)->s_alloc_mutex); @@ -2482,7 +2481,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, epos.offset = sizeof(struct unallocSpaceEntry); epos.bh = NULL; - while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) + while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1) accum += (elen >> table->i_sb->s_blocksize_bits); brelse(epos.bh); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ac628de69601..d0dda01620f0 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -526,7 +526,8 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations ufs_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 00a01471ea05..23377c1baed9 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1443,7 +1443,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; - ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 0cc87423de82..0e51c0025a16 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -33,7 +33,7 @@ $(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE else $(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE - $(call if_changed,shipped) + $(call if_changed,copy) endif diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..aa0c47cb0d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; + + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) + address &= PAGE_MASK; msg.arg.pagefault.address = address; /* * These flags indicate why the userfault occurred: @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; @@ -878,7 +881,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) vma = prev; else @@ -1438,7 +1441,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; goto next; @@ -1615,7 +1618,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; goto next; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 864c2fad23be..d74e0d336995 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -354,7 +354,7 @@ out: const struct address_space_operations vboxsf_reg_aops = { .readpage = vboxsf_readpage, .writepage = vboxsf_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .write_begin = simple_write_begin, .write_end = vboxsf_write_end, }; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 37dd3fe5b1e9..d2f6df69f611 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -241,7 +241,7 @@ static struct inode *vboxsf_alloc_inode(struct super_block *sb) { struct vboxsf_inode *sf_i; - sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS); + sf_i = alloc_inode_sb(sb, vboxsf_inode_cachep, GFP_NOFS); if (!sf_i) return NULL; diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index aec2ebf7d25a..e1db0f3f7e5e 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -9,6 +9,7 @@ #include <linux/namei.h> #include <linux/nls.h> #include <linux/sizes.h> +#include <linux/pagemap.h> #include <linux/vfs.h> #include "vfsmod.h" diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0adb970f4e73..14e2fb49cff5 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readahead() * * Copyright 2019 Google LLC */ @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * that fail verification are set to the Error state. Verification is skipped * for pages already in the Error state, e.g. due to fscrypt decryption failure. * - * This is a helper function for use by the ->readpages() method of filesystems + * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that * populate the page cache without issuing bios (e.g. non block-based * filesystems) must instead call fsverity_verify_page() directly on each page. diff --git a/fs/xattr.c b/fs/xattr.c index 5c8c5175b385..998045165916 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -569,7 +569,8 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d, } if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size); + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), + kvalue, size); } error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); @@ -667,7 +668,8 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error); + posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), + kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 353e53b892e6..b52ed339727f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -82,6 +82,24 @@ xfs_prealloc_blocks( } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. + * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ if (xfs_has_rmapbt(mp)) - blocks++; /* rmap root block */ + blocks++; /* rmap root block */ if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1c14a0b1abea..d4c057b764f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f18a875f51c6..c1500b238520 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2818,7 +2818,7 @@ xfs_btree_split_worker( * in any way. */ if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); xfs_trans_set_context(args->cur->bc_tp); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 50546eadaae2..5f1e4799e8fa 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,7 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +const struct xfs_name xfs_name_dotdot = { + .name = (const unsigned char *)"..", + .len = 2, + .type = XFS_DIR3_FT_DIR, +}; /* * Convert inode mode to directory entry filetype @@ -54,10 +58,10 @@ xfs_mode_to_ftype( */ xfs_dahash_t xfs_ascii_ci_hashname( - struct xfs_name *name) + const struct xfs_name *name) { - xfs_dahash_t hash; - int i; + xfs_dahash_t hash; + int i; for (i = 0, hash = 0; i < name->len; i++) hash = tolower(name->name[i]) ^ rol32(hash, 7); @@ -243,7 +247,7 @@ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -337,16 +341,16 @@ xfs_dir_cilookup_result( int xfs_dir_lookup( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name, - xfs_ino_t *inum, /* out: inode number */ - struct xfs_name *ci_name) /* out: actual name if CI match */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - int lock_mode; + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); @@ -475,7 +479,7 @@ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, /* name of entry to replace */ + const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -728,7 +732,7 @@ xfs_dir2_namecheck( xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, - struct xfs_name *name) + const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d03e6098ded9..b6df3c34b26a 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -21,7 +21,7 @@ struct xfs_dir2_data_unused; struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; -extern struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dotdot; /* * Convert inode mode to directory entry filetype @@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum, + const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 711709a2aa53..7404a9ff1a92 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr { }; /* xfs_dir2.c */ -xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name); enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -201,7 +201,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, + const struct xfs_name *name); enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 1719e1c4da59..3590e10e3e62 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -24,7 +24,7 @@ struct xchk_xattr_buf { * space bitmap follows immediately after; and we have a third buffer * for storing intermediate bitmap results. */ - uint8_t buf[0]; + uint8_t buf[]; }; /* A place to store attribute values. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d6a67c7d227..90b7f4d127de 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -567,9 +567,9 @@ const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .releasepage = iomap_releasepage, - .invalidatepage = iomap_invalidatepage, + .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, @@ -581,7 +581,6 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, + .dirty_folio = noop_dirty_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 667e297f59b1..ae4345b37621 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } -static void -xfs_flush_bdev_async_endio( - struct bio *bio) -{ - complete(bio->bi_private); -} - -/* - * Submit a request for an async cache flush to run. If the request queue does - * not require flush operations, just skip it altogether. If the caller needs - * to wait for the flush completion at a later point in time, they must supply a - * valid completion. This will be signalled when the flush completes. The - * caller never sees the bio that is issued here. - */ -void -xfs_flush_bdev_async( - struct bio *bio, - struct block_device *bdev, - struct completion *done) -{ - struct request_queue *q = bdev->bd_disk->queue; - - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - complete(done); - return; - } - - bio_init(bio, NULL, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - bio->bi_private = done; - bio->bi_end_io = xfs_flush_bdev_async_endio; - - submit_bio(bio); -} int xfs_rw_bdev( struct block_device *bdev, @@ -61,10 +26,9 @@ xfs_rw_bdev( if (is_vmalloc && op == REQ_OP_WRITE) flush_kernel_vmap_range(data, count); - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, + GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op | REQ_META | REQ_SYNC; do { struct page *page = kmem_to_page(data); @@ -74,10 +38,9 @@ xfs_rw_bdev( while (bio_add_page(bio, page, len, off) != len) { struct bio *prev = bio; - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_copy_dev(bio, prev); + bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio->bi_opf = prev->bi_opf; bio_chain(prev, bio); submit_bio(prev); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e1f4d7d5a011..761dde155099 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -463,7 +463,7 @@ xfs_bui_item_recover( struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_filblks_t count; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a405..e1afb9e503e1 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" +#include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" @@ -813,7 +814,15 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!xfs_is_shutdown(target->bt_mount)) + /* + * Check against log shutdown for error reporting because + * metadata writeback may require a read first and we need to + * report errors in metadata writeback until the log is shut + * down. High level transaction read functions already check + * against mount shutdown, anyway, so we only need to be + * concerned about low level IO interactions here. + */ + if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -843,9 +852,6 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) - return; - xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -1177,10 +1183,10 @@ xfs_buf_ioend_handle_error( struct xfs_error_cfg *cfg; /* - * If we've already decided to shutdown the filesystem because of I/O - * errors, there's no point in giving this a retry. + * If we've already shutdown the journal because of I/O errors, there's + * no point in giving this a retry. */ - if (xfs_is_shutdown(mp)) + if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1440,12 +1446,10 @@ next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = bio_max_segs(total_nr_pages); - bio = bio_alloc(GFP_NOIO, nr_pages); - bio_set_dev(bio, bp->b_target->bt_bdev); + bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1593,8 +1597,23 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - /* on shutdown we stale and complete the buffer immediately */ - if (xfs_is_shutdown(bp->b_mount)) { + /* + * On log shutdown we stale and complete the buffer immediately. We can + * be called to read the superblock before the log has been set up, so + * be careful checking the log state. + * + * Checking the mount shutdown state here can result in the log tail + * moving inappropriately on disk as the log may not yet be shut down. + * i.e. failing this buffer on mount shutdown can remove it from the AIL + * and move the tail of the log forwards without having written this + * buffer to disk. This corrupts the log tail state in memory, and + * because the log may not be shut down yet, it can then be propagated + * to disk before the log is shutdown. Hence we check log shutdown + * state here rather than mount state to avoid corrupting the log tail + * on shutdown. + */ + if (bp->b_mount->m_log && + xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1808,10 +1827,10 @@ xfs_buftarg_drain( * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts - * down the fs. + * down the journal. */ if (write_fail) { - ASSERT(xfs_is_shutdown(btp->bt_mount)); + ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } @@ -2094,12 +2113,13 @@ xfs_buf_delwri_submit_buffers( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { + if (!xfs_buf_trylock(bp)) + continue; if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); pinned++; continue; } - if (!xfs_buf_trylock(bp)) - continue; } else { xfs_buf_lock(bp); } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a7a8e4528881..522d450a94b1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -21,6 +21,7 @@ #include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_buf_item_cache; @@ -428,7 +429,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_has_v3inodes(lip->li_mountp) || + if (xfs_has_v3inodes(lip->li_log->l_mp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -616,7 +617,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xfs_is_shutdown(lip->li_mountp); + xlog_is_shutdown(lip->li_log); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ef9c9c5c17..0e50f2c9348e 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -604,7 +604,7 @@ xfs_efi_item_recover( struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 48287caad28b..10e1cb71439e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -864,8 +864,8 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; - use_rmap = capable(CAP_SYS_ADMIN) && - xfs_has_rmapbt(mp); + use_rmap = xfs_has_rmapbt(mp) && + has_capability_noaudit(current, CAP_SYS_ADMIN); head->fmh_entries = 0; /* Set up our device handlers. */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..68f74549fa22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -347,7 +348,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; @@ -430,46 +431,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + free = percpu_counter_sum(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp); + delta = request - mp->m_resblks; + mp->m_resblks = request; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + if (!error) + xfs_mod_fdblocks(mp, fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; } - out: if (outval) { outval->resblks = mp->m_resblks; @@ -528,8 +519,11 @@ xfs_do_force_shutdown( int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9644f938990c..bffd6eb0b298 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -23,6 +23,7 @@ #include "xfs_reflink.h" #include "xfs_ialloc.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" #include <linux/iversion.h> @@ -77,7 +78,7 @@ xfs_inode_alloc( * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL * and return NULL here on ENOMEM. */ - ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); + ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); if (inode_init_always(mp->m_super, VFS_I(ip))) { kmem_cache_free(xfs_inode_cache, ip); @@ -873,9 +874,16 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (xfs_is_shutdown(ip->i_mount)) { + /* + * Check for log shutdown because aborting the inode can move the log + * tail and corrupt in memory state. This is fine if the log is shut + * down, but if the log is still active and only the mount is shut down + * then the in-memory log tail movement caused by the abort can be + * incorrectly propagated to disk. + */ + if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 04bf467b1090..9de6205fe134 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -35,6 +35,7 @@ #include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_inode_cache; @@ -658,9 +659,9 @@ xfs_ip2xflags( */ int xfs_lookup( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t **ipp, + struct xfs_inode *dp, + const struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name) { xfs_ino_t inum; @@ -1217,7 +1218,7 @@ xfs_link( { xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - int error; + int error, nospace_error = 0; int resblks; trace_xfs_link(tdp, target_name); @@ -1236,19 +1237,11 @@ xfs_link( goto std_return; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); - } + error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, + &tp, &nospace_error); if (error) goto std_return; - xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) @@ -1306,6 +1299,8 @@ xfs_link( error_return: xfs_trans_cancel(tp); std_return: + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -2755,6 +2750,7 @@ xfs_remove( xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); + int dontcare; int error = 0; uint resblks; @@ -2772,31 +2768,24 @@ xfs_remove( goto std_return; /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. + * We try to get the real space reservation first, allowing for + * directory btree deletion(s) implying possible bmap insert(s). If we + * can't get the space reservation then we use 0 instead, and avoid the + * bmap btree insert(s) in the directory code by, if the bmap insert + * tries to happen, instead trimming the LAST block from the directory. + * + * Ignore EDQUOT and ENOSPC being returned via nospace_error because + * the directory code can handle a reservationless update and we don't + * want to prevent a user from trying to free space by deleting things. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, - &tp); - } + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, + &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); goto std_return; } - xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* * If we're removing a directory perform some additional validation. */ @@ -3109,7 +3098,8 @@ xfs_rename( bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; - int error; + bool retried = false; + int error, nospace_error = 0; trace_xfs_rename(src_dp, target_dp, src_name, target_name); @@ -3133,9 +3123,12 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); +retry: + nospace_error = 0; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { + nospace_error = error; spaceres = 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, &tp); @@ -3190,6 +3183,31 @@ xfs_rename( spaceres); /* + * Try to reserve quota to handle an expansion of the target directory. + * We'll allow the rename to continue in reservationless mode if we hit + * a space usage constraint. If we trigger reservationless mode, save + * the errno if there isn't any free space in the target directory. + */ + if (spaceres != 0) { + error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, + 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(target_dp, 0); + retried = true; + goto retry; + } + + nospace_error = error; + spaceres = 0; + error = 0; + } + if (error) + goto out_trans_cancel; + } + + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. * @@ -3435,6 +3453,8 @@ out_trans_cancel: out_release_wip: if (wip) xfs_irele(wip); + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -3611,7 +3631,7 @@ xfs_iflush_cluster( /* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; @@ -3659,7 +3679,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. */ - if (xfs_is_shutdown(mp)) { + if (xlog_is_shutdown(mp->m_log)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -3685,9 +3705,19 @@ xfs_iflush_cluster( } if (error) { + /* + * Shutdown first so we kill the log before we release this + * buffer. If it is an INODE_ALLOC buffer and pins the tail + * of the log, failing it before the _log_ is shut down can + * result in the log tail being moved forward in the journal + * on disk because log writes can still be taking place. Hence + * unpinning the tail will allow the ICREATE intent to be + * removed from the log an recovery will fail with uninitialised + * inode cluster buffers. + */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b7e8f14d9fca..740ab13d1aa2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -402,7 +402,7 @@ enum layout_break_reason { int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, +int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *name, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 90d8e591baf8..9e6ef55cf29e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include <linux/iversion.h> @@ -543,10 +544,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -720,6 +728,17 @@ xfs_iflush_ail_updates( if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; + /* + * dgc: Not sure how this happens, but it happens very + * occassionaly via generic/388. xfs_iflush_abort() also + * silently handles this same "under writeback but not in AIL at + * shutdown" condition via xfs_trans_ail_delete(). + */ + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { + ASSERT(xlog_is_shutdown(lip->li_log)); + continue; + } + lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; @@ -822,46 +841,143 @@ xfs_buf_inode_io_fail( } /* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. + */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation. + * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp; - if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so can we release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock); + + xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. + * Raced with another removal, hold the only reference + * to bp now. Inode should not be in the AIL now, so just clean + * up and return; */ - spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_flush_lsn = 0; - bp = iip->ili_item.li_buf; - iip->ili_item.li_buf = NULL; - list_del_init(&iip->ili_item.li_bio_list); + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); } + /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1a302000d604..bbd836a44ff0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2515fe8299e1..83481005317a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1189,7 +1189,7 @@ xfs_ioctl_setattr_get_trans( goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 004ed2a251e8..ca25ed89b706 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -217,7 +217,7 @@ xfs_compat_ioc_fsbulkstat( inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat; bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat; -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { /* * ... but on x32 the input xfs_fsop_bulkreq has pointers diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b79b3846e71b..b34e8e4344a8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -613,37 +613,6 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; -} - -void -xfs_setattr_time( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (iattr->ia_valid & ATTR_ATIME) - inode->i_atime = iattr->ia_atime; - if (iattr->ia_valid & ATTR_CTIME) - inode->i_ctime = iattr->ia_ctime; - if (iattr->ia_valid & ATTR_MTIME) - inode->i_mtime = iattr->ia_mtime; -} - static int xfs_vn_change_ok( struct user_namespace *mnt_userns, @@ -678,10 +647,10 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; - kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; - struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL; ASSERT((mask & ATTR_SIZE) == 0); @@ -723,66 +692,30 @@ xfs_setattr_nonsize( } error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_dqrele; /* - * Change file ownership. Must be the owner or privileged. + * Register quota modifications in the transaction. Must be the owner + * or privileged. These IDs could have changed since we last looked at + * them. But, we're assured that if the ownership did change while we + * didn't have the inode locked, inode's dquot(s) would have changed + * also. */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = inode->i_uid; - igid = inode->i_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((inode->i_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - inode->i_mode &= ~(S_ISUID|S_ISGID); - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (!uid_eq(iuid, uid)) { - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - inode->i_uid = uid; - } - if (!gid_eq(igid, gid)) { - if (XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_has_pquotino(mp) || - !XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - inode->i_gid = gid; - } + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && + !uid_eq(inode->i_uid, iattr->ia_uid)) { + ASSERT(udqp); + old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && + !gid_eq(inode->i_gid, iattr->ia_gid)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); + ASSERT(gdqp); + old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - if (mask & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -794,8 +727,8 @@ xfs_setattr_nonsize( /* * Release any dquot(s) the inode had kept before chown. */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(old_udqp); + xfs_qm_dqrele(old_gdqp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1006,11 +939,8 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (iattr->ia_valid & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 09a8fba84ff9..cb9105d667db 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); -void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, - struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 89fec9a18c34..499e15b24215 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -487,7 +487,10 @@ out_error: * Run all the pending iclog callbacks and wake log force waiters and iclog * space waiters so they can process the newly set shutdown state. We really * don't care what order we process callbacks here because the log is shut down - * and so state cannot change on disk anymore. + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * * We avoid processing actively referenced iclogs so that we don't run callbacks * while the iclog owner might still be preparing the iclog for IO submssion. @@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks( struct xlog_in_core *iclog; LIST_HEAD(cb_list); - spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { @@ -509,26 +511,22 @@ xlog_state_shutdown_callbacks( continue; } list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); + + spin_lock(&log->l_icloglock); wake_up_all(&iclog->ic_write_wait); wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); - spin_unlock(&log->l_icloglock); - - xlog_cil_process_committed(&cb_list); } /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * - * If the caller passes in a non-zero @old_tail_lsn and the current log tail - * does not match, there may be metadata on disk that must be persisted before - * this iclog is written. To satisfy that requirement, set the - * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new - * log tail value. - * * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the * log tail is updated correctly. NEED_FUA indicates that the iclog will be * written to stable storage, and implies that a commit record is contained @@ -545,12 +543,10 @@ xlog_state_shutdown_callbacks( * always capture the tail lsn on the iclog on the first NEED_FUA release * regardless of the number of active reference counts on this iclog. */ - int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t old_tail_lsn) + struct xlog_in_core *iclog) { xfs_lsn_t tail_lsn; bool last_ref; @@ -561,18 +557,14 @@ xlog_state_release_iclog( /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does - * not move between deciding if a cache flush is required and writing - * the LSN into the iclog below. + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. */ - if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - if (old_tail_lsn && tail_lsn != old_tail_lsn) - iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; - - if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && - !iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -583,11 +575,8 @@ xlog_state_release_iclog( * pending iclog callbacks that were waiting on the release of * this iclog. */ - if (last_ref) { - spin_unlock(&log->l_icloglock); + if (last_ref) xlog_state_shutdown_callbacks(log); - spin_lock(&log->l_icloglock); - } return -EIO; } @@ -600,8 +589,6 @@ xlog_state_release_iclog( } iclog->ic_state = XLOG_STATE_SYNCING; - if (!iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog); trace_xlog_iclog_syncing(iclog, _RET_IP_); @@ -812,10 +799,9 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; + xfs_log_work_queue(mp); if (xlog_recovery_needed(log)) error = xlog_recover_finish(log); - if (!error) - xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); @@ -874,7 +860,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog); } /* @@ -1102,7 +1088,7 @@ xfs_log_item_init( int type, const struct xfs_item_ops *ops) { - item->li_mountp = mp; + item->li_log = mp->m_log; item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; @@ -1374,7 +1360,7 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } xlog_state_done_syncing(iclog); @@ -1883,19 +1869,19 @@ xlog_write_iclog( return; } - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); - bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); - iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; - iclog->ic_bio.bi_end_io = xlog_bio_end_io; - iclog->ic_bio.bi_private = iclog; - /* * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more * IOs coming immediately after this one. This prevents the block layer * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. */ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec, + howmany(count, PAGE_SIZE), + REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE); + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; + iclog->ic_bio.bi_end_io = xlog_bio_end_io; + iclog->ic_bio.bi_private = iclog; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { iclog->ic_bio.bi_opf |= REQ_PREFLUSH; /* @@ -1913,7 +1899,7 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return; } if (is_vmalloc_addr(iclog->ic_data)) @@ -2412,7 +2398,7 @@ xlog_write_copy_finish( ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || xlog_is_shutdown(log)); release_iclog: - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; } @@ -2488,7 +2474,7 @@ xlog_write( xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } len = xlog_write_calc_vec_length(ticket, log_vector, optype); @@ -2629,7 +2615,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; @@ -3053,7 +3039,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3822,9 +3808,10 @@ xlog_verify_iclog( #endif /* - * Perform a forced shutdown on the log. This should be called once and once - * only by the high level filesystem shutdown code to shut the log subsystem - * down cleanly. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. * * Our main objectives here are to make sure that: * a. if the shutdown was not due to a log IO error, flush the logs to @@ -3833,6 +3820,8 @@ xlog_verify_iclog( * parties to find out. Nothing new gets queued after this is done. * c. Tasks sleeping on log reservations, pinned objects and * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * * Return true if the shutdown cause was a log IO error and we actually shut the * log down. @@ -3844,25 +3833,25 @@ xlog_force_shutdown( { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); - /* - * If this happens during log recovery then we aren't using the runtime - * log mechanisms yet so there's nothing to shut down. - */ - if (!log || xlog_in_recovery(log)) + if (!log) return false; - ASSERT(!xlog_is_shutdown(log)); - /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs * to disk. * - * Re-entry due to a log IO error shutdown during the log force is - * prevented by the atomicity of higher level shutdown code. + * When we are in recovery, there are no transactions to flush, and + * we don't want to touch the log because we don't want to perturb the + * current head/tail for future recovery attempts. Hence we need to + * avoid a log force in this case. + * + * If we are shutting down due to a log IO error, then we must avoid + * trying to write the log as that may just result in more IO errors and + * an endless shutdown/force loop. */ - if (!log_error) + if (!log_error && !xlog_in_recovery(log)) xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* @@ -3879,12 +3868,25 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); - ASSERT(0); return false; } spin_unlock(&log->l_icloglock); /* + * If this log shutdown also sets the mount shutdown state, issue a + * shutdown warning message. + */ + if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, +"Filesystem has been shut down due to log error (0x%x).", + shutdown_flags); + xfs_alert(log->l_mp, +"Please unmount the filesystem and rectify the problem(s)."); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); + } + + /* * We don't want anybody waiting for log reservations after this. That * means we have to wake up everybody queued up on reserveq as well as * writeq. In addition, we make sure in xlog_{re}grant_log_space that @@ -3904,8 +3906,12 @@ xlog_force_shutdown( wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); + + spin_lock(&log->l_icloglock); xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 83a039762b81..ba57323bfdce 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -540,7 +540,7 @@ xlog_cil_insert_items( spin_unlock(&cil->xc_cil_lock); if (tp->t_ticket->t_curr_res < 0) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } static void @@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state( * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can - * move the tail beyond the grant write head. + * move the grant write head beyond the tail LSN and overwrite + * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); + + /* + * Make sure the metadata we are about to overwrite in the log + * has been flushed to stable storage before this iclog is + * issued. + */ + spin_lock(&cil->xc_log->l_icloglock); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + spin_unlock(&cil->xc_log->l_icloglock); return; } @@ -854,7 +864,7 @@ xlog_cil_write_commit_record( error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -888,10 +898,7 @@ xlog_cil_push_work( struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; - xfs_lsn_t preflush_tail_lsn; xfs_csn_t push_seq; - struct bio bio; - DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable; new_ctx = xlog_cil_ctx_alloc(); @@ -962,23 +969,6 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); /* - * The CIL is stable at this point - nothing new will be added to it - * because we hold the flush lock exclusively. Hence we can now issue - * a cache flush to ensure all the completed metadata in the journal we - * are about to overwrite is on stable storage. - * - * Because we are issuing this cache flush before we've written the - * tail lsn to the iclog, we can have metadata IO completions move the - * tail forwards between the completion of this flush and the iclog - * being written. In this case, we need to re-issue the cache flush - * before the iclog write. To detect whether the log tail moves, sample - * the tail LSN *before* we issue the flush. - */ - preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); - xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, - &bdev_flush); - - /* * Pull all the log vectors off the items in the CIL, and remove the * items from the CIL. We don't need the CIL lock here because it's only * needed on the transaction commit side which is currently locked out @@ -1054,12 +1044,6 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - /* - * Before we format and submit the first iclog, we have to ensure that - * the metadata writeback ordering cache flush is complete. - */ - wait_for_completion(&bdev_flush); - error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; @@ -1118,7 +1102,7 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ @@ -1139,7 +1123,7 @@ out_abort_free_ticket: return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); } @@ -1243,18 +1227,27 @@ xlog_cil_push_now( if (!async) flush_workqueue(cil->xc_push_wq); + spin_lock(&cil->xc_push_lock); + + /* + * If this is an async flush request, we always need to set the + * xc_push_commit_stable flag even if something else has already queued + * a push. The flush caller is asking for the CIL to be on stable + * storage when the next push completes, so regardless of who has queued + * the push, the flush requires stable semantics from it. + */ + cil->xc_push_commit_stable = async; + /* * If the CIL is empty or we've already pushed the sequence then - * there's no work we need to do. + * there's no more work that we need to do. */ - spin_lock(&cil->xc_push_lock); if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - cil->xc_push_commit_stable = async; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1352,6 +1345,13 @@ xlog_cil_flush( trace_xfs_log_force(log->l_mp, seq, _RET_IP_); xlog_cil_push_now(log, seq, true); + + /* + * If the CIL is empty, make sure that any previous checkpoint that may + * still be in an active iclog is pushed to stable storage. + */ + if (list_empty(&log->l_cilp->xc_cil)) + xfs_log_force(log->l_mp, 0); } /* @@ -1468,7 +1468,7 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; + struct xfs_cil *cil = lip->li_log->l_cilp; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..401cdc400980 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until the xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( @@ -524,8 +535,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t log_tail_lsn); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96c997ed2ec8..c4ad4296c540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2485,7 +2485,7 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -2519,21 +2519,22 @@ xlog_abort_defer_ops( xfs_defer_ops_capture_free(mp, dfc); } } + /* * When this is called, all of the log intent items which did not have - * corresponding log done items should be in the AIL. What we do now - * is update the data structures associated with each one. + * corresponding log done items should be in the AIL. What we do now is update + * the data structures associated with each one. * - * Since we process the log intent items in normal transactions, they - * will be removed at some point after the commit. This prevents us - * from just walking down the list processing each one. We'll use a - * flag in the intent item to skip those that we've already processed - * and use the AIL iteration mechanism's generation count to try to - * speed this up at least a bit. + * Since we process the log intent items in normal transactions, they will be + * removed at some point after the commit. This prevents us from just walking + * down the list processing each one. We'll use a flag in the intent item to + * skip those that we've already processed and use the AIL iteration mechanism's + * generation count to try to speed this up at least a bit. * - * When we start, we know that the intents are the only things in the - * AIL. As we process them, however, other items are added to the - * AIL. + * When we start, we know that the intents are the only things in the AIL. As we + * process them, however, other items are added to the AIL. Hence we know we + * have started recovery on all the pending intents when we find an non-intent + * item in the AIL. */ STATIC int xlog_recover_process_intents( @@ -2556,17 +2557,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } /* * We should never see a redo item with a LSN higher than @@ -2607,8 +2599,9 @@ err: } /* - * A cancel occurs when the mount has failed and we're bailing out. - * Release all pending log intent items so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending log intent items that we haven't started recovery on so they don't + * pin the AIL. */ STATIC void xlog_recover_cancel_intents( @@ -2622,17 +2615,8 @@ xlog_recover_cancel_intents( spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } spin_unlock(&ailp->ail_lock); lip->li_ops->iop_release(lip); @@ -3470,7 +3454,7 @@ xlog_recover_finish( */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3517,7 +3501,7 @@ xlog_recover_finish( * end of intents processing can be pushed through the CIL * and AIL. */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bed73e8002a5..c5f153c3693f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -21,6 +21,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" @@ -1146,7 +1147,7 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 00720a02e761..f6dc19de8322 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -479,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +static inline uint64_t +xfs_fdblocks_unavailable( + struct xfs_mount *mp) +{ + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4abe17312c2b..37a24f0f7cd4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -319,7 +319,8 @@ xfs_fs_commit_blocks( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_setattr_time(ip, iattr); + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(&init_user_ns, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 32ac8d9c8940..f165d1a3de1d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_ialloc.h" +#include "xfs_log_priv.h" /* * The global quota manager. There is only one of these for the entire @@ -121,8 +122,7 @@ xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; int error = -EAGAIN; xfs_dqlock(dqp); @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(xfs_is_shutdown(mp) || + ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -172,7 +172,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(mp, xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d3da67772d57..0d868c93144d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -457,7 +457,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; unsigned int refc_type; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index db70060e7bf6..54e68e5693fd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -425,7 +425,10 @@ convert: if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, cmap); - return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + return error; out_trans_cancel: xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c3966b4c58ef..a22b2d19ef91 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -510,7 +510,7 @@ xfs_rui_item_recover( struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; enum xfs_rmap_intent_type type; xfs_exntst_t state; int i; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4c0dee78b2f8..54be9d64093e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -815,7 +815,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); @@ -1753,6 +1754,11 @@ xfs_remount_ro( }; int error; + /* Flush all the dirty data to disk. */ + error = sync_filesystem(mp->m_super); + if (error) + return error; + /* * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. @@ -1831,8 +1837,6 @@ xfs_fs_reconfigure( if (error) return error; - sync_filesystem(mp->m_super); - /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a8076ef8cb4..b141ef78c755 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); DECLARE_EVENT_CLASS(xfs_namespace_class, - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), TP_ARGS(dp, name), TP_STRUCT__entry( __field(dev_t, dev) @@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, #define DEFINE_NAMESPACE_EVENT(name) \ DEFINE_EVENT(xfs_namespace_class, name, \ - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \ TP_ARGS(dp, name)) DEFINE_NAMESPACE_EVENT(xfs_remove); DEFINE_NAMESPACE_EVENT(xfs_link); @@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(xfs_lsn_t, lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; @@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(xfs_lsn_t, new_lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 59e2f9031b9f..0ac717aad380 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -648,7 +648,7 @@ xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_log == tp->t_mountp->m_log); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); ASSERT(list_empty(&lip->li_trans)); ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); @@ -775,7 +775,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(xfs_is_shutdown(ailp->ail_mount)); + ASSERT(xlog_is_shutdown(ailp->ail_log)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; @@ -836,6 +836,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -864,7 +865,13 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (xfs_is_shutdown(mp)) { + /* + * We must check against log shutdown here because we cannot abort log + * items and leave them dirty, inconsistent and unpinned in memory while + * the log is active. This leaves them open to being written back to + * disk, and that will lead to on-disk corruption. + */ + if (xlog_is_shutdown(log)) { error = -EIO; goto out_unreserve; } @@ -878,7 +885,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -905,10 +912,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !xlog_is_shutdown(mp->m_log)) - xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + if (regrant && !xlog_is_shutdown(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); else - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } xfs_trans_free_items(tp, !!error); @@ -926,18 +933,27 @@ xfs_trans_commit( } /* - * Unlock all of the transaction's items and free the transaction. - * The transaction must not have modified any of its items, because - * there is no way to restore them to their previous state. + * Unlock all of the transaction's items and free the transaction. If the + * transaction is dirty, we must shut down the filesystem because there is no + * way to restore them to their previous state. * - * If the transaction has made a log reservation, make sure to release - * it as well. + * If the transaction has made a log reservation, make sure to release it as + * well. + * + * This is a high level function (equivalent to xfs_trans_commit()) and so can + * be called after the transaction has effectively been aborted due to the mount + * being shut down. However, if the mount has not been shut down and the + * transaction is dirty we will shut the mount down and, in doing so, that + * guarantees that the log is shut down, too. Hence we don't need to be as + * careful with shutdown state and dirty items here as we need to be in + * xfs_trans_commit(). */ void xfs_trans_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); trace_xfs_trans_cancel(tp, _RET_IP_); @@ -955,16 +971,18 @@ xfs_trans_cancel( } /* - * See if the caller is relying on us to shut down the - * filesystem. This happens in paths where we detect - * corruption and decide to give up. + * See if the caller is relying on us to shut down the filesystem. We + * only want an error report if there isn't already a shutdown in + * progress, so we only need to check against the mount shutdown state + * here. */ if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !xfs_is_shutdown(mp)) { + /* Log items need to be consistent until the log is shut down. */ + if (!dirty && !xlog_is_shutdown(log)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) @@ -975,7 +993,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } @@ -1210,3 +1228,89 @@ out_cancel: xfs_trans_cancel(tp); return error; } + +/* + * Allocate an transaction, lock and join the directory and child inodes to it, + * and reserve quota for a directory update. If there isn't sufficient space, + * @dblocks will be set to zero for a reservationless directory update and + * @nospace_error will be set to a negative errno describing the space + * constraint we hit. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCKs will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_dir( + struct xfs_inode *dp, + struct xfs_trans_res *resv, + struct xfs_inode *ip, + unsigned int *dblocks, + struct xfs_trans **tpp, + int *nospace_error) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + unsigned int resblks; + bool retried = false; + int error; + +retry: + *nospace_error = 0; + resblks = *dblocks; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + if (error == -ENOSPC) { + *nospace_error = error; + resblks = 0; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + } + if (error) + return error; + + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(dp, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + if (resblks == 0) + goto done; + + error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(dp, 0); + retried = true; + goto retry; + } + + *nospace_error = error; + resblks = 0; + error = 0; + } + if (error) + goto out_cancel; + +done: + *tpp = tp; + *dblocks = resblks; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a487b264a9eb..de177842b951 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -8,6 +8,7 @@ /* kernel only transaction subsystem defines */ +struct xlog; struct xfs_buf; struct xfs_buftarg; struct xfs_efd_log_item; @@ -31,7 +32,7 @@ struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xlog *li_log; struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ unsigned long li_flags; /* misc flags */ @@ -259,6 +260,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv, + struct xfs_inode *ip, unsigned int *dblocks, + struct xfs_trans **tpp, int *nospace_error); static inline void xfs_trans_set_context( diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2a8c8dc54c95..d3a97a028560 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -398,7 +398,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* @@ -418,7 +418,7 @@ static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->ail_mount; + struct xfs_mount *mp = ailp->ail_log->l_mp; struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; @@ -443,15 +443,27 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xlog_cil_flush(mp->m_log); + xlog_cil_flush(ailp->ail_log); } spin_lock(&ailp->ail_lock); - /* barrier matches the ail_target update in xfs_ail_push() */ - smp_rmb(); - target = ailp->ail_target; - ailp->ail_target_prev = target; + /* + * If we have a sync push waiter, we always have to push till the AIL is + * empty. Update the target to point to the end of the AIL so that + * capture updates that occur after the sync push waiter has gone to + * sleep. + */ + if (waitqueue_active(&ailp->ail_empty)) { + lip = xfs_ail_max(ailp); + if (lip) + target = lip->li_lsn; + } else { + /* barrier matches the ail_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->ail_target; + ailp->ail_target_prev = target; + } /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); @@ -620,7 +632,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - xfs_is_shutdown(ailp->ail_mount)); + xlog_is_shutdown(ailp->ail_log)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -683,7 +695,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || xfs_is_shutdown(ailp->ail_mount) || + if (!lip || xlog_is_shutdown(ailp->ail_log) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -724,7 +736,6 @@ xfs_ail_push_all_sync( spin_lock(&ailp->ail_lock); while ((lip = xfs_ail_max(ailp)) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->ail_target = lip->li_lsn; wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); schedule(); @@ -740,7 +751,7 @@ xfs_ail_update_finish( struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; /* if the tail lsn hasn't changed, don't do updates or wakeups. */ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { @@ -748,13 +759,13 @@ xfs_ail_update_finish( return; } - if (!xfs_is_shutdown(mp)) - xlog_assign_tail_lsn_locked(mp); + if (!xlog_is_shutdown(log)) + xlog_assign_tail_lsn_locked(log->l_mp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(mp); + xfs_log_space_wake(log->l_mp); } /* @@ -862,17 +873,17 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xfs_is_shutdown(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + if (shutdown_type && !xlog_is_shutdown(log)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, shutdown_type); + xlog_force_shutdown(log, shutdown_type); } return; } @@ -893,7 +904,7 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->ail_mount = mp; + ailp->ail_log = mp->m_log; INIT_LIST_HEAD(&ailp->ail_head); INIT_LIST_HEAD(&ailp->ail_cursors); spin_lock_init(&ailp->ail_lock); @@ -901,7 +912,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_super->s_id); + mp->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 3004aeac9110..f0d79a9050ba 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -6,6 +6,7 @@ #ifndef __XFS_TRANS_PRIV_H__ #define __XFS_TRANS_PRIV_H__ +struct xlog; struct xfs_log_item; struct xfs_mount; struct xfs_trans; @@ -50,7 +51,7 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. */ struct xfs_ail { - struct xfs_mount *ail_mount; + struct xlog *ail_log; struct task_struct *ail_task; struct list_head ail_head; xfs_lsn_t ail_target; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b76dfb310ab6..3614c7834007 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -185,9 +185,9 @@ static const struct address_space_operations zonefs_file_aops = { .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .releasepage = iomap_releasepage, - .invalidatepage = iomap_invalidatepage, + .invalidate_folio = iomap_invalidate_folio, .migratepage = iomap_migrate_page, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -692,12 +692,10 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc(GFP_NOFS, nr_pages); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_write_hint = iocb->ki_hint; bio->bi_ioprio = iocb->ki_ioprio; - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; if (iocb->ki_flags & IOCB_DSYNC) bio->bi_opf |= REQ_FUA; @@ -1137,7 +1135,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) { struct zonefs_inode_info *zi; - zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL); + zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); if (!zi) return NULL; @@ -1541,10 +1539,8 @@ static int zonefs_read_super(struct super_block *sb) if (!page) return -ENOMEM; - bio_init(&bio, &bio_vec, 1); + bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = 0; - bio.bi_opf = REQ_OP_READ; - bio_set_dev(&bio, sb->s_bdev); bio_add_page(&bio, page, PAGE_SIZE, 0); ret = submit_bio_wait(&bio); |