Diffstat (limited to 'fs'): 420 files changed, 16958 insertions, 17819 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index f2ba131cede1..55e108e5e133 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -16,186 +16,61 @@ #include "v9fs.h" #include "cache.h" -#define CACHETAG_LEN 11 - -struct fscache_netfs v9fs_cache_netfs = { - .name = "9p", - .version = 0, -}; - -/* - * v9fs_random_cachetag - Generate a random tag to be associated - * with a new cache session. - * - * The value of jiffies is used for a fairly randomly cache tag. - */ - -static -int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name) { - v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); - if (!v9ses->cachetag) - return -ENOMEM; + struct fscache_volume *vcookie; + char *name, *p; - return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); -} - -const struct fscache_cookie_def v9fs_cache_session_index_def = { - .name = "9P.session", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; + name = kasprintf(GFP_KERNEL, "9p,%s,%s", + dev_name, v9ses->cachetag ?: v9ses->aname); + if (!name) + return -ENOMEM; -void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) -{ - /* If no cache session tag was specified, we generate a random one. */ - if (!v9ses->cachetag) { - if (v9fs_random_cachetag(v9ses) < 0) { - v9ses->fscache = NULL; - kfree(v9ses->cachetag); - v9ses->cachetag = NULL; - return; + for (p = name; *p; p++) + if (*p == '/') + *p = ';'; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + p9_debug(P9_DEBUG_FSC, "session %p get volume %p (%s)\n", + v9ses, vcookie, name); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); } + pr_err("Cache volume key already in use (%s)\n", name); + vcookie = NULL; } - - v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, - &v9fs_cache_session_index_def, - v9ses->cachetag, - strlen(v9ses->cachetag), - NULL, 0, - v9ses, 0, true); - p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", - v9ses, v9ses->fscache); -} - -void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) -{ - p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", - v9ses, v9ses->fscache); - fscache_relinquish_cookie(v9ses->fscache, NULL, false); - v9ses->fscache = NULL; -} - -static enum -fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - - if (buflen != sizeof(v9inode->qid.version)) - return FSCACHE_CHECKAUX_OBSOLETE; - - if (memcmp(buffer, &v9inode->qid.version, - sizeof(v9inode->qid.version))) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; + v9ses->fscache = vcookie; + kfree(name); + return 0; } -const struct fscache_cookie_def v9fs_cache_inode_index_def = { - .name = "9p.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = v9fs_cache_inode_check_aux, -}; - void v9fs_cache_inode_get_cookie(struct inode *inode) { struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; + __le32 version; + __le64 path; if (!S_ISREG(inode->i_mode)) return; v9inode = V9FS_I(inode); - if (v9inode->fscache) + if (WARN_ON(v9inode->fscache)) return; + version = cpu_to_le32(v9inode->qid.version); + path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - 
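The hunk above replaces the old per-session index cookie with a single fscache volume. A condensed sketch of the acquisition pattern it introduces follows; the function name is illustrative, and the error handling mirrors the hunk (a volume key collision reports -EBUSY and is treated as "run uncached" rather than as a mount failure):

        /* Build a volume key of the form "9p,<dev>,<cachetag-or-aname>".
         * Volume keys become path components on the cache side, so any
         * '/' must be substituted before handing the string to fscache. */
        static struct fscache_volume *example_acquire_volume(const char *dev_name,
                                                             const char *tag)
        {
                struct fscache_volume *volume;
                char *key, *p;

                key = kasprintf(GFP_KERNEL, "9p,%s,%s", dev_name, tag);
                if (!key)
                        return ERR_PTR(-ENOMEM);
                for (p = key; *p; p++)
                        if (*p == '/')
                                *p = ';';

                volume = fscache_acquire_volume(key, NULL, NULL, 0);
                if (volume == ERR_PTR(-EBUSY)) {
                        pr_err("Cache volume key already in use (%s)\n", key);
                        volume = NULL;          /* key collision: run uncached */
                }
                kfree(key);
                return volume;  /* other errors still propagate as ERR_PTR */
        }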
sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); + v9inode->fscache = + fscache_acquire_cookie(v9fs_session_cache(v9ses), + 0, + &path, sizeof(path), + &version, sizeof(version), + i_size_read(&v9inode->vfs_inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); } - -void v9fs_cache_inode_put_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version, - false); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_flush_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - - mutex_lock(&v9inode->fscache_lock); - - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - v9fs_cache_inode_flush_cookie(inode); - else - v9fs_cache_inode_get_cookie(inode); - - mutex_unlock(&v9inode->fscache_lock); -} - -void v9fs_cache_inode_reset_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct v9fs_session_info *v9ses; - struct fscache_cookie *old; - - if (!v9inode->fscache) - return; - - old = v9inode->fscache; - - mutex_lock(&v9inode->fscache_lock); - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - - v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); - p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", - inode, old, v9inode->fscache); - - mutex_unlock(&v9inode->fscache_lock); -} diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 7480b4b49fea..1923affcdc62 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -7,26 +7,15 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#define FSCACHE_USE_NEW_IO_API + #include <linux/fscache.h> #ifdef CONFIG_9P_FSCACHE -extern struct fscache_netfs v9fs_cache_netfs; -extern const struct fscache_cookie_def v9fs_cache_session_index_def; -extern const struct fscache_cookie_def v9fs_cache_inode_index_def; - -extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); -extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); +extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name); extern void v9fs_cache_inode_get_cookie(struct inode *inode); -extern void v9fs_cache_inode_put_cookie(struct inode *inode); -extern void v9fs_cache_inode_flush_cookie(struct inode *inode); -extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); -extern void v9fs_cache_inode_reset_cookie(struct inode *inode); - -extern int __v9fs_cache_register(void); -extern void __v9fs_cache_unregister(void); #else /* CONFIG_9P_FSCACHE */ @@ -34,13 +23,5 @@ static inline void v9fs_cache_inode_get_cookie(struct inode *inode) { } -static inline void v9fs_cache_inode_put_cookie(struct inode *inode) -{ -} - -static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct 
file *file) -{ -} - #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e32dd5f7721b..08f65c40af4f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -469,7 +469,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - v9fs_cache_session_get_cookie(v9ses); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + rc = v9fs_cache_session_get_cookie(v9ses, dev_name); + if (rc < 0) + goto err_clnt; + } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); @@ -502,8 +506,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) } #ifdef CONFIG_9P_FSCACHE - if (v9ses->fscache) - v9fs_cache_session_put_cookie(v9ses); + fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); @@ -665,20 +668,12 @@ static int v9fs_cache_register(void) ret = v9fs_init_inode_cache(); if (ret < 0) return ret; -#ifdef CONFIG_9P_FSCACHE - ret = fscache_register_netfs(&v9fs_cache_netfs); - if (ret < 0) - v9fs_destroy_inode_cache(); -#endif return ret; } static void v9fs_cache_unregister(void) { v9fs_destroy_inode_cache(); -#ifdef CONFIG_9P_FSCACHE - fscache_unregister_netfs(&v9fs_cache_netfs); -#endif } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1647a8e63671..bc8b30205d36 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -89,7 +89,7 @@ struct v9fs_session_info { unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; - struct fscache_cookie *fscache; + struct fscache_volume *fscache; #endif char *uname; /* user name to mount as */ @@ -109,7 +109,6 @@ struct v9fs_session_info { struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE - struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; @@ -133,6 +132,16 @@ static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inod #endif } +static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses) +{ +#ifdef CONFIG_9P_FSCACHE + return v9ses->fscache; +#else + return NULL; +#endif +} + + extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index fac918ccb305..9a10e68c5f30 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/swap.h> #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> @@ -42,6 +43,11 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); + + /* if we just extended the file size, any portion not in + * cache won't be on server and is zeroes */ + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, err ?: total, false); } @@ -78,7 +84,7 @@ static bool v9fs_is_cache_enabled(struct inode *inode) { struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); + return fscache_cookie_enabled(cookie) && cookie->cache_priv; } /** @@ -87,9 +93,13 @@ static bool v9fs_is_cache_enabled(struct inode *inode) */ static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) { +#ifdef CONFIG_9P_FSCACHE struct fscache_cookie *cookie = 
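For the netfs read path above, the filesystem supplies a "begin cache operation" hook. A minimal sketch of the contract, using the fscache_begin_read_operation() entry point shown in the hunk: returning -ENOBUFS tells netfs there is no cache attached, so it reads from the server instead, and the same value doubles as the natural stub when fscache is compiled out.

        static int example_begin_cache_operation(struct netfs_read_request *rreq)
        {
        #ifdef CONFIG_9P_FSCACHE
                struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));

                /* Attach cache resources to the request; fails with
                 * -ENOBUFS if the cookie has no backing cache object. */
                return fscache_begin_read_operation(&rreq->cache_resources, cookie);
        #else
                return -ENOBUFS;        /* no cache: netfs reads from the server */
        #endif
        }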
v9fs_inode_cookie(V9FS_I(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); +#else + return -ENOBUFS; +#endif } static const struct netfs_read_request_ops v9fs_req_ops = { @@ -133,16 +143,18 @@ static void v9fs_vfs_readahead(struct readahead_control *ractl) static int v9fs_release_page(struct page *page, gfp_t gfp) { struct folio *folio = page_folio(page); + struct inode *inode = folio_inode(folio); if (folio_test_private(folio)) return 0; #ifdef CONFIG_9P_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return 0; folio_wait_fscache(folio); } #endif + fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode))); return 1; } @@ -161,10 +173,25 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset, folio_wait_fscache(folio); } +static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct v9fs_inode *v9inode = priv; + __le32 version; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) { + version = cpu_to_le32(v9inode->qid.version); + fscache_invalidate(v9fs_inode_cookie(v9inode), &version, + i_size_read(&v9inode->vfs_inode), 0); + } +} + static int v9fs_vfs_write_folio_locked(struct folio *folio) { struct inode *inode = folio_inode(folio); struct v9fs_inode *v9inode = V9FS_I(inode); + struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode); loff_t start = folio_pos(folio); loff_t i_size = i_size_read(inode); struct iov_iter from; @@ -181,10 +208,21 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); + folio_wait_fscache(folio); folio_start_writeback(folio); p9_client_write(v9inode->writeback_fid, start, &from, &err); + if (err == 0 && + fscache_cookie_enabled(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { + folio_start_fscache(folio); + fscache_write_to_cache(v9fs_inode_cookie(v9inode), + folio_mapping(folio), start, len, i_size, + v9fs_write_to_cache_done, v9inode, + true); + } + folio_end_writeback(folio); return err; } @@ -303,6 +341,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct folio *folio = page_folio(subpage); struct inode *inode = mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -322,6 +361,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (last_pos > inode->i_size) { inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); + fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos); } folio_mark_dirty(folio); out: @@ -331,11 +371,25 @@ out: return copied; } +#ifdef CONFIG_9P_FSCACHE +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. 
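The writeback hunk above adds write-through caching. A condensed sketch of the pattern, with names taken from the hunk: the cache copy is only started while the cookie is actively caching, the folio carries PG_fscache until the cache I/O completes, and a failed cache write is repaired by invalidating the cache object with fresh coherency data.

        static void example_write_done(void *priv, ssize_t transferred_or_error,
                                       bool was_async)
        {
                struct v9fs_inode *v9inode = priv;
                __le32 version = cpu_to_le32(v9inode->qid.version);

                /* Cache write failed (other than "no cache"): drop the
                 * now possibly-stale cache contents. */
                if (IS_ERR_VALUE(transferred_or_error) &&
                    transferred_or_error != -ENOBUFS)
                        fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
                                           i_size_read(&v9inode->vfs_inode), 0);
        }

        static void example_copy_to_cache(struct folio *folio, struct inode *inode)
        {
                struct v9fs_inode *v9inode = V9FS_I(inode);
                struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode);

                if (fscache_cookie_enabled(cookie) &&
                    test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
                        folio_start_fscache(folio);     /* cleared when cache I/O ends */
                        fscache_write_to_cache(cookie, folio_mapping(folio),
                                               folio_pos(folio), folio_size(folio),
                                               i_size_read(inode),
                                               example_write_done, v9inode, true);
                }
        }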
+ */ +static int v9fs_set_page_dirty(struct page *page) +{ + struct v9fs_inode *v9inode = V9FS_I(page->mapping->host); + + return fscache_set_page_dirty(page, v9fs_inode_cookie(v9inode)); +} +#else +#define v9fs_set_page_dirty __set_page_dirty_nobuffers +#endif const struct address_space_operations v9fs_addr_operations = { .readpage = v9fs_vfs_readpage, .readahead = v9fs_vfs_readahead, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = v9fs_set_page_dirty, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, .write_end = v9fs_write_end, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 8c854d8cb0cd..958680f7f23e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -17,6 +17,7 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/uio.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -205,7 +206,10 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) int v9fs_dir_release(struct inode *inode, struct file *filp) { + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; + __le32 version; + loff_t i_size; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -216,6 +220,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_unlock(&inode->i_lock); p9_client_clunk(fid); } + + if ((filp->f_mode & FMODE_WRITE)) { + version = cpu_to_le32(v9inode->qid.version); + i_size = i_size_read(inode); + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), + &version, &i_size); + } else { + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); + } return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 612e297f3763..2573c08f335c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -93,7 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) } mutex_unlock(&v9inode->v_mutex); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); return 0; out_error: @@ -114,7 +115,6 @@ out_error: static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { - int res = 0; struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -124,7 +124,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) invalidate_mapping_pages(&inode->i_data, 0, -1); } - return res; + return 0; } static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) @@ -139,8 +139,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) fid = filp->private_data; BUG_ON(fid == NULL); - if ((fl->fl_flags & FL_POSIX) != FL_POSIX) - BUG(); + BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 328c338ff304..2a10242c79c7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -233,7 +233,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - mutex_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; @@ -381,12 +380,16 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); + __le32 version; truncate_inode_pages_final(&inode->i_data); + version = 
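The vfs_dir.c and vfs_file.c hunks above pair fscache_use_cookie() at open with fscache_unuse_cookie() at release. A sketch of the release side, mirroring the hunk: after a writable open, the final unuse hands the cache the latest coherency data (the 9p QID version) and file size so the object can be revalidated and trimmed.

        static void example_release_cookie(struct inode *inode, struct file *filp)
        {
                struct v9fs_inode *v9inode = V9FS_I(inode);

                if (filp->f_mode & FMODE_WRITE) {
                        __le32 version = cpu_to_le32(v9inode->qid.version);
                        loff_t i_size = i_size_read(inode);

                        /* Writer: update coherency data and object size. */
                        fscache_unuse_cookie(v9fs_inode_cookie(v9inode),
                                             &version, &i_size);
                } else {
                        /* Reader: nothing changed, just drop the use count. */
                        fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL);
                }
        }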
cpu_to_le32(v9inode->qid.version); + fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, + &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); - v9fs_cache_inode_put_cookie(inode); + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); /* clunk the fid stashed in writeback_fid */ if (v9inode->writeback_fid) { p9_client_clunk(v9inode->writeback_fid); @@ -869,7 +872,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(d_inode(dentry), file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); file->f_mode |= FMODE_CREATED; @@ -1072,6 +1076,8 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; + struct inode *inode = d_inode(dentry); + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_wstat wstat; @@ -1117,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, /* Write all dirty data */ if (d_is_reg(dentry)) - filemap_write_and_wait(d_inode(dentry)->i_mapping); + filemap_write_and_wait(inode->i_mapping); retval = p9_client_wstat(fid, &wstat); @@ -1128,13 +1134,15 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(d_inode(dentry))) - truncate_setsize(d_inode(dentry), iattr->ia_size); + iattr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, iattr->ia_size); + fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + } - v9fs_invalidate_inode_attr(d_inode(dentry)); + v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, d_inode(dentry), iattr); - mark_inode_dirty(d_inode(dentry)); + setattr_copy(&init_user_ns, inode, iattr); + mark_inode_dirty(inode); return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 7dee89ba32e7..d17502a738a9 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -344,7 +344,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, goto err_clunk_old_fid; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, ofid); file->f_mode |= FMODE_CREATED; out: @@ -551,7 +552,10 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, { int retval, use_dentry = 0; struct p9_fid *fid = NULL; - struct p9_iattr_dotl p9attr; + struct p9_iattr_dotl p9attr = { + .uid = INVALID_UID, + .gid = INVALID_GID, + }; struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -561,14 +565,22 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, return retval; p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); - p9attr.mode = iattr->ia_mode; - p9attr.uid = iattr->ia_uid; - p9attr.gid = iattr->ia_gid; - p9attr.size = iattr->ia_size; - p9attr.atime_sec = iattr->ia_atime.tv_sec; - p9attr.atime_nsec = iattr->ia_atime.tv_nsec; - p9attr.mtime_sec = iattr->ia_mtime.tv_sec; - p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + if (iattr->ia_valid & ATTR_MODE) + p9attr.mode = iattr->ia_mode; + if (iattr->ia_valid & ATTR_UID) + p9attr.uid = iattr->ia_uid; + if (iattr->ia_valid & ATTR_GID) + p9attr.gid = 
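In the setattr hunk above, a size change now updates the cache object in step with the pagecache. A minimal sketch of that pairing (the helper name is illustrative):

        static void example_setattr_size(struct inode *inode, struct iattr *iattr)
        {
                struct v9fs_inode *v9inode = V9FS_I(inode);

                if ((iattr->ia_valid & ATTR_SIZE) &&
                    iattr->ia_size != i_size_read(inode)) {
                        truncate_setsize(inode, iattr->ia_size);
                        /* Keep the cache object's size in sync so it never
                         * claims data beyond the new EOF. */
                        fscache_resize_cookie(v9fs_inode_cookie(v9inode),
                                              iattr->ia_size);
                }
        }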
iattr->ia_gid; + if (iattr->ia_valid & ATTR_SIZE) + p9attr.size = iattr->ia_size; + if (iattr->ia_valid & ATTR_ATIME_SET) { + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME_SET) { + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + } if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b739e02f5ef7..97e23b4e6982 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -309,6 +310,7 @@ static int v9fs_write_inode(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } @@ -332,6 +334,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } diff --git a/fs/Kconfig b/fs/Kconfig index a6313a969bc5..7a2b11c0b803 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER Enable this to perform validation of the parameter description for a filesystem when it is registered. -if BLOCK - config FS_IOMAP bool +if BLOCK + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" @@ -42,6 +42,8 @@ source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" source "fs/zonefs/Kconfig" +endif # BLOCK + config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU @@ -89,8 +91,6 @@ config FS_DAX_PMD config FS_DAX_LIMITED bool -endif # BLOCK - # Posix ACL utility routines # # Note: Posix ACLs can be implemented without these helpers. Never use diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 75c4e4043d1d..e8956b65d7ff 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -3,10 +3,7 @@ # Makefile for Red Hat Linux AFS client. # -afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o - kafs-y := \ - $(afs-cache-y) \ addr_list.o \ callback.o \ cell.o \ diff --git a/fs/afs/cache.c b/fs/afs/cache.c deleted file mode 100644 index 037af93e3aba..000000000000 --- a/fs/afs/cache.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* AFS caching stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/sched.h> -#include "internal.h" - -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size); - -struct fscache_netfs afs_cache_netfs = { - .name = "afs", - .version = 2, -}; - -struct fscache_cookie_def afs_cell_cache_index_def = { - .name = "AFS.cell", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_volume_cache_index_def = { - .name = "AFS.volume", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = afs_vnode_cache_check_aux, -}; - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct afs_vnode_cache_aux aux; - - _enter("{%llx,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, buflen); - - memcpy(&aux, buffer, sizeof(aux)); - - /* check the size of the data is what we're expecting */ - if (buflen != sizeof(aux)) { - _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux)); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - if (vnode->status.data_version != aux.data_version) { - _leave(" = OBSOLETE [vers %llx != %llx]", - aux.data_version, vnode->status.data_version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = SUCCESS"); - return FSCACHE_CHECKAUX_OKAY; -} diff --git a/fs/afs/cell.c b/fs/afs/cell.c index d88407fb9bc0..07ad744eef77 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -680,13 +680,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) return ret; } -#ifdef CONFIG_AFS_FSCACHE - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell->name, strlen(cell->name), - NULL, 0, - cell, 0, true); -#endif ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -723,11 +716,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, NULL, false); - cell->cache = NULL; -#endif - _leave(""); } diff --git a/fs/afs/file.c b/fs/afs/file.c index cb6ad61eec3b..720818a7c166 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,6 +14,7 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/netfs.h> #include "internal.h" @@ -158,7 +159,9 @@ int afs_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - + + fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE); + file->private_data = af; _leave(" = 0"); return 0; @@ -177,8 +180,10 @@ error: */ int afs_release(struct inode *inode, struct file *file) { + struct afs_vnode_cache_aux aux; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; + loff_t i_size; int ret = 0; _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); @@ -189,6 +194,15 @@ int afs_release(struct inode *inode, struct file *file) file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); + + if ((file->f_mode & FMODE_WRITE)) { + i_size = i_size_read(&vnode->vfs_inode); + 
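The deleted afs check_aux callback above compared a caller-supplied buffer against vnode->status.data_version. In the rewritten API there is no callback: the netfs stores opaque coherency data with the cookie and the cache compares it byte for byte, which is why the replacement (shown later in the internal.h hunk) fixes the byte order:

        /* Coherency blob stored with each vnode cookie; big-endian so the
         * memcmp done by the cache is stable across architectures. */
        struct afs_vnode_cache_aux {
                __be64 data_version;
        } __packed;

        static inline void afs_set_cache_aux(struct afs_vnode *vnode,
                                             struct afs_vnode_cache_aux *aux)
        {
                aux->data_version = cpu_to_be64(vnode->status.data_version);
        }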
afs_set_cache_aux(vnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); + } + key_put(af->key); kfree(af); afs_prune_wb_keys(vnode); @@ -354,14 +368,19 @@ static bool afs_is_cache_enabled(struct inode *inode) { struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); + return fscache_cookie_enabled(cookie) && cookie->cache_priv; } static int afs_begin_cache_operation(struct netfs_read_request *rreq) { +#ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); + return fscache_begin_read_operation(&rreq->cache_resources, + afs_vnode_cache(vnode)); +#else + return -ENOBUFS; +#endif } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, @@ -398,6 +417,12 @@ static void afs_readahead(struct readahead_control *ractl) netfs_readahead(ractl, &afs_req_ops, NULL); } +int afs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); + return 0; +} + /* * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. @@ -480,23 +505,24 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, * release a page and clean up its private state if it's not busy * - return true if the page can now be released, false if not */ -static int afs_releasepage(struct page *page, gfp_t gfp_flags) +static int afs_releasepage(struct page *page, gfp_t gfp) { struct folio *folio = page_folio(page); struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp_flags); + gfp); /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); } + fscache_note_page_release(afs_vnode_cache(vnode)); #endif if (folio_test_private(folio)) { @@ -514,8 +540,9 @@ static void afs_add_open_mmap(struct afs_vnode *vnode) if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { down_write(&vnode->volume->cell->fs_open_mmaps_lock); - list_add_tail(&vnode->cb_mmap_link, - &vnode->volume->cell->fs_open_mmaps); + if (list_empty(&vnode->cb_mmap_link)) + list_add_tail(&vnode->cb_mmap_link, + &vnode->volume->cell->fs_open_mmaps); up_write(&vnode->volume->cell->fs_open_mmaps_lock); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 16906eb592d9..5964f8aee090 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -413,9 +413,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE struct { - u32 vnode_id; - u32 unique; - u32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + __be32 vnode_id; + __be32 unique; + __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */ } __packed key; struct afs_vnode_cache_aux aux; @@ -424,17 +424,18 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) return; } - key.vnode_id = vnode->fid.vnode; - key.unique = vnode->fid.unique; - key.vnode_id_ext[0] = vnode->fid.vnode >> 32; - key.vnode_id_ext[1] = vnode->fid.vnode_hi; - aux.data_version = vnode->status.data_version; - - 
vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - &key, sizeof(key), - &aux, sizeof(aux), - vnode, vnode->status.size, true); + key.vnode_id = htonl(vnode->fid.vnode); + key.unique = htonl(vnode->fid.unique); + key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32); + key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); + afs_set_cache_aux(vnode, &aux); + + vnode->cache = fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? 0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + vnode->status.size); #endif } @@ -563,9 +564,7 @@ static void afs_zap_data(struct afs_vnode *vnode) { _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); -#ifdef CONFIG_AFS_FSCACHE - fscache_invalidate(vnode->cache); -#endif + afs_invalidate_cache(vnode, 0); /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a @@ -762,9 +761,8 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_vnode *vnode; - - vnode = AFS_FS_I(inode); + struct afs_vnode_cache_aux aux; + struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", vnode->fid.vid, @@ -776,6 +774,9 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages_final(&inode->i_data); + + afs_set_cache_aux(vnode, &aux); + fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -786,14 +787,9 @@ void afs_evict_inode(struct inode *inode) } #ifdef CONFIG_AFS_FSCACHE - { - struct afs_vnode_cache_aux aux; - - aux.data_version = vnode->status.data_version; - fscache_relinquish_cookie(vnode->cache, &aux, - test_bit(AFS_VNODE_DELETED, &vnode->flags)); - vnode->cache = NULL; - } + fscache_relinquish_cookie(vnode->cache, + test_bit(AFS_VNODE_DELETED, &vnode->flags)); + vnode->cache = NULL; #endif afs_prune_wb_keys(vnode); @@ -833,6 +829,9 @@ static void afs_setattr_edit_file(struct afs_operation *op) if (size < i_size) truncate_pagecache(inode, size); + if (size != i_size) + fscache_resize_cookie(afs_vnode_cache(vp->vnode), + vp->scb.status.size); } } @@ -849,40 +848,67 @@ static const struct afs_operation_ops afs_setattr_operation = { int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { + const unsigned int supported = + ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct inode *inode = &vnode->vfs_inode; + loff_t i_size; int ret; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); - if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | - ATTR_TOUCH))) { + if (!(attr->ia_valid & supported)) { _leave(" = 0 [unsupported]"); return 0; } + i_size = i_size_read(inode); if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(vnode->vfs_inode.i_mode)) + if (!S_ISREG(inode->i_mode)) return -EISDIR; - ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size); + ret = inode_newsize_ok(inode, attr->ia_size); if (ret) return ret; - if (attr->ia_size == i_size_read(&vnode->vfs_inode)) + if (attr->ia_size == i_size) attr->ia_valid &= ~ATTR_SIZE; } - /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - 
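The afs_get_inode_cache() hunk above makes the same change for index keys: the fid words are serialised in a fixed byte order, and non-regular files are cached as a single blob. A condensed fragment of the key construction from the hunk:

        struct {
                __be32 vnode_id;
                __be32 unique;
                __be32 vnode_id_ext[2];         /* allow for a 96-bit key */
        } __packed key;

        key.vnode_id        = htonl(vnode->fid.vnode);
        key.unique          = htonl(vnode->fid.unique);
        key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32);
        key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi);
        /* FSCACHE_ADV_SINGLE_CHUNK marks symlinks and mountpoints to be
         * cached as one object rather than page by page. */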
filemap_write_and_wait(vnode->vfs_inode.i_mapping); + fscache_use_cookie(afs_vnode_cache(vnode), true); /* Prevent any new writebacks from starting whilst we do this. */ down_write(&vnode->validate_lock); + if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) { + loff_t size = attr->ia_size; + + /* Wait for any outstanding writes to the server to complete */ + loff_t from = min(size, i_size); + loff_t to = max(size, i_size); + ret = filemap_fdatawait_range(inode->i_mapping, from, to); + if (ret < 0) + goto out_unlock; + + /* Don't talk to the server if we're just shortening in-memory + * writes that haven't gone to the server yet. + */ + if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && + attr->ia_size < i_size && + attr->ia_size > vnode->status.size) { + truncate_pagecache(inode, attr->ia_size); + fscache_resize_cookie(afs_vnode_cache(vnode), + attr->ia_size); + i_size_write(inode, attr->ia_size); + ret = 0; + goto out_unlock; + } + } + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); @@ -907,6 +933,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, out_unlock: up_write(&vnode->validate_lock); + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index aa4c0d6c9780..b6f02321fc09 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,7 +14,6 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> -#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -364,9 +363,6 @@ struct afs_cell { struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ atomic_t ref; /* Struct refcount */ @@ -590,7 +586,7 @@ struct afs_volume { #define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ #ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ + struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ @@ -872,9 +868,24 @@ struct afs_operation { * Cache auxiliary data. */ struct afs_vnode_cache_aux { - u64 data_version; + __be64 data_version; } __packed; +static inline void afs_set_cache_aux(struct afs_vnode *vnode, + struct afs_vnode_cache_aux *aux) +{ + aux->data_version = cpu_to_be64(vnode->status.data_version); +} + +static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) +{ + struct afs_vnode_cache_aux aux; + + afs_set_cache_aux(vnode, &aux); + fscache_invalidate(afs_vnode_cache(vnode), &aux, + i_size_read(&vnode->vfs_inode), flags); +} + /* * We use folio->private to hold the amount of the folio that we've written to, * splitting the field into two parts. 
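The afs_setattr() hunk above adds a shortcut worth spelling out: if the only change is a size reduction that still lies above the server's idea of the file size (vnode->status.size), the bytes being cut exist only in local dirty pagecache, so the truncate can be done entirely locally. A condensed restatement, where pure_size_change stands in for the hunk's ia_valid mask test and is not a real variable in the patch:

        /* Shrinking, but not below what the server already has: nothing
         * the server holds becomes invalid, so skip the RPC. */
        if (pure_size_change &&
            attr->ia_size < i_size &&
            attr->ia_size > vnode->status.size) {
                truncate_pagecache(inode, attr->ia_size);
                fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size);
                i_size_write(inode, attr->ia_size);
                return 0;       /* no server round trip needed */
        }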
However, we need to represent a range @@ -962,13 +973,6 @@ extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); */ #ifdef CONFIG_AFS_FSCACHE extern struct fscache_netfs afs_cache_netfs; -extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_volume_cache_index_def; -extern struct fscache_cookie_def afs_vnode_cache_index_def; -#else -#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) #endif /* @@ -1068,6 +1072,7 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); +extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1506,7 +1511,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); -extern void afs_activate_volume(struct afs_volume *); +extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); @@ -1515,7 +1520,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ +#ifdef CONFIG_AFS_FSCACHE extern int afs_set_page_dirty(struct page *); +#else +#define afs_set_page_dirty __set_page_dirty_nobuffers +#endif extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); diff --git a/fs/afs/main.c b/fs/afs/main.c index 179004b15566..eae288c8d40a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -186,13 +186,6 @@ static int __init afs_init(void) if (!afs_lock_manager) goto error_lockmgr; -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); - if (ret < 0) - goto error_cache; -#endif - ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; @@ -215,10 +208,6 @@ error_proc: error_fs: unregister_pernet_device(&afs_net_ops); error_net: -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -error_cache: -#endif destroy_workqueue(afs_lock_manager); error_lockmgr: destroy_workqueue(afs_async_calls); @@ -245,9 +234,6 @@ static void __exit afs_exit(void) proc_remove(afs_proc_symlink); afs_fs_exit(); unregister_pernet_device(&afs_net_ops); -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -#endif destroy_workqueue(afs_lock_manager); destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); diff --git a/fs/afs/super.c b/fs/afs/super.c index d110def8aa8e..5ec9fd97eccc 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,6 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, + .write_inode = afs_write_inode, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, @@ -667,6 +668,7 @@ static void afs_i_init_once(void *_vnode) INIT_LIST_HEAD(&vnode->pending_locks); INIT_LIST_HEAD(&vnode->granted_locks); 
INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); + INIT_LIST_HEAD(&vnode->cb_mmap_link); seqlock_init(&vnode->cb_lock); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index f84194b791d3..94a3d247924b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -268,15 +268,30 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume, /* * Activate a volume. */ -void afs_activate_volume(struct afs_volume *volume) +int afs_activate_volume(struct afs_volume *volume) { #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(volume->cell->cache, - &afs_volume_cache_index_def, - &volume->vid, sizeof(volume->vid), - NULL, 0, - volume, 0, true); + struct fscache_volume *vcookie; + char *name; + + name = kasprintf(GFP_KERNEL, "afs,%s,%llx", + volume->cell->name, volume->vid); + if (!name) + return -ENOMEM; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); + } + pr_err("AFS: Cache volume key already in use (%s)\n", name); + vcookie = NULL; + } + volume->cache = vcookie; + kfree(name); #endif + return 0; } /* @@ -287,7 +302,7 @@ void afs_deactivate_volume(struct afs_volume *volume) _enter("%s", volume->name); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, NULL, + fscache_relinquish_volume(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); volume->cache = NULL; #endif diff --git a/fs/afs/write.c b/fs/afs/write.c index ca4909baf5e6..5e9157d0da29 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,17 +12,30 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> -#include <linux/fscache.h> #include "internal.h" +static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, + loff_t i_size, bool caching); + +#ifdef CONFIG_AFS_FSCACHE /* - * mark a page as having been made dirty and thus needing writeback + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. 
*/ int afs_set_page_dirty(struct page *page) { - _enter(""); - return __set_page_dirty_nobuffers(page); + return fscache_set_page_dirty(page, afs_vnode_cache(AFS_FS_I(page->mapping->host))); +} +static void afs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void afs_folio_start_fscache(bool caching, struct folio *folio) +{ } +#endif /* * prepare to perform part of a write to a page @@ -114,7 +127,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, unsigned long priv; unsigned int f, from = offset_in_folio(folio, pos); unsigned int t, to = from + copied; - loff_t i_size, maybe_i_size; + loff_t i_size, write_end_pos; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); @@ -131,15 +144,16 @@ int afs_write_end(struct file *file, struct address_space *mapping, if (copied == 0) goto out; - maybe_i_size = pos + copied; + write_end_pos = pos + copied; i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) { + if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) - afs_set_i_size(vnode, maybe_i_size); + if (write_end_pos > i_size) + afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); } if (folio_test_private(folio)) { @@ -418,6 +432,7 @@ static void afs_extend_writeback(struct address_space *mapping, loff_t start, loff_t max_len, bool new_content, + bool caching, unsigned int *_len) { struct pagevec pvec; @@ -464,7 +479,9 @@ static void afs_extend_writeback(struct address_space *mapping, folio_put(folio); break; } - if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); folio_put(folio); break; @@ -512,6 +529,7 @@ static void afs_extend_writeback(struct address_space *mapping, BUG(); if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); *_count -= folio_nr_pages(folio); folio_unlock(folio); @@ -539,6 +557,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, unsigned int offset, to, len, max_len; loff_t i_size = i_size_read(&vnode->vfs_inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; int ret; @@ -546,6 +565,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); count -= folio_nr_pages(folio); @@ -572,7 +592,8 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (len < max_len && (to == folio_size(folio) || new_content)) afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, &len); + start, max_len, new_content, + caching, &len); len = min_t(loff_t, len, max_len); } @@ -585,12 +606,19 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (start < i_size) { _debug("write back %x @%llx [%llx]", len, start, i_size); + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. 
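The hunks around this point wire the cache into afs writeback. A sketch of the resulting flow, using the helpers the patch adds (afs_write_to_cache(), afs_write_to_cache_done(), fscache_clear_page_bits()); the wrapper function itself is illustrative:

        static void example_store_region(struct afs_vnode *vnode,
                                         struct iov_iter *iter, loff_t start,
                                         size_t len, loff_t i_size, bool caching)
        {
                if (start < i_size) {
                        /* Speculative: fixed up by afs_write_to_cache_done()
                         * via afs_invalidate_cache() if the cache write fails. */
                        afs_write_to_cache(vnode, start, len, i_size, caching);
                        afs_store_data(vnode, iter, start, false);
                } else {
                        /* Dirty region entirely beyond the server EOF: nothing
                         * to store anywhere, just release PG_fscache state. */
                        fscache_clear_page_bits(afs_vnode_cache(vnode),
                                                vnode->vfs_inode.i_mapping,
                                                start, len, caching);
                }
        }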
+ */ + afs_write_to_cache(vnode, start, len, i_size, caching); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(afs_vnode_cache(vnode), + mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } @@ -649,6 +677,10 @@ int afs_writepage(struct page *subpage, struct writeback_control *wbc) _enter("{%lx},", folio_index(folio)); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + start = folio_index(folio) * PAGE_SIZE; ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc, folio, start, LLONG_MAX - start); @@ -714,10 +746,15 @@ static int afs_writepages_region(struct address_space *mapping, continue; } - if (folio_test_writeback(folio)) { + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) + if (wbc->sync_mode != WB_SYNC_NONE) { folio_wait_writeback(folio); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + } folio_put(folio); continue; } @@ -970,3 +1007,28 @@ int afs_launder_page(struct page *subpage) folio_wait_fscache(folio); return ret; } + +/* + * Deal with the completion of writing the data to the cache. + */ +static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct afs_vnode *vnode = priv; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) + afs_invalidate_cache(vnode, 0); +} + +/* + * Save the write to the cache also. + */ +static void afs_write_to_cache(struct afs_vnode *vnode, + loff_t start, size_t len, loff_t i_size, + bool caching) +{ + fscache_write_to_cache(afs_vnode_cache(vnode), + vnode->vfs_inode.i_mapping, start, len, i_size, + afs_write_to_cache_done, vnode, caching); +} @@ -181,8 +181,9 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; bool cancelled; + bool work_scheduled; + bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; @@ -1619,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work) iocb_put(iocb); } +/* + * Safely lock the waitqueue which the request is on, synchronizing with the + * case where the ->poll() provider decides to free its waitqueue early. + * + * Returns true on success, meaning that req->head->lock was locked, req->wait + * is on req->head, and an RCU read lock was taken. Returns false if the + * request was already removed from its waitqueue (which might no longer exist). + */ +static bool poll_iocb_lock_wq(struct poll_iocb *req) +{ + wait_queue_head_t *head; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us, then check whether the request is still on the queue. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. 
+ * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + head = smp_load_acquire(&req->head); + if (head) { + spin_lock(&head->lock); + if (!list_empty(&req->wait.entry)) + return true; + spin_unlock(&head->lock); + } + rcu_read_unlock(); + return false; +} + +static void poll_iocb_unlock_wq(struct poll_iocb *req) +{ + spin_unlock(&req->head->lock); + rcu_read_unlock(); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1638,14 +1684,27 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); - if (!mask && !READ_ONCE(req->cancelled)) { - add_wait_queue(req->head, &req->wait); - spin_unlock_irq(&ctx->ctx_lock); - return; - } + if (poll_iocb_lock_wq(req)) { + if (!mask && !READ_ONCE(req->cancelled)) { + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + poll_iocb_unlock_wq(req); + spin_unlock_irq(&ctx->ctx_lock); + return; + } + list_del_init(&req->wait.entry); + poll_iocb_unlock_wq(req); + } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); @@ -1657,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb) struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; - spin_lock(&req->head->lock); - WRITE_ONCE(req->cancelled, true); - if (!list_empty(&req->wait.entry)) { - list_del_init(&req->wait.entry); - schedule_work(&aiocb->poll.work); - } - spin_unlock(&req->head->lock); + if (poll_iocb_lock_wq(req)) { + WRITE_ONCE(req->cancelled, true); + if (!req->work_scheduled) { + schedule_work(&aiocb->poll.work); + req->work_scheduled = true; + } + poll_iocb_unlock_wq(req); + } /* else, the request was force-cancelled by POLLFREE already */ return 0; } @@ -1680,21 +1740,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & req->events)) return 0; - list_del_init(&req->wait.entry); - - if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Complete the request inline if possible. This requires that three + * conditions be met: + * 1. An event mask must have been passed. If a plain wakeup was done + * instead, then mask == 0 and we have to call vfs_poll() to get + * the events, so inline completion isn't possible. + * 2. The completion work must not have already been scheduled. + * 3. ctx_lock must not be busy. We have to use trylock because we + * already hold the waitqueue lock, so this inverts the normal + * locking order. Use irqsave/irqrestore because not all + * filesystems (e.g. fuse) call this function with IRQs disabled, + * yet IRQs have to be disabled before ctx_lock is obtained. + */ + if (mask && !req->work_scheduled && + spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; - /* - * Try to complete the iocb inline if we can. Use - * irqsave/irqrestore because not all filesystems (e.g. fuse) - * call this function with IRQs disabled and because IRQs - * have to be disabled before ctx_lock is obtained. 
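The helper above is one half of an acquire/release pairing; the other half is the POLLFREE handling added to aio_poll_wake() further down. A condensed sketch of the two sides (the real code also rechecks list_empty(&req->wait.entry) under the lock):

        /* Wakeup side, on POLLFREE: publish "waitqueue gone" last, with
         * release semantics, after detaching the wait entry. */
        static void example_pollfree(struct poll_iocb *req)
        {
                WRITE_ONCE(req->cancelled, true);
                list_del_init(&req->wait.entry);
                smp_store_release(&req->head, NULL);    /* must be the last step */
        }

        /* Locking side: an acquire load under RCU; a non-NULL head cannot
         * be freed before rcu_read_unlock() because users of
         * wake_up_pollfree() RCU-delay the actual free. */
        static bool example_lock_wq(struct poll_iocb *req)
        {
                wait_queue_head_t *head;

                rcu_read_lock();
                head = smp_load_acquire(&req->head);    /* pairs with the release */
                if (head) {
                        spin_lock(&head->lock);
                        return true;    /* caller: spin_unlock + rcu_read_unlock */
                }
                rcu_read_unlock();
                return false;
        }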
- */ + list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; - if (iocb->ki_eventfd && eventfd_signal_allowed()) { + if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); @@ -1703,7 +1769,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (iocb) iocb_put(iocb); } else { - schedule_work(&req->work); + /* + * Schedule the completion work if needed. If it was already + * scheduled, record that another wakeup came in. + * + * Don't remove the request from the waitqueue here, as it might + * not actually be complete yet (we won't know until vfs_poll() + * is called), and we must not miss any wakeups. POLLFREE is an + * exception to this; see below. + */ + if (req->work_scheduled) { + req->work_need_resched = true; + } else { + schedule_work(&req->work); + req->work_scheduled = true; + } + + /* + * If the waitqueue is being freed early but we can't complete + * the request inline, we have to tear down the request as best + * we can. That means immediately removing the request from its + * waitqueue and preventing all further accesses to the + * waitqueue via the request. We also need to schedule the + * completion work (done above). Also mark the request as + * cancelled, to potentially skip an unneeded call to ->poll(). + */ + if (mask & POLLFREE) { + WRITE_ONCE(req->cancelled, true); + list_del_init(&req->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&req->head, NULL); + } } return 1; } @@ -1711,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; + bool queued; int error; }; @@ -1721,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ - if (unlikely(pt->iocb->poll.head)) { + if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } + pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); @@ -1750,12 +1854,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->done = false; req->cancelled = false; + req->work_scheduled = false; + req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; + apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ @@ -1764,23 +1870,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); - if (likely(req->head)) { - spin_lock(&req->head->lock); - if (unlikely(list_empty(&req->wait.entry))) { - if (apt.error) + if (likely(apt.queued)) { + bool on_queue = poll_iocb_lock_wq(req); + + if (!on_queue || req->work_scheduled) { + /* + * aio_poll_wake() already either scheduled the async + * completion work, or completed the request inline. 
+ */ + if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { + /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { + /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!req->done) { /* actually waiting for an event */ + } else if (on_queue) { + /* + * Actually waiting for an event, so add the request to + * active_reqs so that it can be cancelled if needed. + */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); + if (on_queue) + poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); diff --git a/fs/attr.c b/fs/attr.c index 473d21b3a86d..66899b6e9bd8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -35,7 +35,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, kuid_t uid) { kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid)) + if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; @@ -62,7 +62,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, { kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, kgid))) + (in_group_p(gid) || gid_eq(gid, inode->i_gid))) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3dcf9bcc2326..4188ba3fd8c3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 309516e6a968..43c89952b7d2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. 
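The two async-thread.c hunks above form a classic flag-publication pairing; a minimal sketch under the same names (WORK_DONE_BIT, work->flags), with the barriers called out:

        /* Producer: the ordinary work function's stores must become
         * visible before the done flag does. */
        static void example_worker_side(struct btrfs_work *work)
        {
                work->func(work);               /* writes the work's results */
                smp_mb__before_atomic();        /* order those writes ... */
                set_bit(WORK_DONE_BIT, &work->flags);   /* ... before the flag */
        }

        /* Consumer: once the flag is seen, an rmb makes every store from
         * work->func() visible before the ordered function runs. */
        static bool example_ordered_side(struct btrfs_work *work)
        {
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        return false;
                smp_rmb();      /* pairs with smp_mb__before_atomic() */
                return true;    /* safe to call work->ordered_func() */
        }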
+ */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f735b8798ba1..c9ee579bc5a6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); @@ -1049,12 +1049,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_fs_info *fs_info, +static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_path *path, u64 bytenr, int info_level, struct preftrees *preftrees, struct share_check *sc) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret; int slot; struct extent_buffer *leaf; @@ -1170,6 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct ulist *roots, const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1203,28 +1204,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; - /* - * grab both a lock on the path and a lock on the delayed ref head. - * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ again: head = NULL; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen; it indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && time_seq != BTRFS_SEQ_LAST) { -#else - if (trans && time_seq != BTRFS_SEQ_LAST) { -#endif /* - * look if there are updates for this ref queued and lock the - * head + * We have a specific time_seq we care about and trans, which + * means we have the path lock; we need to grab the ref head and + * lock it so we have a consistent view of the refs at the given + * time. */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -1271,7 +1270,7 @@ again: &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(fs_info, path, bytenr, info_level, + ret = add_keyed_refs(root, path, bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1360,10 +1359,18 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However, if there was corruption we may not + * have found an eie; return an error in this + * case.
*/ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; @@ -1740,6 +1747,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags_ret) { + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); int ret; u64 flags; u64 size = 0; @@ -1755,11 +1763,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, key.objectid = logical; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1779,7 +1787,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); @@ -1962,7 +1970,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->extent_root); + trans = btrfs_attach_transaction(fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) @@ -2058,7 +2066,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u64 parent = 0; int found = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2084,10 +2091,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! 
*/ btrfs_debug(fs_root->fs_info, @@ -2143,7 +2149,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; @@ -2330,6 +2336,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc( int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct btrfs_path *path = iter->path; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -2340,7 +2347,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) key.offset = (u64)-1; iter->bytenr = bytenr; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { @@ -2364,7 +2371,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->end_ptr = (u32)(iter->item_ptr + - btrfs_item_size_nr(path->nodes[0], path->slots[0])); + btrfs_item_size(path->nodes[0], path->slots[0])); ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); @@ -2383,7 +2390,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) /* If there is no inline backref, go search for keyed backref */ if (iter->cur_ptr >= iter->end_ptr) { - ret = btrfs_next_item(fs_info->extent_root, path); + ret = btrfs_next_item(extent_root, path); /* No inline nor keyed ref */ if (ret > 0) { @@ -2404,7 +2411,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->item_ptr = iter->cur_ptr; - iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( path->nodes[0], path->slots[0])); } @@ -2427,6 +2434,7 @@ release: int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; int ret; @@ -2457,7 +2465,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } /* We're at keyed items, there is no inline item, go to the next one */ - ret = btrfs_next_item(iter->fs_info->extent_root, iter->path); + extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); + ret = btrfs_next_item(extent_root, iter->path); if (ret) return ret; @@ -2469,7 +2478,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->cur_ptr = iter->item_ptr; - iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], path->slots[0]); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 444e9c89ff3e..1db24e6d6d90 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -514,7 +514,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path 
*path; struct extent_buffer *leaf; struct btrfs_key key; @@ -529,6 +529,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) return -ENOMEM; last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); + extent_root = btrfs_extent_root(fs_info, last); #ifdef CONFIG_BTRFS_DEBUG /* @@ -841,7 +842,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - root = fs_info->extent_root; + root = btrfs_block_group_root(fs_info); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; @@ -1106,6 +1107,7 @@ out: struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -1139,8 +1141,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( num_items = 3 + map->num_stripes; free_extent_map(em); - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items); + return btrfs_start_transaction_fallback_global_rsv(root, num_items); } /* @@ -1508,7 +1509,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1585,18 +1585,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret && ret != -EAGAIN) + if (ret) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); next: + btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); - if (ret == -EAGAIN && list_empty(&bg->bg_list)) - list_add_tail(&bg->bg_list, &again_list); - else - btrfs_put_block_group(bg); } - list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -1678,7 +1674,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -2165,6 +2161,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) int btrfs_read_block_groups(struct btrfs_fs_info *info) { + struct btrfs_root *root = btrfs_block_group_root(info); struct btrfs_path *path; int ret; struct btrfs_block_group *cache; @@ -2173,7 +2170,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!info->extent_root) + if (!root) return fill_dummy_bgs(info); key.objectid = 0; @@ -2276,7 +2273,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_item bgi; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; spin_lock(&block_group->lock); @@ -2289,7 +2286,6 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, key.offset = block_group->length; spin_unlock(&block_group->lock); - root = fs_info->extent_root; return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } @@ 
-2543,12 +2539,13 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; int ret; bool dirty_bg_running; do { - trans = btrfs_join_transaction(fs_info->extent_root); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -2653,7 +2650,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; @@ -3790,7 +3787,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, } if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, + ret = btrfs_block_rsv_add(fs_info, &fs_info->chunk_block_rsv, bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) @@ -3911,9 +3908,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } - spin_unlock(&info->unused_bgs_lock); - spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->reclaim_bgs)) { block_group = list_first_entry(&info->reclaim_bgs, struct btrfs_block_group, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 04a6226e0388..b3ee49b0b1e8 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,7 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" +#include "disk-io.h" /* * HOW DO BLOCK RESERVES WORK @@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) { @@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush) { @@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - unsigned min_items; + struct btrfs_root *root, *tmp; + u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item); + unsigned int min_items = 1; /* * The global block rsv is based on the size of the extent tree, the * checksum tree and the root tree. If the fs is empty we want to set * it to a minimal amount for safety. 
+ * + * At a minimum, we are also going to need to modify the tree root and + * any global roots we could touch. */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - - /* - * We at a minimum are going to modify the csum root, the tree root, and - * the extent root. - */ - min_items = 3; + read_lock(&fs_info->global_root_lock); + rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, + rb_node) { + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || + root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + num_bytes += btrfs_root_used(&root->root_item); + min_items++; + } + } + read_unlock(&fs_info->global_root_lock); /* * But we also want to reserve enough space so we can do the fallback @@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_unlock(&sinfo->lock); } +void btrfs_init_root_block_rsv(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + switch (root->root_key.objectid) { + case BTRFS_CSUM_TREE_OBJECTID: + case BTRFS_EXTENT_TREE_OBJECTID: + case BTRFS_FREE_SPACE_TREE_OBJECTID: + root->block_rsv = &fs_info->delayed_refs_rsv; + break; + case BTRFS_ROOT_TREE_OBJECTID: + case BTRFS_DEV_TREE_OBJECTID: + case BTRFS_QUOTA_TREE_OBJECTID: + root->block_rsv = &fs_info->global_block_rsv; + break; + case BTRFS_CHUNK_TREE_OBJECTID: + root->block_rsv = &fs_info->chunk_block_rsv; + break; + default: + root->block_rsv = NULL; + break; + } +} + void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; - /* - * Our various recovery options can leave us with NULL roots, so check - * here and just bail before we go dereferencing NULLs everywhere.
- */ - if (!fs_info->extent_root || !fs_info->csum_root || - !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root) - return; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_update_global_block_rsv(fs_info); } @@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv( struct btrfs_block_rsv *block_rsv = NULL; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) + (root == fs_info->uuid_root) || + (trans->adding_csums && + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -523,7 +539,7 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0b6ae5302837..3b67ff08d434 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -50,6 +50,7 @@ struct btrfs_block_rsv { }; void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, @@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ab2a4a52e0bb..b3e46aabc3d8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,19 +138,11 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - union { - /* - * Total number of bytes pending delalloc, used by stat to - * calculate the real block usage of the file. This is used - * only for files. - */ - u64 delalloc_bytes; - /* - * The offset of the last dir item key that was logged. - * This is used only for directories. - */ - u64 last_dir_item_offset; - }; + /* + * Total number of bytes pending delalloc, used by stat to calculate the + * real block usage of the file. This is used only for files. 
+ */ + u64 delalloc_bytes; union { /* diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 32da97c3c19d..71e5b2e9a1ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -96,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws, } } -static int compression_decompress_bio(int type, struct list_head *ws, - struct compressed_bio *cb) +static int compression_decompress_bio(struct list_head *ws, + struct compressed_bio *cb) { - switch (type) { + switch (cb->compress_type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); @@ -157,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return 0; shash->tfm = fs_info->csum_shash; @@ -1359,7 +1360,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int type = cb->compress_type; workspace = get_workspace(type, 0); - ret = compression_decompress_bio(type, workspace, cb); + ret = compression_decompress_bio(workspace, cb); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c3983bdaf4b8..a7db3f6f1b7b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -463,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -485,8 +485,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } } - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -726,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, } /* - * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. + * Search for a key in the given extent_buffer. * - * the slot in the array is returned via slot, and it points to - * the place where you would insert key if it is not found in - * the array. + * The lower boundary for the search is specified by the slot number @low. Use a + * value of 0 to search over the whole extent buffer. * - * Slot may point to total number of items if the key is bigger than - * all of the keys + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise + * it points to the slot where you would insert the key. + * + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
*/ -static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, int item_size, +static noinline int generic_bin_search(struct extent_buffer *eb, int low, const struct btrfs_key *key, int *slot) { - int low = 0; + unsigned long p; + int item_size; int high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); @@ -753,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, return -EINVAL; } + if (btrfs_header_level(eb) == 0) { + p = offsetof(struct btrfs_leaf, items); + item_size = sizeof(struct btrfs_item); + } else { + p = offsetof(struct btrfs_node, ptrs); + item_size = sizeof(struct btrfs_key_ptr); + } + while (low < high) { unsigned long oip; unsigned long offset; @@ -791,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, } /* - * simple bin_search frontend that does the right thing for - * leaves vs nodes + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot) { - if (btrfs_header_level(eb) == 0) - return generic_bin_search(eb, - offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), key, slot); - else - return generic_bin_search(eb, - offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), key, slot); + return generic_bin_search(eb, 0, key, slot); } static void root_add_used(struct btrfs_root *root, u32 size) @@ -927,7 +930,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -986,7 +989,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), right, + 0, 1); free_extent_buffer_stale(right); right = NULL; } else { @@ -1031,7 +1035,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -1345,33 +1349,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level, { int i; int skip_level = level; - int no_skips = 0; - struct extent_buffer *t; + bool check_skip = true; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break; - if (!no_skips && path->slots[i] == 0) { - skip_level = i + 1; - continue; - } - if (!no_skips && path->keep_locks) { - u32 nritems; - t = path->nodes[i]; - nritems = btrfs_header_nritems(t); - if (nritems < 1 || path->slots[i] >= nritems - 1) { + + if (check_skip) { + if (path->slots[i] == 0) { skip_level = i + 1; continue; } + + if (path->keep_locks) { + u32 nritems; + + nritems = btrfs_header_nritems(path->nodes[i]); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } } - if (skip_level < i && i >= lowest_unlock) - no_skips = 1; - t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level) { - 
btrfs_tree_unlock_rw(t, path->locks[i]); + check_skip = false; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -1567,35 +1572,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; - int root_lock; + int root_lock = 0; int level = 0; - /* We try very hard to do read locks on the root */ - root_lock = BTRFS_READ_LOCK; - if (p->search_commit_root) { - /* - * The commit roots are read only so we always do read locks, - * and we always must hold the commit_root_sem when doing - * searches on them, the only exception is send where we don't - * want to block transaction commits for a long time, so - * we need to clone the commit root in order to avoid races - * with transaction commits that create a snapshot of one of - * the roots used by a send operation. - */ - if (p->need_commit_sem) { - down_read(&fs_info->commit_root_sem); - b = btrfs_clone_extent_buffer(root->commit_root); - up_read(&fs_info->commit_root_sem); - if (!b) - return ERR_PTR(-ENOMEM); - - } else { - b = root->commit_root; - atomic_inc(&b->refs); - } + b = root->commit_root; + atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1612,6 +1595,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, goto out; } + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + /* * If the level is set to maximum, we can skip trying to get the read * lock. @@ -1638,6 +1624,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid, return an error in this case. + */ + if (!extent_buffer_uptodate(b)) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + p->nodes[level] = b; if (!p->skip_locking) p->locks[level] = root_lock; @@ -1647,6 +1644,191 @@ out: return b; } +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. 
+ */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} + +static inline int search_for_key_slot(struct extent_buffer *eb, + int search_low_slot, + const struct btrfs_key *key, + int prev_cmp, + int *slot) +{ + /* + * If a previous call to btrfs_bin_search() on a parent node returned an + * exact match (prev_cmp == 0), we can safely assume the target key will + * always be at slot 0 on lower levels, since each key pointer + * (struct btrfs_key_ptr) refers to the lowest key accessible from the + * subtree it points to. Thus we can skip searching lower levels. + */ + if (prev_cmp == 0) { + *slot = 0; + return 0; + } + + return generic_bin_search(eb, search_low_slot, key, slot); +} + +static int search_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *path, + int ins_len, + int prev_cmp) +{ + struct extent_buffer *leaf = path->nodes[0]; + int leaf_free_space = -1; + int search_low_slot = 0; + int ret; + bool do_bin_search = true; + + /* + * If we are doing an insertion, the leaf has enough free space, and the + * destination slot for the key is not slot 0, then we can unlock our + * write lock on the parent, and any other upper nodes, before doing the + * binary search on the leaf (with search_for_key_slot()), allowing other + * tasks to lock the parent and any other upper nodes. + */ + if (ins_len > 0) { + /* + * Cache the leaf free space, since we will need it later and it + * will not change until then. + */ + leaf_free_space = btrfs_leaf_free_space(leaf); + + /* + * !path->locks[1] means we have a single node tree; the leaf is + * the root of the tree. + */ + if (path->locks[1] && leaf_free_space >= ins_len) { + struct btrfs_disk_key first_key; + + ASSERT(btrfs_header_nritems(leaf) > 0); + btrfs_item_key(leaf, &first_key, 0); + + /* + * Doing the extra comparison with the first key is cheap, + * taking into account that the first key is very likely + * already in a cache line because it immediately follows + * the extent buffer's header and we have recently accessed + * the header's level field. + */ + ret = comp_keys(&first_key, key); + if (ret < 0) { + /* + * The first key is smaller than the key we want + * to insert, so we are safe to unlock all upper + * nodes and we have to do the binary search. + * + * We do use btrfs_unlock_up_safe() and not + * unlock_up() because the latter does not unlock + * nodes with a slot of 0 - we can safely unlock + * any node even if its slot is 0 since in this + * case the key does not end up at slot 0 of the + * leaf and there's no need to split the leaf. + */ + btrfs_unlock_up_safe(path, 1); + search_low_slot = 1; + } else { + /* + * The first key is >= the key we want to + * insert, so we can skip the binary search as + * the target key will be at slot 0. + * + * We cannot unlock upper nodes when the key is + * less than the first key, because we will need + * to update the key at slot 0 of the parent node + * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can + * unlock all the upper nodes, using + * btrfs_unlock_up_safe() instead of unlock_up() + * as stated above. + */ + if (ret == 0) + btrfs_unlock_up_safe(path, 1); + /* + * ret is already 0 or 1, matching the result of + * a btrfs_bin_search() call, so there is no need + * to adjust it. + */ + do_bin_search = false; + path->slots[0] = 0; + } + } + + if (do_bin_search) { + ret = search_for_key_slot(leaf, search_low_slot, key, + prev_cmp, &path->slots[0]); + if (ret < 0) + return ret; + } + + if (ins_len > 0) { + /* + * Item key already exists. In this case, if we are allowed to + * insert the item (for example, in dir_item case, item key + * collision is allowed), it will be merged with the original + * item. Only the item size grows; no new btrfs item will be + * added. If search_for_extension is not set, ins_len already + * accounts for the size of struct btrfs_item; deduct it here so + * the leaf space check will be correct. + */ + if (ret == 0 && !path->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } + + ASSERT(leaf_free_space >= 0); + + if (leaf_free_space < ins_len) { + int err; + + err = split_leaf(trans, root, key, path, ins_len, + (ret == 0)); + ASSERT(err <= 0); + if (WARN_ON(err > 0)) + err = -EUCLEAN; + if (err) + ret = err; + } + } + + return ret; +} /* * btrfs_search_slot - look for a key in a tree and perform necessary @@ -1683,6 +1865,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -1724,6 +1907,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, min_write_lock_level = write_lock_level; + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + down_read(&fs_info->commit_root_sem); + } + again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); @@ -1777,10 +1965,6 @@ again: } cow_done: p->nodes[level] = b; - /* - * Leave path with blocking locks to avoid massive - * lock context switch, this is made on purpose. - */ /* * we have a lock on b and as long as we aren't changing @@ -1802,62 +1986,22 @@ cow_done: } } - /* - * If btrfs_bin_search returns an exact match (prev_cmp == 0) - * we can safely assume the target key will always be in slot 0 - * on lower levels due to the invariants BTRFS' btree provides, - * namely that a btrfs_key_ptr entry always points to the - * lowest key in the child node, thus we can skip searching - * lower levels - */ - if (prev_cmp == 0) { - slot = 0; - ret = 0; - } else { - ret = btrfs_bin_search(b, key, &slot); - prev_cmp = ret; - if (ret < 0) - goto done; - } - if (level == 0) { - p->slots[level] = slot; - /* - * Item key already exists. In this case, if we are - * allowed to insert the item (for example, in dir_item - * case, item key collision is allowed), it will be - * merged with the original item. Only the item size - * grows, no new btrfs item will be added. If - * search_for_extension is not set, ins_len already - * accounts the size btrfs_item, deduct it here so leaf - * space check will be correct.
- */ - if (ret == 0 && ins_len > 0 && !p->search_for_extension) { - ASSERT(ins_len >= sizeof(struct btrfs_item)); - ins_len -= sizeof(struct btrfs_item); - } - if (ins_len > 0 && - btrfs_leaf_free_space(b) < ins_len) { - if (write_lock_level < 1) { - write_lock_level = 1; - btrfs_release_path(p); - goto again; - } + if (ins_len > 0) + ASSERT(write_lock_level >= 1); - err = split_leaf(trans, root, key, - p, ins_len, ret == 0); - - BUG_ON(err > 0); - if (err) { - ret = err; - goto done; - } - } + ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); if (!p->search_for_split) unlock_up(p, level, lowest_unlock, min_write_lock_level, NULL); goto done; } + + ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); + if (ret < 0) + goto done; + prev_cmp = ret; + if (ret && slot > 0) { dec = 1; slot--; @@ -1918,6 +2062,16 @@ cow_done: done: if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); @@ -2615,19 +2769,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { - struct btrfs_item *start_item; - struct btrfs_item *end_item; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - start_item = btrfs_item_nr(start); - end_item = btrfs_item_nr(end); - data_len = btrfs_item_offset(l, start_item) + - btrfs_item_size(l, start_item); - data_len = data_len - btrfs_item_offset(l, end_item); + data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); + data_len = data_len - btrfs_item_offset(l, end); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -2674,7 +2823,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, u32 i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 nr; u32 right_nritems; u32 data_end; @@ -2691,8 +2839,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] > i) break; @@ -2707,12 +2853,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(left, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(left, i); + if (this_item_size + sizeof(struct btrfs_item) + + push_space > free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); if (i == 0) break; i--; @@ -2726,7 +2873,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* push left to right */ right_nritems = btrfs_header_nritems(right); - push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space = btrfs_item_data_end(left, left_nritems - push_items); push_space -= leaf_data_end(left); /* make room in the right data area */ @@ -2757,9 +2904,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(i); - push_space -= btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, 
item, push_space); + push_space -= btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } left_nritems -= push_items; @@ -2904,7 +3050,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, int i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 old_left_nritems; u32 nr; int ret = 0; @@ -2918,8 +3063,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] < i) break; @@ -2934,12 +3077,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(right, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(right, i); + if (this_item_size + sizeof(struct btrfs_item) + push_space > + free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); } if (push_items == 0) { @@ -2955,25 +3099,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - - btrfs_item_offset_nr(right, push_items - 1); + btrfs_item_offset(right, push_items - 1); copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left) - push_space, BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset_nr(right, push_items - 1), + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); btrfs_init_map_token(&token, left); - old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(i); - - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2984,7 +3126,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, right_nritems); if (push_items < right_nritems) { - push_space = btrfs_item_offset_nr(right, push_items - 1) - + push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, @@ -3002,10 +3144,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(i); - - push_space = push_space - btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, item, push_space); + push_space = push_space - btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } btrfs_mark_buffer_dirty(left); @@ -3133,7 +3273,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); + data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 
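/* Editorial note on the line above, derived from the visible code: data_copy_size is the number of bytes of item data belonging to the items that move to the new right leaf; since item data grows downwards from the end of a leaf, it is the end of item 'mid's data region minus the lowest data offset in use (leaf_data_end). The two copy_extent_buffer() calls that follow copy the item headers to the start of the right leaf and the item data to the tail of the right leaf's data area. */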
copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), @@ -3144,15 +3284,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(l), data_copy_size); - rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -3268,7 +3407,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - if (extend && data_size + btrfs_item_size_nr(l, slot) + + if (extend && data_size + btrfs_item_size(l, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; @@ -3437,7 +3576,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3457,7 +3596,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, ret = -EAGAIN; leaf = path->nodes[0]; /* if our item isn't there, return now */ - if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) + if (item_size != btrfs_item_size(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. 
return now */ @@ -3488,9 +3627,7 @@ static noinline int split_item(struct btrfs_path *path, unsigned long split_offset) { struct extent_buffer *leaf; - struct btrfs_item *item; - struct btrfs_item *new_item; - int slot; + int orig_slot, slot; char *buf; u32 nritems; u32 item_size; @@ -3500,9 +3637,9 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - item = btrfs_item_nr(path->slots[0]); - orig_offset = btrfs_item_offset(leaf, item); - item_size = btrfs_item_size(leaf, item); + orig_slot = path->slots[0]; + orig_offset = btrfs_item_offset(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); buf = kmalloc(item_size, GFP_NOFS); if (!buf) @@ -3523,14 +3660,12 @@ static noinline int split_item(struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(slot); + btrfs_set_item_offset(leaf, slot, orig_offset); + btrfs_set_item_size(leaf, slot, item_size - split_offset); - btrfs_set_item_offset(leaf, new_item, orig_offset); - btrfs_set_item_size(leaf, new_item, item_size - split_offset); - - btrfs_set_item_offset(leaf, item, - orig_offset + item_size - split_offset); - btrfs_set_item_size(leaf, item, split_offset); + btrfs_set_item_offset(leaf, orig_slot, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, orig_slot, split_offset); btrfs_set_header_nritems(leaf, nritems + 1); @@ -3591,7 +3726,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data_start; @@ -3603,14 +3737,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) leaf = path->nodes[0]; slot = path->slots[0]; - old_size = btrfs_item_size_nr(leaf, slot); + old_size = btrfs_item_size(leaf, slot); if (old_size == new_size) return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - old_data_start = btrfs_item_offset_nr(leaf, slot); + old_data_start = btrfs_item_offset(leaf, slot); size_diff = old_size - new_size; @@ -3624,10 +3758,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + size_diff); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + size_diff); } /* shift the data */ @@ -3670,8 +3803,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) fixup_low_keys(path, &disk_key, 1); } - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, new_size); + btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3687,7 +3819,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data; @@ -3705,7 +3836,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) BUG(); } slot = path->slots[0]; - old_data = btrfs_item_end_nr(leaf, slot); + old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); if (slot >= nritems) { @@ -3722,10 +3853,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 
btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff - data_size); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - data_size); } /* shift the data */ @@ -3734,9 +3864,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end, old_data - data_end); data_end = old_data; - old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, old_size + data_size); + old_size = btrfs_item_size(leaf, slot); + btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3758,7 +3887,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; @@ -3795,7 +3923,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p btrfs_init_map_token(&token, leaf); if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); + unsigned int old_data = btrfs_item_data_end(leaf, slot); if (old_data < data_end) { btrfs_print_leaf(leaf); @@ -3811,10 +3939,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, - ioff - batch->total_data_size); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, + ioff - batch->total_data_size); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), @@ -3833,10 +3960,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p for (i = 0; i < batch->nr; i++) { btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(slot + i); data_end -= batch->data_sizes[i]; - btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); + btrfs_set_token_item_offset(&token, slot + i, data_end); + btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); } btrfs_set_header_nritems(leaf, nritems + batch->nr); @@ -3943,7 +4069,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, u32 item_size; leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ret = setup_leaf_for_split(trans, root, path, item_size + sizeof(struct btrfs_item)); if (ret) @@ -4032,7 +4158,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); } /* @@ -4044,7 +4170,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - struct btrfs_item *item; u32 last_off; u32 dsize = 0; int ret = 0; @@ -4053,10 +4178,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset_nr(leaf, slot + 
nr - 1); + last_off = btrfs_item_offset(leaf, slot + nr - 1); for (i = 0; i < nr; i++) - dsize += btrfs_item_size_nr(leaf, slot + i); + dsize += btrfs_item_size(leaf, slot + i); nritems = btrfs_header_nritems(leaf); @@ -4073,9 +4198,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + dsize); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + dsize); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -4402,7 +4526,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; int i; @@ -4419,14 +4545,20 @@ again: path->keep_locks = 1; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + down_read(&fs_info->commit_root_sem); + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } path->keep_locks = 0; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* @@ -4549,6 +4681,15 @@ again: ret = 0; done: unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7553e9dc5f93..b4a9b1c58d22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,6 +143,8 @@ enum { BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -511,11 +513,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ @@ -553,7 +550,6 @@ struct btrfs_swapfile_pin { bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); enum { - BTRFS_FS_BARRIER, BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, BTRFS_FS_LOG_RECOVERING, @@ -576,7 +572,6 @@ enum { /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. - * Set, tested and cleared while holding fs_info::send_reloc_lock. */ BTRFS_FS_RELOC_RUNNING, @@ -601,6 +596,9 @@ enum { /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, + /* Indicate that we want the transaction kthread to commit right now. 
*/ + BTRFS_FS_COMMIT_TRANS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -613,6 +611,7 @@ enum { */ enum btrfs_exclusive_operation { BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, BTRFS_EXCLOP_BALANCE, BTRFS_EXCLOP_DEV_ADD, BTRFS_EXCLOP_DEV_REMOVE, @@ -624,20 +623,21 @@ enum btrfs_exclusive_operation { struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; - struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; - struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; - struct btrfs_root *free_space_root; struct btrfs_root *data_reloc_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; @@ -673,6 +673,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* @@ -815,7 +821,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; - struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -952,13 +957,6 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* readahead tree */ - spinlock_t reada_lock; - struct radix_tree_root reada_tree; - - /* readahead works cnt */ - atomic_t reada_works_cnt; - /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ @@ -1003,13 +1001,6 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; - spinlock_t send_reloc_lock; - /* - * Number of send operations in progress. - * Updated while holding fs_info::send_reloc_lock. - */ - int send_in_progress; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; @@ -1110,6 +1101,8 @@ enum { BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, + /* We started the orphan cleanup for this root. */ + BTRFS_ROOT_ORPHAN_CLEANUP, }; /* @@ -1128,6 +1121,8 @@ struct btrfs_qgroup_swapped_blocks { * and for the extent tree extent_root root. 
*/ struct btrfs_root { + struct rb_node rb_node; + struct extent_buffer *node; struct extent_buffer *commit_root; @@ -1178,8 +1173,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - int orphan_cleanup_state; - spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; @@ -1960,8 +1953,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, } /* struct btrfs_item */ -BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); @@ -1976,25 +1969,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline u32 btrfs_item_end(const struct extent_buffer *eb, - struct btrfs_item *item) -{ - return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ + int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ } -static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_end(eb, btrfs_item_nr(nr)); -} - -static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, btrfs_item_nr(nr)); -} +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); -static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(nr)); + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); } static inline void btrfs_item_key(const struct extent_buffer *eb, @@ -2257,6 +2261,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2458,7 +2467,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset_nr(leaf, nr - 1); + return btrfs_item_offset(leaf, nr - 1); } /* struct btrfs_file_extent_item */ @@ -2517,9 +2526,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, */ static inline u32 btrfs_file_extent_inline_item_len( const struct extent_buffer *eb, - struct btrfs_item *e) + int nr) { - 
return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* btrfs_qgroup_status_item */ @@ -2611,11 +2620,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { @@ -2719,7 +2728,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 empty_size, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -3114,36 +3123,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); -/* inode-item.c */ -int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index); -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - struct btrfs_key *location, int mod); - -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); - -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); -struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -3203,10 +3182,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, u64 new_size, - u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3305,6 +3280,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3821,23 +3799,6 @@ static inline void 
btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) btrfs_bio_counter_sub(fs_info, 1); } -/* reada.c */ -struct reada_control { - struct btrfs_fs_info *fs_info; /* tree to prefetch */ - struct btrfs_key key_start; - struct btrfs_key key_end; /* exclusive */ - atomic_t elems; - struct kref refcnt; - wait_queue_head_t wait; -}; -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *start, struct btrfs_key *end); -int btrfs_reada_wait(void *handle); -void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct extent_buffer *eb, int err); -void btrfs_reada_remove_dev(struct btrfs_device *dev); -void btrfs_reada_undo_remove_dev(struct btrfs_device *dev); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2059d1504149..fb46a28f5065 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -143,10 +143,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) + if (ret < 0) { btrfs_free_reserved_data_space_noquota(fs_info, len); - else + extent_changeset_free(*reserved); + *reserved = NULL; + } else { ret = 0; + } return ret; } @@ -331,7 +334,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; @@ -452,8 +455,11 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len); - if (ret < 0) + if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); + extent_changeset_free(*reserved); + *reserved = NULL; + } return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e164766dcc38..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -13,6 +13,7 @@ #include "ctree.h" #include "qgroup.h" #include "locking.h" +#include "inode-item.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -629,7 +630,7 @@ static int btrfs_delayed_inode_reserve_metadata( BTRFS_QGROUP_RSV_META_PREALLOC, true); if (ret < 0) return ret; - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* NO_FLUSH could only fail with -ENOSPC */ ASSERT(ret == 0 || ret == -ENOSPC); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cca7e85e32dd..4176df149d04 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. 
+ */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", @@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) num_bytes = btrfs_calc_insert_metadata_size(fs_info, trans->delayed_ref_updates); + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->full = 0; @@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c85a7d44da79..62b9651ea662 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found: } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { @@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; @@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* * need to delete old one and insert a new one. 
* Since no attempt is made to recover any old state, if the @@ -906,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - if (!scrub_ret) - btrfs_reada_remove_dev(src_device); - /* * We have to use this loop approach because at this point src_device * has to be available for transaction commit to complete, yet new @@ -917,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_reada_undo_remove_dev(src_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -968,7 +964,6 @@ error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_reada_undo_remove_dev(src_device); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 7721ce0c0604..3b532bab0755 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; - struct btrfs_item *item; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); @@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); - BUG_ON(data_size > btrfs_item_size(leaf, item)); - ptr += btrfs_item_size(leaf, item) - data_size; + ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); + ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } @@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; - if (data_size + btrfs_item_size_nr(leaf, slot) + + if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { @@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - total_len = btrfs_item_size_nr(leaf, path->slots[0]); + total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + @@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); - item_len = btrfs_item_size_nr(leaf, path->slots[0]); + item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59c3be8c1f4c..87a5addbedf6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, if (ret < 0) goto err; - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - set_extent_buffer_uptodate(eb); free_extent_buffer(eb); @@ -715,10 +712,6 @@ int 
btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, } ret = validate_extent_buffer(eb); err: - if (reads_done && - test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - if (ret) { /* * our io error hook is going to dec the io pages @@ -1140,11 +1133,16 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; + root->root_key.objectid = objectid; root->node = NULL; root->commit_root = NULL; root->state = 0; - root->orphan_cleanup_state = 0; + RB_CLEAR_NODE(&root->rb_node); root->last_trans = 0; root->free_objectid = 0; @@ -1152,7 +1150,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); - root->block_rsv = NULL; + + btrfs_init_root_block_rsv(root); INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1190,6 +1189,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; + root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); @@ -1197,12 +1197,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, IO_TREE_LOG_CSUM_RANGE, NULL); } - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - root->root_key.objectid = objectid; - root->anon_dev = 0; - spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG @@ -1242,6 +1236,81 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) } #endif +static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) +{ + const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); + const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); +} + +static int global_root_key_cmp(const void *k, const struct rb_node *node) +{ + const struct btrfs_key *key = k; + const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(key, &root->root_key); +} + +int btrfs_global_root_insert(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *tmp; + + write_lock(&fs_info->global_root_lock); + tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); + write_unlock(&fs_info->global_root_lock); + ASSERT(!tmp); + + return tmp ? 
-EEXIST : 0; +} + +void btrfs_global_root_delete(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + write_lock(&fs_info->global_root_lock); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + write_unlock(&fs_info->global_root_lock); +} + +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key) +{ + struct rb_node *node; + struct btrfs_root *root = NULL; + + read_lock(&fs_info->global_root_lock); + node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); + if (node) + root = container_of(node, struct btrfs_root, rb_node); + read_unlock(&fs_info->global_root_lock); + + return root; +} + +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_CSUM_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_EXTENT_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -1554,25 +1623,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, u64 objectid) { + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + if (objectid == BTRFS_ROOT_TREE_OBJECTID) return btrfs_grab_root(fs_info->tree_root); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_CHUNK_TREE_OBJECTID) return btrfs_grab_root(fs_info->chunk_root); if (objectid == BTRFS_DEV_TREE_OBJECTID) return btrfs_grab_root(fs_info->dev_root); if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? - fs_info->free_space_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + struct btrfs_root *root = btrfs_global_root(fs_info, &key); + + return btrfs_grab_root(root) ? 
root : ERR_PTR(-ENOENT); + } return NULL; } @@ -1619,6 +1696,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) #endif } +static void free_global_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct rb_node *node; + + while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { + root = rb_entry(node, struct btrfs_root, rb_node); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + btrfs_put_root(root); + } +} + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); @@ -1630,14 +1719,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - btrfs_put_root(fs_info->extent_root); + free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); btrfs_put_root(fs_info->dev_root); - btrfs_put_root(fs_info->csum_root); btrfs_put_root(fs_info->quota_root); btrfs_put_root(fs_info->uuid_root); - btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_check_leaked_roots(fs_info); @@ -1732,6 +1819,14 @@ again: } return root; fail: + /* + * If our caller provided us an anonymous device, then it's their + * responsibility to free it in case we fail. So we have to set our + * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() + * and once again by our caller. + */ + if (anon_dev) + root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); } @@ -1927,7 +2022,8 @@ static int transaction_kthread(void *arg) } delta = ktime_get_seconds() - cur->start_time; - if (cur->state < TRANS_STATE_COMMIT_START && + if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && + cur->state < TRANS_STATE_COMMIT_START && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -1999,6 +2095,8 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2023,11 +2121,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root(root_backup, extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(info->extent_root->node)); + btrfs_header_generation(extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(info->extent_root->node)); + btrfs_header_level(extent_root->node)); /* * we might commit during log recovery, which happens before we set @@ -2048,11 +2146,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(info->csum_root->node)); + btrfs_header_generation(csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, -
btrfs_header_level(csum_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); @@ -2127,7 +2225,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); - btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); if (fs_info->discard_ctl.discard_workers) @@ -2151,21 +2248,29 @@ static void free_root_extent_buffers(struct btrfs_root *root) } } +static void free_global_root_pointers(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + rbtree_postorder_for_each_entry_safe(root, tmp, + &fs_info->global_root_tree, + rb_node) + free_root_extent_buffers(root); +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); + free_global_root_pointers(info); free_root_extent_buffers(info->dev_root); - free_root_extent_buffers(info->extent_root); - free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); - free_root_extent_buffers(info->free_space_root); } void btrfs_put_root(struct btrfs_root *root) @@ -2283,8 +2388,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->qgroup_rescan_lock); } -static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices) +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; @@ -2333,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue(fs_info, "readahead", flags, - max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->discard_ctl.discard_workers = @@ -2347,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers && + fs_info->caching_workers && fs_info->fixup_workers && + fs_info->delayed_workers && fs_info->qgroup_rescan_workers && fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2427,6 +2527,104 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return 0; } +static int load_global_roots_objectid(struct btrfs_root *tree_root, + struct btrfs_path *path, u64 objectid, + const char *name) +{ + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *root; + int ret; + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + bool found = false; + + /* If we have IGNOREDATACSUMS skip loading these roots. 
*/ + if (objectid == BTRFS_CSUM_TREE_OBJECTID && + btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + return 0; + } + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + break; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(tree_root, path); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + } + ret = 0; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != objectid) + break; + btrfs_release_path(path); + + found = true; + root = read_tree_root_path(tree_root, path, &key); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = PTR_ERR(root); + break; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + ret = btrfs_global_root_insert(root); + if (ret) { + btrfs_put_root(root); + break; + } + key.offset++; + } + btrfs_release_path(path); + + if (!found || ret) { + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = ret ? ret : -ENOENT; + else + ret = 0; + btrfs_err(fs_info, "failed to load root %s", name); + } + return ret; +} + +static int load_global_roots(struct btrfs_root *tree_root) +{ + struct btrfs_path *path; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = load_global_roots_objectid(tree_root, path, + BTRFS_EXTENT_TREE_OBJECTID, "extent"); + if (ret) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_CSUM_TREE_OBJECTID, "csum"); + if (ret) + goto out; + if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); +out: + btrfs_free_path(path); + return ret; +} + static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; @@ -2436,7 +2634,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) BUG_ON(!fs_info->tree_root); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + ret = load_global_roots(tree_root); + if (ret) + return ret; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; @@ -2448,38 +2650,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; - } - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ btrfs_init_devices_late(fs_info); - /* If IGNOREDATACSUMS is set don't bother reading the csum root. 
*/ - if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; - } - } - /* * This tree can share blocks with some other fs tree during relocation * and we need a proper setup by btrfs_get_fs_root @@ -2517,20 +2692,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; - } - } - return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2850,6 +3011,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { @@ -2885,6 +3047,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->zone_active_bgs_lock); spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); + rwlock_init(&fs_info->global_root_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); @@ -2916,9 +3079,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->global_root_tree = RB_ROOT; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2926,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ - /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - spin_lock_init(&fs_info->reada_lock); btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, @@ -2950,7 +3110,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); - set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2985,9 +3144,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; - spin_lock_init(&fs_info->send_reloc_lock); - fs_info->send_in_progress = 0; - fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } @@ -3415,7 +3571,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->subpage_info = subpage_info; } - ret = btrfs_init_workqueues(fs_info, fs_devices); + ret = btrfs_init_workqueues(fs_info); if (ret) { 
err = ret; goto fail_sb_buffer; @@ -3563,6 +3719,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_free_zone_cache(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, @@ -3978,11 +4136,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. + */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; @@ -4313,6 +4483,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) return btrfs_commit_transaction(trans); } +static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *trans; + struct btrfs_transaction *tmp; + bool found = false; + + if (list_empty(&fs_info->trans_list)) + return; + + /* + * This function is only called at the very end of close_ctree(), + * thus no other running transaction, no need to take trans_lock. + */ + ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); + list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { + struct extent_state *cached = NULL; + u64 dirty_bytes = 0; + u64 cur = 0; + u64 found_start; + u64 found_end; + + found = true; + while (!find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, EXTENT_DIRTY, &cached)) { + dirty_bytes += found_end + 1 - found_start; + cur = found_end + 1; + } + btrfs_warn(fs_info, + "transaction %llu (with %llu dirty metadata bytes) is not committed", + trans->transid, dirty_bytes); + btrfs_cleanup_one_transaction(trans, fs_info); + + if (trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + list_del_init(&trans->list); + + btrfs_put_transaction(trans); + trace_btrfs_transaction_commit(fs_info); + } + ASSERT(!found); +} + void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; @@ -4421,7 +4633,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_stop_all_workers(fs_info); /* We shouldn't have any transaction open at this point */ - ASSERT(list_empty(&fs_info->trans_list)); + warn_about_uncommitted_trans(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); @@ -4969,7 +5181,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(t); - trace_btrfs_transaction_commit(fs_info->tree_root); + trace_btrfs_transaction_commit(fs_info); spin_lock(&fs_info->trans_lock); } spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a2b5db4ba262..5e8bef4b7563 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -71,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct 
btrfs_fs_info *fs_info, struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); +int btrfs_global_root_insert(struct btrfs_root *root); +void btrfs_global_root_delete(struct btrfs_root *root); +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key); +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -103,6 +109,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } +static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + return btrfs_extent_root(fs_info, 0); +} + void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3fd736a02c1e..d89273c4b6b8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { + struct btrfs_root *root = btrfs_extent_root(fs_info, start); int ret; struct btrfs_key key; struct btrfs_path *path; @@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) key.objectid = start; key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); btrfs_free_path(path); return ret; } @@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags) { + struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; @@ -153,7 +155,8 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + extent_root = btrfs_extent_root(fs_info, bytenr); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -171,7 +174,7 @@ search_again: if (ret == 0) { leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (item_size >= sizeof(*ei)) { ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -593,6 +596,7 @@ fail: } static noinline int 
remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop, int *last_ref) { @@ -626,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) @@ -685,7 +689,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -709,6 +713,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -721,8 +726,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, - path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } @@ -787,7 +791,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -865,7 +869,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; btrfs_print_v0_err(fs_info); @@ -1007,7 +1011,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); @@ -1119,7 +1123,7 @@ void update_inline_extent_backref(struct btrfs_path *path, } else { *last_ref = 1; size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) @@ -1174,6 +1178,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data, int *last_ref) @@ -1185,11 +1190,11 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, update_inline_extent_backref(path, iref, -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, path, refs_to_drop, + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, last_ref); } else { *last_ref = 1; - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); } return ret; } @@ -1572,6 +1577,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op 
*extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -1601,8 +1607,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.offset = head->num_bytes; } + root = btrfs_extent_root(fs_info, key.objectid); again: - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { err = ret; goto out; @@ -1634,7 +1641,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; @@ -1844,8 +1851,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) { btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info->csum_root, - head->bytenr, head->num_bytes); + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(fs_info, head->bytenr); + ret = btrfs_del_csums(trans, csum_root, head->bytenr, + head->num_bytes); } } @@ -2285,7 +2295,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; @@ -2316,7 +2326,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); /* If extent item has more than 1 inline ref then it's shared */ @@ -2920,7 +2930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -2936,6 +2946,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + extent_root = btrfs_extent_root(info, bytenr); + ASSERT(extent_root); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2996,9 +3009,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto err_dump; } /* Must be SHARED_* item, remove the backref first */ - ret = remove_extent_backref(trans, path, NULL, - refs_to_drop, - is_data, &last_ref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, is_data, + &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3068,7 +3081,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); + item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EINVAL; btrfs_print_v0_err(info); @@ -3122,8 +3135,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, path, iref, - refs_to_drop, is_data, + ret = remove_extent_backref(trans, extent_root, path, + iref, 
refs_to_drop, is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3179,7 +3192,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info->csum_root, bytenr, + struct btrfs_root *csum_root; + csum_root = btrfs_csum_root(info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,20 +3290,20 @@ out_delayed_unlock: } void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid, 0, false); + root_id, 0, false); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -3298,7 +3313,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; bool must_pin = false; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) { btrfs_redirty_list_add(trans->transaction, buf); @@ -3790,23 +3805,35 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, spin_unlock(&fs_info->relocation_bg_lock); if (skip) return 1; + /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); if (block_group->ro || block_group->alloc_offset == block_group->zone_capacity) { - spin_unlock(&block_group->lock); - return 1; + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ } spin_unlock(&block_group->lock); - if (!btrfs_zone_activate(block_group)) - return 1; + if (!ret && !btrfs_zone_activate(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); spin_lock(&fs_info->relocation_bg_lock); + if (ret) + goto out; + ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); @@ -3947,6 +3974,28 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } +static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return true; + case BTRFS_EXTENT_ALLOC_ZONED: + /* + * If we have enough free space left in an already + * active block group and we can't activate any other + * zone now, do not allow allocating a new chunk and + * let find_free_extent() retry with a smaller size. 
+ */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return false; + return true; + default: + BUG(); + } +} + static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { @@ -3975,7 +4024,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, bool full_search) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && @@ -3987,18 +4036,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 0; } - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) { - /* - * If we have enough free space left in an already active block - * group and we can't activate any other zone now, retry the - * active ones with a smaller allocation size. Returning early - * from here will tell btrfs_reserve_extent() to haven the - * size. - */ - return -ENOSPC; - } - if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) return 1; @@ -4034,6 +4071,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; int exist = 0; + /* Check if allocation policy allows to create a new chunk */ + if (!can_allocate_chunk(fs_info, ffe_ctl)) + return -ENOSPC; + trans = current->journal_info; if (trans) exist = 1; @@ -4570,6 +4611,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins, int ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; @@ -4589,8 +4631,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + extent_root = btrfs_extent_root(fs_info, ins->objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; @@ -4642,6 +4684,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; @@ -4673,8 +4716,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - &extent_key, size); + extent_root = btrfs_extent_root(fs_info, extent_key.objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, + size); if (ret) { btrfs_free_path(path); return ret; @@ -5472,7 +5516,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, goto owner_mismatch; } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6051,6 +6096,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int dev_ret = 0; int ret = 0; + if (range->start == U64_MAX) + return -EINVAL; + /* * Check range overflow if range->len is set. * The default range->len is U64_MAX.
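The fs/btrfs/extent-tree.c hunks above all apply one conversion: direct uses of fs_info->extent_root and fs_info->csum_root become keyed lookups through the new global root rb-tree. A minimal sketch of the resulting call pattern follows (example_del_extent_csums is a hypothetical helper for illustration, not part of this series; the bytenr argument is currently ignored by btrfs_csum_root() but anticipates multiple global roots per filesystem):

static int example_del_extent_csums(struct btrfs_trans_handle *trans,
				    u64 bytenr, u64 num_bytes)
{
	/* Look up the csum root covering this byte range. */
	struct btrfs_root *csum_root = btrfs_csum_root(trans->fs_info, bytenr);

	/*
	 * Under rescue=ignoredatacsums no csum root is loaded at mount
	 * (BTRFS_FS_STATE_NO_CSUMS), so the lookup can return NULL.
	 */
	if (!csum_root)
		return -ENOENT;

	return btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
}

The lookup itself is the rb_find() on fs_info->global_root_tree shown in disk-io.c, taken under global_root_lock, so call sites written this way stay unchanged once several extent or csum roots exist.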
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4e03a6d3aa32..d6d48ecf823c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2314,8 +2314,8 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); - if (btrfs_is_zoned(fs_info)) - return btrfs_repair_one_zone(fs_info, logical); + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; @@ -3087,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = mirror; atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, - &eb->bflags)) - btree_readahead_hook(eb, -EIO); } readpage_ok: if (likely(uptodate)) { @@ -3187,13 +3184,12 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) /** * Attempt to add a page to bio * - * @bio: destination bio + * @bio_ctrl: record both the bio, and its bio_flags * @page: page to add to the bio * @disk_bytenr: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @pg_offset: starting offset in the page * @size: portion of page that we want to write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @pg_offset: starting offset in the page * @bio_flags: flags of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. @@ -3283,8 +3279,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, else bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - if (!btrfs_is_zoned(fs_info) || - bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; } @@ -3339,7 +3334,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { struct btrfs_device *device; device = btrfs_zoned_get_device(fs_info, disk_bytenr, @@ -3785,12 +3780,13 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc) { const u64 page_end = page_offset(page) + PAGE_SIZE - 1; u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; + /* How many pages are started by btrfs_run_delalloc_range() */ + unsigned long nr_written = 0; int ret; int page_started = 0; @@ -3806,7 +3802,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, continue; } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, nr_written, wbc); + delalloc_end, &page_started, &nr_written, wbc); if (ret) { btrfs_page_set_error(inode->root->fs_info, page, page_offset(page), PAGE_SIZE); @@ -3829,16 +3825,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* did the fill delalloc function already unlock and start - * the IO? - */ + /* Did btrfs_run_delalloc_range() already unlock and start the IO? */ if (page_started) { /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write.
+ * We've unlocked the page, so we can't update the mapping's + * writeback index, just update nr_to_write. */ - wbc->nr_to_write -= *nr_written; + wbc->nr_to_write -= nr_written; return 1; } @@ -3910,7 +3903,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct writeback_control *wbc, struct extent_page_data *epd, loff_t i_size, - unsigned long nr_written, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -3929,7 +3921,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); unlock_page(page); return 1; } @@ -3938,7 +3929,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, nr_written + 1); + update_nr_written(wbc, 1); while (cur <= end) { u64 disk_bytenr; @@ -4076,7 +4067,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned long nr_written = 0; trace___extent_writepage(page, inode, wbc); @@ -4105,7 +4095,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; if (ret) @@ -4113,7 +4103,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, - nr_written, &nr); + &nr); if (ret == 1) return 0; @@ -4314,6 +4304,20 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) return; /* + * A read may stumble upon this buffer later; make sure that it gets an + * error and knows there was an error. + */ + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + /* + * We need to set the mapping with the io error as well, because a write + * error will flip the file system readonly; syncfs() would then return + * 0, since we are readonly, unless we also update the err seq for the + * superblock. + */ + mapping_set_error(page->mapping, -EIO); + + /* * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. */ @@ -5175,8 +5179,6 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); - const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5188,11 +5190,9 @@ int extent_writepages(struct address_space *mapping, * Allow only a single thread to do the reloc work in zoned mode to * protect the write pointer updates.
*/ - if (data_reloc && zoned) - btrfs_inode_lock(inode, 0); + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - if (data_reloc && zoned) - btrfs_inode_unlock(inode, 0); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); @@ -6597,6 +6597,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + /* + * We could have had EXTENT_BUFFER_UPTODATE cleared by the write + * operation, which could potentially still be in flight. In this case + * we simply want to return an error. + */ + if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) + return -EIO; + if (eb->fs_info->sectorsize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d1cbb64a78f3..90c5c38836ab 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; - csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item = btrfs_item_size(leaf, path->slots[0]); csums_in_item /= csum_size; if (csum_offset == csums_in_item) { @@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 disk_bytenr, u64 len, u8 *dst) { + struct btrfs_root *csum_root; struct btrfs_csum_item *item = NULL; struct btrfs_key key; const u32 sectorsize = fs_info->sectorsize; @@ -274,7 +275,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -285,13 +286,14 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, /* Current item doesn't contain the desired range, search again */ btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { ret = PTR_ERR(item); goto out; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -376,7 +378,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; - if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* @@ -534,7 +537,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, key.type == BTRFS_EXTENT_CSUM_KEY) { offset = (start - key.offset) >> fs_info->sectorsize_bits; if (offset * csum_size < - btrfs_item_size_nr(leaf, path->slots[0] - 1)) + btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } @@ -559,7 +562,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > 
start) start = key.offset; - size = btrfs_item_size_nr(leaf, path->slots[0]); + size = btrfs_item_size(leaf, path->slots[0]); csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; if (csum_end <= start) { path->slots[0]++; @@ -750,7 +753,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key->offset; @@ -801,7 +804,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root == fs_info->csum_root || + ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); @@ -834,7 +837,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (key.offset >= end_byte) break; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key.offset; @@ -1002,7 +1005,7 @@ again: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -1013,7 +1016,7 @@ again: u32 item_size; /* we found one, but it isn't big enough yet */ leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if ((item_size / csum_size) >= MAX_CSUM_ITEMS(fs_info, csum_size)) { /* already at max size, make a new one */ @@ -1070,7 +1073,7 @@ again: } extend_csum: - if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / csum_size) { int extend_nr; u64 tmp; @@ -1125,7 +1128,7 @@ extend_csum: diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); - diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + diff = diff - btrfs_item_size(leaf, path->slots[0]); diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); diff /= csum_size; diff *= csum_size; @@ -1162,7 +1165,7 @@ insert: csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f3fee88c8ee0..01a408db5683 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -23,6 +23,7 @@ #include "block-group.h" #include "discard.h" #include "subpage.h" +#include "inode-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -37,7 +38,7 @@ struct btrfs_trim_range { static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); + struct btrfs_free_space *info, bool update_stat); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); @@ -45,7 +46,7 @@ static void free_bitmap(struct 
btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info); static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes); + u64 bytes, bool update_stats); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -288,9 +289,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct inode *inode) + struct inode *vfs_inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_truncate_control control = { + .inode = BTRFS_I(vfs_inode), + .new_size = 0, + .ino = btrfs_ino(BTRFS_I(vfs_inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; @@ -320,19 +330,26 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_free_path(path); } - btrfs_i_size_write(BTRFS_I(inode), 0); - truncate_pagecache(inode, 0); + btrfs_i_size_write(inode, 0); + truncate_pagecache(vfs_inode, 0); + + lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, BTRFS_EXTENT_DATA_KEY, NULL); + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); + + unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); fail: if (locked) @@ -666,7 +683,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; @@ -872,7 +889,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); ret = btrfs_add_free_space(block_group, info->offset, info->bytes); kmem_cache_free(btrfs_free_space_cachep, info); @@ -886,7 +903,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, bytes); if (ret) break; - bitmap_clear_bits(ctl, info, offset, bytes); + bitmap_clear_bits(ctl, info, offset, bytes, true); offset = info->offset; bytes = ctl->unit; } @@ -1581,6 +1598,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } /* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing; we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries whose ->bytes is + * less than the required bytes.
So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4K, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8K we'd come in searching for 8K, and find the + * 8K contiguous range. + * + * Consider the other case, we have 2 8K chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16K, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8K as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + +/* * searches the tree for the given offset. * * fuzzy - If this is set, then we are trying to make an allocation, and we just @@ -1592,15 +1653,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; - struct btrfs_free_space *entry, *prev = NULL; + struct btrfs_free_space *entry = NULL, *prev = NULL; /* find entry that is closest to the 'offset' */ - while (1) { - if (!n) { - entry = NULL; - break; - } - + while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; @@ -1610,6 +1666,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, n = n->rb_right; else break; + + entry = NULL; } if (bitmap_only) { @@ -1686,6 +1744,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, return NULL; while (1) { + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) @@ -1694,33 +1756,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (entry->offset + entry->bytes > offset) break; } - - n = rb_next(&entry->offset_index); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); } return entry; } -static inline void -__unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) +static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } -} -static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space 
*info) -{ - __unlink_free_space(ctl, info); - ctl->free_space -= info->bytes; + if (update_stat) + ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, @@ -1734,6 +1788,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1744,9 +1800,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, - u64 offset, u64 bytes) +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); +} + +static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; @@ -1762,6 +1834,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1773,14 +1847,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } -} -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - __bitmap_clear_bits(ctl, info, offset, bytes); - ctl->free_space -= bytes; + if (update_stat) + ctl->free_space -= bytes; } static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, @@ -1797,9 +1866,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. 
+ */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1867,20 +1943,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1890,16 +1960,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. + */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1916,6 +2008,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1923,6 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1935,6 +2035,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. 
+ */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -1973,7 +2082,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } - unlink_free_space(ctl, bitmap_info); + unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; @@ -2011,7 +2120,7 @@ again: /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); - bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; @@ -2083,12 +2192,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; - return bytes_to_set; } @@ -2096,7 +2199,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2165,7 +2268,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, return 0; if (ctl->op == &free_space_op) - block_group = ctl->private; + block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we @@ -2310,10 +2413,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, /* See try_merge_free_space() comment. 
*/ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { - if (update_stat) - unlink_free_space(ctl, right_info); - else - __unlink_free_space(ctl, right_info); + unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; @@ -2323,10 +2423,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { - if (update_stat) - unlink_free_space(ctl, left_info); - else - __unlink_free_space(ctl, left_info); + unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); @@ -2362,10 +2459,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, end, bytes); - else - __bitmap_clear_bits(ctl, bitmap, end, bytes); + bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2419,10 +2513,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, info->offset, bytes); - else - __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2466,12 +2557,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; @@ -2486,6 +2577,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2602,9 +2694,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, @@ -2635,9 +2725,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2696,7 +2784,7 @@ again: re_search = false; if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); if (offset == info->offset) { 
u64 to_free = min(bytes, info->bytes); @@ -2732,7 +2820,7 @@ again: } spin_unlock(&ctl->tree_lock); - ret = __btrfs_add_free_space(block_group->fs_info, ctl, + ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); @@ -2797,8 +2885,9 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; - ctl->private = block_group; + ctl->block_group = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2864,6 +2953,8 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2879,7 +2970,7 @@ static void __btrfs_remove_free_space_cache_locked( while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); kmem_cache_free(btrfs_free_space_cachep, info); } else { free_bitmap(ctl, info); @@ -2893,8 +2984,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->private) - btrfs_discard_update_discardable(ctl->private); + if (ctl->block_group) + btrfs_discard_update_discardable(ctl->block_group); spin_unlock(&ctl->tree_lock); } @@ -2965,18 +3056,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { - bitmap_clear_bits(ctl, entry, offset, bytes); + bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); @@ -2984,7 +3077,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; @@ -3006,8 +3099,7 @@ out: spin_unlock(&ctl->tree_lock); if (align_gap_len) - __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len, + __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } @@ -3078,7 +3170,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, } ret = search_start; - __bitmap_clear_bits(ctl, entry, ret, bytes); + bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } @@ -3254,6 +3346,17 @@ again: cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * 
manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3344,6 +3447,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; @@ -3535,13 +3639,13 @@ static int do_trimming(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) - __btrfs_add_free_space(fs_info, ctl, reserved_start, + __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); if (start + bytes < reserved_start + reserved_bytes) - __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); - __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); + __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3615,7 +3719,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim @@ -3641,7 +3745,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3828,7 +3932,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, bytes > (max_discard_size + minlen)) bytes = max_discard_size; - bitmap_clear_bits(ctl, entry, start, bytes); + bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088d43f9..15591b299895 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; @@ -54,7 +56,7 @@ struct btrfs_free_space_ctl { s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; - void *private; + struct btrfs_block_group *block_group; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; }; @@ -101,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size, - enum 
btrfs_trim_state trim_state); +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, + u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a33bca94d133..655aad0f9e1c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -16,6 +16,18 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); +static struct btrfs_root *btrfs_free_space_root( + struct btrfs_block_group *block_group) +{ + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(block_group->fs_info, &key); +} + void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; @@ -51,7 +63,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; @@ -85,7 +97,7 @@ struct btrfs_free_space_info *search_free_space_info( struct btrfs_path *path, int cow) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; int ret; @@ -188,7 +200,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -326,7 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -586,7 +598,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size, int remove) { - struct btrfs_root *root = block_group->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; @@ -699,7 +711,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; @@ -851,7 +863,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; @@ -1046,7 +1058,7 @@ out: static int populate_free_space_tree(struct 
btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *extent_root = trans->fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; @@ -1080,6 +1092,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; + extent_root = btrfs_extent_root(trans->fs_info, key.objectid); ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; @@ -1157,7 +1170,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) ret = PTR_ERR(free_space_root); goto abort; } - fs_info->free_space_root = free_space_root; + ret = btrfs_global_root_insert(free_space_root); + if (ret) { + btrfs_put_root(free_space_root); + goto abort; + } node = rb_first(&fs_info->block_group_cache_tree); while (node) { @@ -1232,7 +1249,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *free_space_root = fs_info->free_space_root; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); int ret; trans = btrfs_start_transaction(tree_root, 0); @@ -1241,7 +1263,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); - fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); if (ret) @@ -1251,13 +1272,14 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) if (ret) goto abort; + btrfs_global_root_delete(free_space_root); list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); btrfs_clean_tree_block(free_space_root->node); btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, free_space_root, free_space_root->node, - 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 1); btrfs_put_root(free_space_root); @@ -1319,7 +1341,7 @@ out: int remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -1410,7 +1432,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; @@ -1488,7 +1510,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 37f36ffdaf6b..0eeb5ea87894 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,6 +4,7 @@ */ #include "ctree.h" +#include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" @@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, u32 
cur_offset = 0; int len; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); @@ -45,7 +46,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( u32 cur_offset = 0; int ref_name_len; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); /* @@ -139,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_extref_index(leaf, extref); @@ -208,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_ref_index(leaf, ref); @@ -256,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_item *item; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; @@ -282,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, goto out; leaf = path->nodes[0]; - item = btrfs_item_nr(path->slots[0]); ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); - ptr += btrfs_item_size(leaf, item) - ins_len; + ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); @@ -332,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ref) goto out; - old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + old_size = btrfs_item_size(path->nodes[0], path->slots[0]); btrfs_extend_item(path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -419,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root } return ret; } + +static inline void btrfs_trace_truncate(struct btrfs_inode *inode, + struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 offset, int extent_type, int slot) +{ + if (!inode) + return; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot, + offset); + else + trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset); +} + +/* + * Remove inode items from a given root. + * + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @control: The btrfs_truncate_control to control how and what we + * are truncating. + * + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equal to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equal to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
+ */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 new_size = control->new_size; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u32 found_type = (u8)-1; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int ret; + u64 bytes_deleted = 0; + bool be_nice = false; + + ASSERT(control->inode || !control->clear_extent_range); + ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY); + + control->last_size = new_size; + control->sub_bytes = 0; + + /* + * For shareable roots we want to back off from time to time, this turns + * out to be subvolume roots, reloc roots, and data reloc roots. + */ + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + be_nice = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_BACK; + + key.objectid = control->ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + /* + * With a 16K leaf size and 128MiB extents, you can actually queue up a + * huge file in a single leaf. Most of the time that bytes_deleted is + * > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = 0; + /* There are no items in the tree for us to truncate, we're done */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; + bool should_throttle = false; + + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + + if (found_key.objectid != control->ino) + break; + + if (found_type < control->min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + item_end += btrfs_file_extent_ram_bytes(leaf, fi); + + btrfs_trace_truncate(control->inode, leaf, fi, + found_key.offset, extent_type, + path->slots[0]); + item_end--; + } + if (found_type > control->min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + control->extents_found++; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + + clear_start = found_key.offset; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - extent_num_bytes); + if (extent_start != 0) + control->sub_bytes += num_dec; 
+ btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) + control->sub_bytes += num_dec; + } + clear_len = num_dec; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * We can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(path, size, 1); + } else if (!del_item) { + /* + * We have to bail so the last_size is set to + * just before this extent. + */ + ret = BTRFS_NEED_TRUNCATE_BLOCK; + break; + } else { + /* + * Inline extents are special; we just treat + * them as a full sector worth in the file + * extent tree just for simplicity's sake. + */ + clear_len = fs_info->sectorsize; + } + + control->sub_bytes += item_end + 1 - new_size; + } +delete: + /* + * We only want to clear the file extent range if we're + * modifying the actual inode's mapping, which is just the + * normal truncate path. + */ + if (control->clear_extent_range) { + ret = btrfs_inode_clear_file_extent_range(control->inode, + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (del_item) { + ASSERT(!pending_del_nr || + ((path->slots[0] + 1) == pending_del_slot)); + + control->last_size = found_key.offset; + if (!pending_del_nr) { + /* No pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* Hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } + } else { + control->last_size = new_size; + break; + } + + if (del_item && extent_start != 0 && !control->skip_ref_updates) { + struct btrfs_ref ref = { 0 }; + + bytes_deleted += extent_num_bytes; + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + control->ino, extent_offset, + root->root_key.objectid, false); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + if (be_nice) { + if (btrfs_should_throttle_delayed_refs(trans)) + should_throttle = true; + } + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || + should_throttle) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + pending_del_nr = 0; + } + btrfs_release_path(path); + + /* + * We can generate a lot of delayed refs, so we need to + * throttle every once in a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up.
+ */ + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } + } + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } + } + + ASSERT(control->last_size >= new_size); + if (!ret && control->last_size > new_size) + control->last_size = new_size; + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h new file mode 100644 index 000000000000..a8fc16d0147f --- /dev/null +++ b/fs/btrfs/inode-item.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_INODE_ITEM_H +#define BTRFS_INODE_ITEM_H + +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_path; +struct btrfs_key; +struct btrfs_inode_extref; +struct btrfs_inode; +struct extent_buffer; + +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define BTRFS_NEED_TRUNCATE_BLOCK 1 + +struct btrfs_truncate_control { + /* + * IN: the inode we're operating on; this can be NULL if + * ->clear_extent_range is false. + */ + struct btrfs_inode *inode; + + /* IN: the size we're truncating to. */ + u64 new_size; + + /* OUT: the number of extents truncated. */ + u64 extents_found; + + /* OUT: the last size we truncated this inode to. */ + u64 last_size; + + /* OUT: the number of bytes to sub from this inode. */ + u64 sub_bytes; + + /* IN: the ino we are truncating. */ + u64 ino; + + /* + * IN: minimum key type to remove. All keys with this type are + * removed only if their offset >= new_size. + */ + u32 min_type; + + /* + * IN: true if we don't want to do extent reference updates for any file + * extents we drop. + */ + bool skip_ref_updates; + + /* + * IN: true if we need to clear the file extent range for the inode as + * we drop the file extent items.
+ */ + bool clear_extent_range; +}; + +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control); +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +struct btrfs_inode_extref *btrfs_lookup_inode_extref( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8c911a4a320..3b2403b6127f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -54,6 +54,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "inode-item.h" struct btrfs_iget_args { u64 ino; @@ -61,8 +62,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 reserve; - loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; }; @@ -1532,11 +1531,12 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { - int ret; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; + int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, + ret = btrfs_lookup_csums_range(csum_root, bytenr, bytenr + num_bytes - 1, &list, 0); if (ret == 0 && list_empty(&list)) return 0; @@ -2518,7 +2518,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - !fs_info->csum_root; + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2586,11 +2586,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; + struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); + if (!csum_root) + csum_root = btrfs_csum_root(trans->fs_info, + sum->bytenr); + ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -3316,7 +3320,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (!root->fs_info->csum_root) + if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) return 0; ASSERT(page_offset(page) <= start && @@ -3477,7 +3481,7 @@ int btrfs_orphan_cleanup(struct 
btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); @@ -3635,8 +3639,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* release the path since we're done with it */ btrfs_release_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) @@ -4615,389 +4617,6 @@ out: } /* - * Return this if we need to call truncate_block for the last bit of the - * truncate. - */ -#define NEED_TRUNCATE_BLOCK 1 - -/* - * Remove inode items from a given root. - * - * @trans: A transaction handle. - * @root: The root from which to remove items. - * @inode: The inode whose items we want to remove. - * @new_size: The new i_size for the inode. This is only applicable when - * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. - * @min_type: The minimum key type to remove. All keys with a type - * greater than this value are removed and all keys with - * this type are removed only if their offset is >= @new_size. - * @extents_found: Output parameter that will contain the number of file - * extent items that were removed or adjusted to the new - * inode i_size. The caller is responsible for initializing - * the counter. Also, it can be NULL if the caller does not - * need this counter. - * - * Remove all keys associated with the inode from the given root that have a key - * with a type greater than or equals to @min_type. When @min_type has a value of - * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value - * greater than or equals to @new_size. If a file extent item that starts before - * @new_size and ends after it is found, its length is adjusted. - * - * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is - * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. - */ -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - u64 new_size, u32 min_type, - u64 *extents_found) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - struct btrfs_key found_key; - u64 extent_start = 0; - u64 extent_num_bytes = 0; - u64 extent_offset = 0; - u64 item_end = 0; - u64 last_size = new_size; - u32 found_type = (u8)-1; - int found_extent; - int del_item; - int pending_del_nr = 0; - int pending_del_slot = 0; - int extent_type = -1; - int ret; - u64 ino = btrfs_ino(inode); - u64 bytes_deleted = 0; - bool be_nice = false; - bool should_throttle = false; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); - struct extent_state *cached_state = NULL; - - BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - - /* - * For non-free space inodes and non-shareable roots, we want to back - * off from time to time. This means all inodes in subvolume roots, - * reloc roots, and data reloc roots. 
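The btrfs_orphan_cleanup() hunk above swaps a cmpxchg() on a dedicated orphan_cleanup_state word for test_and_set_bit() on the root's state bits: whichever task flips the bit first runs the cleanup, and every later caller returns immediately. A minimal userspace analogue of that one-shot gating, using C11 atomics (run_once() and do_cleanup() are invented names, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>

/* Analogue of the BTRFS_ROOT_ORPHAN_CLEANUP bit in root->state. */
static atomic_flag cleanup_started = ATOMIC_FLAG_INIT;

static void do_cleanup(void)
{
        puts("orphan cleanup running");
}

static int run_once(void)
{
        /* test-and-set: only the first caller sees the old value 'clear'. */
        if (atomic_flag_test_and_set(&cleanup_started))
                return 0;       /* already started by another task */
        do_cleanup();
        return 0;
}

int main(void)
{
        run_once();     /* runs the cleanup */
        run_once();     /* no-op, the bit is already set */
        return 0;
}

Folding the flag into the existing state bitmap also lets the DONE state disappear: "started" is all the callers ever needed to check.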
- */ - if (!btrfs_is_free_space_inode(inode) && - test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - be_nice = true; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_BACK; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - - /* - * We want to drop from the next block forward in case this - * new size is not block aligned since we will be keeping the - * last block of the extent just the way it is. - */ - btrfs_drop_extent_cache(inode, ALIGN(new_size, - fs_info->sectorsize), - (u64)-1, 0); - } - - /* - * This function is also used to drop the items in the log tree before - * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the logged items. So we shouldn't kill the delayed - * items. - */ - if (min_type == 0 && root == inode->root) - btrfs_kill_delayed_inode_items(inode); - - key.objectid = ino; - key.offset = (u64)-1; - key.type = (u8)-1; - -search_again: - /* - * with a 16K leaf size and 128MB extents, you can actually queue - * up a huge file in a single leaf. Most of the time that - * bytes_deleted is > 0, it will be huge by the time we get here - */ - if (be_nice && bytes_deleted > SZ_32M && - btrfs_should_end_transaction(trans)) { - ret = -EAGAIN; - goto out; - } - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = 0; - /* there are no items in the tree for us to truncate, we're - * done - */ - if (path->slots[0] == 0) - goto out; - path->slots[0]--; - } - - while (1) { - u64 clear_start = 0, clear_len = 0; - - fi = NULL; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; - - if (found_key.objectid != ino) - break; - - if (found_type < min_type) - break; - - item_end = found_key.offset; - if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - item_end += - btrfs_file_extent_num_bytes(leaf, fi); - - trace_btrfs_truncate_show_fi_regular( - inode, leaf, fi, found_key.offset); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_ram_bytes(leaf, - fi); - - trace_btrfs_truncate_show_fi_inline( - inode, leaf, fi, path->slots[0], - found_key.offset); - } - item_end--; - } - if (found_type > min_type) { - del_item = 1; - } else { - if (item_end < new_size) - break; - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; - } - found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ - if (found_type != BTRFS_EXTENT_DATA_KEY) - goto delete; - - if (extents_found != NULL) - (*extents_found)++; - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - u64 num_dec; - - clear_start = found_key.offset; - extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { - u64 orig_num_bytes = - btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = ALIGN(new_size - - found_key.offset, - fs_info->sectorsize); - clear_start = ALIGN(new_size, fs_info->sectorsize); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_num_bytes); - num_dec = (orig_num_bytes - - extent_num_bytes); - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state) && - extent_start != 0) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - btrfs_mark_buffer_dirty(leaf); - } else { - extent_num_bytes = - 
btrfs_file_extent_disk_num_bytes(leaf, - fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - - /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_bytes(leaf, fi); - if (extent_start != 0) { - found_extent = 1; - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state)) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - } - } - clear_len = num_dec; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - /* - * we can't truncate inline items that have had - * special encodings - */ - if (!del_item && - btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0 && - btrfs_file_extent_compression(leaf, fi) == 0) { - u32 size = (u32)(new_size - found_key.offset); - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); - } else if (!del_item) { - /* - * We have to bail so the last_size is set to - * just before this extent. - */ - ret = NEED_TRUNCATE_BLOCK; - break; - } else { - /* - * Inline extents are special, we just treat - * them as a full sector worth in the file - * extent tree just for simplicity sake. - */ - clear_len = fs_info->sectorsize; - } - - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(&inode->vfs_inode, - item_end + 1 - new_size); - } -delete: - /* - * We use btrfs_truncate_inode_items() to clean up log trees for - * multiple fsyncs, and in this case we don't want to clear the - * file extent range because it's just the log. - */ - if (root == inode->root) { - ret = btrfs_inode_clear_file_extent_range(inode, - clear_start, clear_len); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - } - - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (del_item) { - if (!pending_del_nr) { - /* no pending yet, add ourselves */ - pending_del_slot = path->slots[0]; - pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { - /* hop on the pending chunk */ - pending_del_nr++; - pending_del_slot = path->slots[0]; - } else { - BUG(); - } - } else { - break; - } - should_throttle = false; - - if (found_extent && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_ref ref = { 0 }; - - bytes_deleted += extent_num_bytes; - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset, - root->root_key.objectid, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } - } - - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - - if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot || - should_throttle) { - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - pending_del_nr = 0; - } - btrfs_release_path(path); - - /* - * We can generate a lot of delayed refs, so we need to - * throttle every once and a while and make sure we're - * adding enough space to keep up with the work we are - * generating. 
Since we hold a transaction here we - * can't flush, and we don't want to FLUSH_LIMIT because - * we could have generated too many delayed refs to - * actually allocate, so just bail if we're short and - * let the normal reservation dance happen higher up. - */ - if (should_throttle) { - ret = btrfs_delayed_refs_rsv_refill(fs_info, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - ret = -EAGAIN; - break; - } - } - goto search_again; - } else { - path->slots[0]--; - } - } -out: - if (ret >= 0 && pending_del_nr) { - int err; - - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; - } - } - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ASSERT(last_size >= new_size); - if (!ret && last_size > new_size) - last_size = new_size; - btrfs_inode_safe_disk_i_size_write(inode, last_size); - unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - } - - btrfs_free_path(path); - return ret; -} - -/* * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing @@ -5525,7 +5144,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); int ret; @@ -5540,18 +5158,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * - * If we cannot make our reservation we'll attempt to steal from the - * global reserve, because we really want to be able to free up space. + * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, + * if we fail to make this reservation we can re-try without the + * delayed_refs_extra so we can make some forward progress. */ - ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { - /* - * Try to steal from the global reserve if there is space for - * it. - */ - if (btrfs_check_space_for_delayed_refs(fs_info) || - btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); @@ -5610,10 +5226,22 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * This makes sure the inode item in tree is uptodate and the space for + * the inode update is released. + */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto no_delete; + /* + * This drops any pending insert or delete operations we have for this + * inode. We could have a delayed dir index deletion queued up, but + * we're removing the inode completely so that'll be taken care of in + * the truncate. 
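The evict_refill_and_join() change above drops the explicit stealing from the global block reserve: the first refill asks for rsv->size plus a delayed-refs cushion, and on failure it simply retries with the base size, since BTRFS_RESERVE_FLUSH_EVICT can already dip into the global reserve. A toy model of that fall-back pattern, with an invented reserve() over a fixed pool standing in for the real reservation machinery:

#include <errno.h>
#include <stdio.h>

static long pool = 100;     /* pretend free metadata space, in units */

static int reserve(long units)
{
        if (units > pool)
                return -ENOSPC;
        pool -= units;
        return 0;
}

static int refill_for_evict(long base, long extra)
{
        if (reserve(base + extra) == 0)
                return 0;
        /* Retry without the delayed-refs cushion to keep making progress. */
        if (reserve(base) == 0)
                return 0;
        return -ENOSPC;
}

int main(void)
{
        printf("first refill:  %d\n", refill_for_evict(60, 50)); /* falls back, succeeds */
        printf("second refill: %d\n", refill_for_evict(60, 50)); /* pool exhausted */
        return 0;
}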
+ */ + btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; @@ -5623,14 +5251,20 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { + struct btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .new_size = 0, + .min_type = 0, + }; + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0, NULL); + ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6998,8 +6632,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); - inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(path->slots[0])); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; @@ -7773,6 +7406,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + bool can_nocow = false; + bool space_reserved = false; int ret = 0; /* @@ -7787,9 +7424,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else @@ -7799,53 +7433,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; + btrfs_inc_nocow_writers(fs_info, block_start)) + can_nocow = true; + } - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em = em2; - } + goto out; + } + space_reserved = true; - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. 
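The btrfs_get_blocks_direct_write() rework in the surrounding hunks moves space reservation out of iomap_begin and into the mapping step itself: the NOCOW branch reserves only metadata (the existing extent is reused), the COW branch reserves data plus metadata, and a single error path releases exactly what was taken. A compressed stand-in for that control flow, with every function invented for illustration:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int reserve_metadata(void)           { return 0; }
static int reserve_data_and_metadata(void)  { return 0; }
static void release_metadata(void)          { puts("released metadata"); }
static void release_data_and_metadata(void) { puts("released data+metadata"); }
static int create_ordered_extent(void)      { return -ENOSPC; /* force error path */ }

static int map_for_write(bool can_nocow)
{
        bool space_reserved = false;
        int ret;

        if (can_nocow)
                ret = reserve_metadata();          /* reuse the existing extent */
        else
                ret = reserve_data_and_metadata(); /* a new extent is allocated */
        if (ret < 0)
                return ret;
        space_reserved = true;

        ret = create_ordered_extent();
        if (ret < 0 && space_reserved) {
                /* Undo exactly the reservation this branch took. */
                if (can_nocow)
                        release_metadata();
                else
                        release_data_and_metadata();
        }
        return ret;
}

int main(void)
{
        printf("nocow: %d\n", map_for_write(true));
        printf("cow:   %d\n", map_for_write(false));
        return 0;
}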
- */ - btrfs_free_reserved_data_space_noquota(fs_info, len); - goto skip_cow; + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; } - } - /* this will cow the extent */ - free_extent_map(em); - *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + } else { + const u64 prev_len = len; + + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + /* We have to COW, so need to reserve metadata and data space. */ + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, len); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start + len, prev_len - len, + true); } - len = min(len, em->len - (start - em->start)); + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), len); -skip_cow: /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - - dio_data->reserve -= len; out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + if (can_nocow) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } else { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start, len, true); + extent_changeset_free(dio_data->data_reserved); + dio_data->data_reserved = NULL; + } + } return ret; } @@ -7887,18 +7560,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } - } iomap->private = dio_data; @@ -7991,14 +7652,8 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, start, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } + kfree(dio_data); + return ret; } @@ -8028,14 +7683,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ret = -ENOTBLK; } - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + if (write) extent_changeset_free(dio_data->data_reserved); - } out: kfree(dio_data); iomap->private = NULL; @@ -8884,6 +8533,12 @@ out_noreserve: static int btrfs_truncate(struct inode *inode, bool skip_writeback) { + struct 
btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; @@ -8891,7 +8546,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); - u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8952,10 +8606,30 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - inode->i_size, - BTRFS_EXTENT_DATA_KEY, - &extents_found); + struct extent_state *cached_state = NULL; + const u64 new_size = inode->i_size; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + + control.new_size = new_size; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* + * We want to drop from the next block forward in case this new + * size is not block aligned since we will be keeping the last + * block of the extent just the way it is. + */ + btrfs_drop_extent_cache(BTRFS_I(inode), + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, 0); + + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8983,11 +8657,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* * We can't call btrfs_truncate_block inside a trans handle as we could - * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know - * we've truncated everything except the last little bit, and can do - * btrfs_truncate_block and then update the disk_i_size. + * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we + * know we've truncated everything except the last little bit, and can + * do btrfs_truncate_block and then update the disk_i_size. */ - if (ret == NEED_TRUNCATE_BLOCK) { + if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -9031,7 +8705,7 @@ out: * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. */ - if (extents_found > 0) + if (control.extents_found > 0) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; @@ -10595,9 +10269,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; + unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. 
+ */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, PAGE_SIZE) >> PAGE_SHIFT; @@ -10605,6 +10289,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fb8cc9642ac4..a5bd6926f7ff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, * * Compatibility: * - the same type is already running + * - when trying to add a device and balance has been paused * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller * must check the condition first that would allow none -> @type */ @@ -394,7 +395,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type) + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) return true; spin_unlock(&fs_info->super_lock); @@ -414,6 +417,29 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} + static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -518,7 +544,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; - int err; dev_t anon_dev = 0; u64 objectid; u64 index = 0; @@ -617,11 +642,13 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the - * extent tree without backreferences). Also no need to have - * the tree block locked since it is not in any tree at this - * point, so no other task can find it and use it. + * extent tree with a backreference for a root that does not + * exist). 
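The clamp added to btrfs_add_swap_extent() above is plain page arithmetic: round the extent's start up and its end down to page boundaries, then cap the extent at however many pages the swap header still allows. A standalone sketch of that arithmetic (the ALIGN macros and the sample numbers are userspace stand-ins, not the kernel definitions):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
        uint64_t block_start  = 8192;            /* extent start, bytes */
        uint64_t block_len    = 5 * PAGE_SIZE;   /* extent length, bytes */
        uint64_t sis_max      = 3;               /* pages allowed by the swap header */
        uint64_t bsi_nr_pages = 0;               /* pages already added */

        uint64_t first_ppage = ALIGN_UP(block_start, PAGE_SIZE) >> PAGE_SHIFT;
        uint64_t next_ppage  = ALIGN_DOWN(block_start + block_len, PAGE_SIZE)
                               >> PAGE_SHIFT;

        if (bsi_nr_pages >= sis_max || first_ppage >= next_ppage)
                return 0;

        uint64_t max_pages = sis_max - bsi_nr_pages;
        uint64_t nr_pages  = next_ppage - first_ppage;

        /* The fix: never add more pages than the header advertised. */
        if (nr_pages > max_pages)
                nr_pages = max_pages;

        printf("adding %llu page(s) instead of %llu\n",
               (unsigned long long)nr_pages,
               (unsigned long long)(next_ppage - first_ppage));
        return 0;
}

With these sample values the raw extent spans 5 pages, but only 3 are added, matching what the header declared when the file was smaller.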
*/ - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_tree_lock(leaf); + btrfs_clean_tree_block(leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); goto fail; } @@ -696,9 +723,10 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret) + btrfs_end_transaction(trans); + else + ret = btrfs_commit_transaction(trans); if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); @@ -2082,7 +2110,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); - item_len = btrfs_item_size_nr(leaf, i); + item_len = btrfs_item_size(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) @@ -2536,7 +2564,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); - item_len = btrfs_item_size_nr(leaf, slot); + item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { @@ -2738,7 +2766,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); - item_len = btrfs_item_size_nr(leaf, slot) + item_len = btrfs_item_size(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); @@ -3146,13 +3174,25 @@ out: static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; + bool restore_op = false; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { + if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + /* + * We can do the device add because we have a paused balance; + * change the exclusive op type and remember we should bring + * back the paused balance. + */ + fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; + btrfs_exclop_start_unlock(fs_info); + restore_op = true; + } vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { @@ -3168,7 +3208,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - btrfs_exclop_finish(fs_info); + if (restore_op) + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + else + btrfs_exclop_finish(fs_info); return ret; } @@ -3187,10 +3230,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) return -EPERM; vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -3622,7 +3663,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, { struct btrfs_trans_handle *trans; u64 transid; - int ret; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -3634,11 +3674,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = 
btrfs_commit_transaction_async(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + btrfs_commit_transaction_async(trans); out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -3985,6 +4021,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. ops lock */ int ret; + if (!arg) + btrfs_warn(fs_info, + "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4057,6 +4097,7 @@ locked: spin_lock(&fs_info->balance_lock); bctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); goto do_balance; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 65cb0766e62d..0fb90cbe7669 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -125,6 +125,7 @@ static inline size_t read_compress_length(const char *buf) static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, struct page **out_pages, + unsigned long max_nr_page, u32 *cur_out, const u32 sectorsize) { @@ -133,6 +134,9 @@ static int copy_compressed_data_to_page(char *compressed_data, struct page *cur_page; char *kaddr; + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + /* * We never allow a segment header crossing sector boundary, previous * run should ensure we have enough space left inside the sector. @@ -161,6 +165,10 @@ static int copy_compressed_data_to_page(char *compressed_data, orig_out + compressed_size - *cur_out); kunmap(cur_page); + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { @@ -203,6 +211,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; struct page *page_in = NULL; char *sizes_ptr; + const unsigned long max_nr_page = *out_pages; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -210,6 +219,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; + ASSERT(max_nr_page > 0); *out_pages = 0; *total_out = 0; *total_in = 0; @@ -248,7 +258,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, &cur_out, sectorsize); + pages, max_nr_page, + &cur_out, sectorsize); if (ret < 0) goto out; @@ -279,6 +290,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: + if (page_in) + put_page(page_in); *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aae1027bd76a..0775ae9f4419 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u64 flags; u64 offset; int ref_index = 0; @@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_fs_info *fs_info; int i; u32 type, nr; - struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; @@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l) 
btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = key.type; pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", i, key.objectid, type, key.offset, - btrfs_item_offset(l, item), btrfs_item_size(l, item)); + btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); @@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_UUID_KEY_SUBVOL: case BTRFS_UUID_KEY_RECEIVED_SUBVOL: print_uuid_item(l, btrfs_item_ptr_offset(l, i), - btrfs_item_size_nr(l, i)); + btrfs_item_size(l, i)); break; } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b1cb5a8c2999..1a6d2d5b4b33 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root, di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); cur = 0; - total_len = btrfs_item_size_nr(leaf, slot); + total_len = btrfs_item_size(leaf, slot); while (cur < total_len) { u32 name_len = btrfs_dir_name_len(leaf, di); @@ -377,8 +377,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, */ if (need_reserve) { num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - num_bytes, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_block_rsv_add(fs_info, trans->block_rsv, + num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (ret) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db680f5be745..8928275823a1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) int ret = 0; int slot; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to enable quotas, because we will unlock + * and relock qgroup_ioctl_lock before setting fs_info->quota_root + * and before setting BTRFS_FS_QUOTA_ENABLED. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1117,8 +1125,19 @@ out_add_root: goto out_free_path; } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* + * Commit the transaction while not holding qgroup_ioctl_lock, to avoid + * a deadlock with tasks concurrently doing other qgroup operations, such + * as adding/removing qgroups or adding/deleting qgroup relations, + * because all qgroup operations first start or join a transaction and then + * lock the qgroup_ioctl_lock mutex. + * We are safe from a concurrent task trying to enable quotas by calling + * this function, since we are serialized by fs_info->subvol_sem. 
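The qgroup fix above follows a classic lock-ordering rule: never hold a mutex across a blocking operation whose other participants may themselves be queued behind that mutex. Drop the lock, do the blocking work, then retake it and re-validate before publishing state. A minimal pthread sketch of that shape, with every name invented for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ioctl_lock = PTHREAD_MUTEX_INITIALIZER;
static int quota_enabled;

/* Stands in for btrfs_commit_transaction(): it may block on tasks
 * that themselves want ioctl_lock, so it must run with the lock dropped. */
static int commit_transaction(void)
{
        return 0;
}

static int enable_quota(void)
{
        int ret;

        pthread_mutex_lock(&ioctl_lock);
        /* ... set up the quota tree under the lock ... */
        pthread_mutex_unlock(&ioctl_lock);

        ret = commit_transaction();     /* blocking work, lock dropped */

        pthread_mutex_lock(&ioctl_lock);
        if (ret == 0)
                quota_enabled = 1;      /* publish state under the lock */
        pthread_mutex_unlock(&ioctl_lock);
        return ret;
}

int main(void)
{
        printf("enable_quota: %d, enabled=%d\n", enable_quota(), quota_enabled);
        return 0;
}

In the real code the unlock/relock window is safe only because callers hold subvol_sem for writing, which is exactly what the new lockdep assertion documents.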
+ */ ret = btrfs_commit_transaction(trans); trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; @@ -1219,7 +1238,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_lock(quota_root->node); btrfs_clean_tree_block(quota_root->node); btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); btrfs_put_root(quota_root); @@ -3141,6 +3161,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; @@ -3150,7 +3171,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int ret; mutex_lock(&fs_info->qgroup_rescan_lock); - ret = btrfs_search_slot_for_read(fs_info->extent_root, + extent_root = btrfs_extent_root(fs_info, + fs_info->qgroup_rescan_progress.objectid); + ret = btrfs_search_slot_for_read(extent_root, &fs_info->qgroup_rescan_progress, path, 1, 0); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c deleted file mode 100644 index eb96fdc3be25..000000000000 --- a/fs/btrfs/reada.c +++ /dev/null @@ -1,1086 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2011 STRATO. All rights reserved. - */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include "ctree.h" -#include "volumes.h" -#include "disk-io.h" -#include "transaction.h" -#include "dev-replace.h" -#include "block-group.h" - -#undef DEBUG - -/* - * This is the implementation for the generic read ahead framework. - * - * To trigger a readahead, btrfs_reada_add must be called. It will start - * a read ahead for the given range [start, end) on tree root. The returned - * handle can either be used to wait on the readahead to finish - * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). - * - * The read ahead works as follows: - * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. - * reada_start_machine will then search for extents to prefetch and trigger - * some reads. When a read finishes for a node, all contained node/leaf - * pointers that lie in the given range will also be enqueued. The reads will - * be triggered in sequential order, thus giving a big win over a naive - * enumeration. It will also make use of multi-device layouts. Each disk - * will have its on read pointer and all disks will by utilized in parallel. - * Also will no two disks read both sides of a mirror simultaneously, as this - * would waste seeking capacity. Instead both disks will read different parts - * of the filesystem. - * Any number of readaheads can be started in parallel. The read order will be - * determined globally, i.e. 2 parallel readaheads will normally finish faster - * than the 2 started one after another. 
- */ - -#define MAX_IN_FLIGHT 6 - -struct reada_extctl { - struct list_head list; - struct reada_control *rc; - u64 generation; -}; - -struct reada_extent { - u64 logical; - u64 owner_root; - struct btrfs_key top; - struct list_head extctl; - int refcnt; - spinlock_t lock; - struct reada_zone *zones[BTRFS_MAX_MIRRORS]; - int nzones; - int scheduled; - int level; -}; - -struct reada_zone { - u64 start; - u64 end; - u64 elems; - struct list_head list; - spinlock_t lock; - int locked; - struct btrfs_device *device; - struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl - * self */ - int ndevs; - struct kref refcnt; -}; - -struct reada_machine_work { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); -static void reada_control_release(struct kref *kref); -static void reada_zone_release(struct kref *kref); -static void reada_start_machine(struct btrfs_fs_info *fs_info); -static void __reada_start_machine(struct btrfs_fs_info *fs_info); - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level); - -/* recurses */ -/* in case of err, eb might be NULL */ -static void __readahead_hook(struct btrfs_fs_info *fs_info, - struct reada_extent *re, struct extent_buffer *eb, - int err) -{ - int nritems; - int i; - u64 bytenr; - u64 generation; - struct list_head list; - - spin_lock(&re->lock); - /* - * just take the full list from the extent. afterwards we - * don't need the lock anymore - */ - list_replace_init(&re->extctl, &list); - re->scheduled = 0; - spin_unlock(&re->lock); - - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. Effectively this will - * cut the branch below this node from read ahead. - */ - if (err) - goto cleanup; - - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - if (!btrfs_header_level(eb)) - goto cleanup; - - nritems = btrfs_header_nritems(eb); - generation = btrfs_header_generation(eb); - for (i = 0; i < nritems; i++) { - struct reada_extctl *rec; - u64 n_gen; - struct btrfs_key key; - struct btrfs_key next_key; - - btrfs_node_key_to_cpu(eb, &key, i); - if (i + 1 < nritems) - btrfs_node_key_to_cpu(eb, &next_key, i + 1); - else - next_key = re->top; - bytenr = btrfs_node_blockptr(eb, i); - n_gen = btrfs_node_ptr_generation(eb, i); - - list_for_each_entry(rec, &list, list) { - struct reada_control *rc = rec->rc; - - /* - * if the generation doesn't match, just ignore this - * extctl. This will probably cut off a branch from - * prefetch. Alternatively one could start a new (sub-) - * prefetch for this branch, starting again from root. 
- * FIXME: move the generation check out of this loop - */ -#ifdef DEBUG - if (rec->generation != generation) { - btrfs_debug(fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); - } -#endif - if (rec->generation == generation && - btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && - btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - btrfs_header_owner(eb), n_gen, - btrfs_header_level(eb) - 1); - } - } - -cleanup: - /* - * free extctl records - */ - while (!list_empty(&list)) { - struct reada_control *rc; - struct reada_extctl *rec; - - rec = list_first_entry(&list, struct reada_extctl, list); - list_del(&rec->list); - rc = rec->rc; - kfree(rec); - - kref_get(&rc->refcnt); - if (atomic_dec_and_test(&rc->elems)) { - kref_put(&rc->refcnt, reada_control_release); - wake_up(&rc->wait); - } - kref_put(&rc->refcnt, reada_control_release); - - reada_extent_put(fs_info, re); /* one ref for each entry */ - } - - return; -} - -int btree_readahead_hook(struct extent_buffer *eb, int err) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int ret = 0; - struct reada_extent *re; - - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, - eb->start >> fs_info->sectorsize_bits); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - if (!re) { - ret = -1; - goto start_machine; - } - - __readahead_hook(fs_info, re, eb, err); - reada_extent_put(fs_info, re); /* our ref */ - -start_machine: - reada_start_machine(fs_info); - return ret; -} - -static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, - struct btrfs_io_context *bioc) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; - struct reada_zone *zone; - struct btrfs_block_group *cache = NULL; - u64 start; - u64 end; - int i; - - zone = NULL; - spin_lock(&fs_info->reada_lock); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) { - kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - return zone; - } - - spin_unlock(&fs_info->reada_lock); - - cache = btrfs_lookup_block_group(fs_info, logical); - if (!cache) - return NULL; - - start = cache->start; - end = start + cache->length - 1; - btrfs_put_block_group(cache); - - zone = kzalloc(sizeof(*zone), GFP_KERNEL); - if (!zone) - return NULL; - - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - kfree(zone); - return NULL; - } - - zone->start = start; - zone->end = end; - INIT_LIST_HEAD(&zone->list); - spin_lock_init(&zone->lock); - zone->locked = 0; - kref_init(&zone->refcnt); - zone->elems = 0; - zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < bioc->num_stripes; ++i) { - /* bounds have already been checked */ - zone->devs[i] = bioc->stripes[i].dev; - } - zone->ndevs = bioc->num_stripes; - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> fs_info->sectorsize_bits), - zone); - - if (ret == -EEXIST) { - kfree(zone); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) - kref_get(&zone->refcnt); - else - zone = NULL; - } - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - - return zone; -} - -static struct reada_extent 
*reada_find_extent(struct btrfs_fs_info *fs_info, - u64 logical, - struct btrfs_key *top, - u64 owner_root, int level) -{ - int ret; - struct reada_extent *re = NULL; - struct reada_extent *re_exist = NULL; - struct btrfs_io_context *bioc = NULL; - struct btrfs_device *dev; - struct btrfs_device *prev_dev; - u64 length; - int real_stripes; - int nzones = 0; - unsigned long index = logical >> fs_info->sectorsize_bits; - int dev_replace_is_ongoing; - int have_zone = 0; - - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (re) - return re; - - re = kzalloc(sizeof(*re), GFP_KERNEL); - if (!re) - return NULL; - - re->logical = logical; - re->top = *top; - INIT_LIST_HEAD(&re->extctl); - spin_lock_init(&re->lock); - re->refcnt = 1; - re->owner_root = owner_root; - re->level = level; - - /* - * map block - */ - length = fs_info->nodesize; - ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bioc, 0); - if (ret || !bioc || length < fs_info->nodesize) - goto error; - - if (bioc->num_stripes > BTRFS_MAX_MIRRORS) { - btrfs_err(fs_info, - "readahead: more than %d copies not supported", - BTRFS_MAX_MIRRORS); - goto error; - } - - real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - for (nzones = 0; nzones < real_stripes; ++nzones) { - struct reada_zone *zone; - - dev = bioc->stripes[nzones].dev; - - /* cannot read ahead on missing device. */ - if (!dev->bdev) - continue; - - zone = reada_find_zone(dev, logical, bioc); - if (!zone) - continue; - - re->zones[re->nzones++] = zone; - spin_lock(&zone->lock); - if (!zone->elems) - kref_get(&zone->refcnt); - ++zone->elems; - spin_unlock(&zone->lock); - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - if (re->nzones == 0) { - /* not a single zone found, error and out */ - goto error; - } - - /* Insert extent in reada tree + all per-device trees, all or nothing */ - down_read(&fs_info->dev_replace.rwsem); - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&fs_info->reada_tree, index, re); - if (ret == -EEXIST) { - re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - re_exist->refcnt++; - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - if (ret) { - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - radix_tree_preload_end(); - prev_dev = NULL; - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( - &fs_info->dev_replace); - for (nzones = 0; nzones < re->nzones; ++nzones) { - dev = re->zones[nzones]->device; - - if (dev == prev_dev) { - /* - * in case of DUP, just add the first zone. As both - * are on the same device, there's nothing to gain - * from adding both. - * Also, it wouldn't work, as the tree is per device - * and adding would fail with EEXIST - */ - continue; - } - if (!dev->bdev) - continue; - - if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) - continue; - - if (dev_replace_is_ongoing && - dev == fs_info->dev_replace.tgtdev) { - /* - * as this device is selected for reading only as - * a last resort, skip it for read ahead. 
- */ - continue; - } - prev_dev = dev; - ret = radix_tree_insert(&dev->reada_extents, index, re); - if (ret) { - while (--nzones >= 0) { - dev = re->zones[nzones]->device; - BUG_ON(dev == NULL); - /* ignore whether the entry was inserted */ - radix_tree_delete(&dev->reada_extents, index); - } - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - have_zone = 1; - } - if (!have_zone) - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - - if (!have_zone) - goto error; - - btrfs_put_bioc(bioc); - return re; - -error: - for (nzones = 0; nzones < re->nzones; ++nzones) { - struct reada_zone *zone; - - zone = re->zones[nzones]; - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* - * no fs_info->reada_lock needed, as this can't be - * the last ref - */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - btrfs_put_bioc(bioc); - kfree(re); - return re_exist; -} - -static void reada_extent_put(struct btrfs_fs_info *fs_info, - struct reada_extent *re) -{ - int i; - unsigned long index = re->logical >> fs_info->sectorsize_bits; - - spin_lock(&fs_info->reada_lock); - if (--re->refcnt) { - spin_unlock(&fs_info->reada_lock); - return; - } - - radix_tree_delete(&fs_info->reada_tree, index); - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - radix_tree_delete(&zone->device->reada_extents, index); - } - - spin_unlock(&fs_info->reada_lock); - - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* no fs_info->reada_lock needed, as this can't be - * the last ref */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - - kfree(re); -} - -static void reada_zone_release(struct kref *kref) -{ - struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); - struct btrfs_fs_info *fs_info = zone->device->fs_info; - - lockdep_assert_held(&fs_info->reada_lock); - - radix_tree_delete(&zone->device->reada_zones, - zone->end >> fs_info->sectorsize_bits); - - kfree(zone); -} - -static void reada_control_release(struct kref *kref) -{ - struct reada_control *rc = container_of(kref, struct reada_control, - refcnt); - - kfree(rc); -} - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level) -{ - struct btrfs_fs_info *fs_info = rc->fs_info; - struct reada_extent *re; - struct reada_extctl *rec; - - /* takes one ref */ - re = reada_find_extent(fs_info, logical, top, owner_root, level); - if (!re) - return -1; - - rec = kzalloc(sizeof(*rec), GFP_KERNEL); - if (!rec) { - reada_extent_put(fs_info, re); - return -ENOMEM; - } - - rec->rc = rc; - rec->generation = generation; - atomic_inc(&rc->elems); - - spin_lock(&re->lock); - list_add_tail(&rec->list, &re->extctl); - spin_unlock(&re->lock); - - /* leave the ref on the extent */ - - return 0; -} - -/* - * called with fs_info->reada_lock held - */ -static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) -{ - 
int i; - unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; - - for (i = 0; i < zone->ndevs; ++i) { - struct reada_zone *peer; - peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); - if (peer && peer->device != zone->device) - peer->locked = lock; - } -} - -/* - * called with fs_info->reada_lock held - */ -static int reada_pick_zone(struct btrfs_device *dev) -{ - struct reada_zone *top_zone = NULL; - struct reada_zone *top_locked_zone = NULL; - u64 top_elems = 0; - u64 top_locked_elems = 0; - unsigned long index = 0; - int ret; - - if (dev->reada_curr_zone) { - reada_peer_zones_set_lock(dev->reada_curr_zone, 0); - kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); - dev->reada_curr_zone = NULL; - } - /* pick the zone with the most elements */ - while (1) { - struct reada_zone *zone; - - ret = radix_tree_gang_lookup(&dev->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; - if (zone->locked) { - if (zone->elems > top_locked_elems) { - top_locked_elems = zone->elems; - top_locked_zone = zone; - } - } else { - if (zone->elems > top_elems) { - top_elems = zone->elems; - top_zone = zone; - } - } - } - if (top_zone) - dev->reada_curr_zone = top_zone; - else if (top_locked_zone) - dev->reada_curr_zone = top_locked_zone; - else - return 0; - - dev->reada_next = dev->reada_curr_zone->start; - kref_get(&dev->reada_curr_zone->refcnt); - reada_peer_zones_set_lock(dev->reada_curr_zone, 1); - - return 1; -} - -static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, int level, int mirror_num, - struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); - if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); - if (ret) { - free_extent_buffer_stale(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer_stale(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - -static int reada_start_machine_dev(struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - struct reada_extent *re = NULL; - int mirror_num = 0; - struct extent_buffer *eb = NULL; - u64 logical; - int ret; - int i; - - spin_lock(&fs_info->reada_lock); - if (dev->reada_curr_zone == NULL) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - } - /* - * FIXME currently we issue the reads one extent at a time. 
If we have - * a contiguous block of extents, we could also coagulate them or use - * plugging to speed things up - */ - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - if (ret == 0 || re->logical > dev->reada_curr_zone->end) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - re = NULL; - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - } - if (ret == 0) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - dev->reada_next = re->logical + fs_info->nodesize; - re->refcnt++; - - spin_unlock(&fs_info->reada_lock); - - spin_lock(&re->lock); - if (re->scheduled || list_empty(&re->extctl)) { - spin_unlock(&re->lock); - reada_extent_put(fs_info, re); - return 0; - } - re->scheduled = 1; - spin_unlock(&re->lock); - - /* - * find mirror num - */ - for (i = 0; i < re->nzones; ++i) { - if (re->zones[i]->device == dev) { - mirror_num = i + 1; - break; - } - } - logical = re->logical; - - atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, - re->level, mirror_num, &eb); - if (ret) - __readahead_hook(fs_info, re, NULL, ret); - else if (eb) - __readahead_hook(fs_info, re, eb, ret); - - if (eb) - free_extent_buffer(eb); - - atomic_dec(&dev->reada_in_flight); - reada_extent_put(fs_info, re); - - return 1; - -} - -static void reada_start_machine_worker(struct btrfs_work *work) -{ - struct reada_machine_work *rmw; - int old_ioprio; - - rmw = container_of(work, struct reada_machine_work, work); - - old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), - task_nice_ioprio(current)); - set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(rmw->fs_info); - set_task_ioprio(current, old_ioprio); - - atomic_dec(&rmw->fs_info->reada_works_cnt); - - kfree(rmw); -} - -/* Try to start up to 10k READA requests for a group of devices */ -static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) -{ - u64 enqueued; - u64 total = 0; - struct btrfs_device *device; - - do { - enqueued = 0; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (atomic_read(&device->reada_in_flight) < - MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(device); - } - total += enqueued; - } while (enqueued && total < 10000); - - return total; -} - -static void __reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; - int i; - u64 enqueued = 0; - - mutex_lock(&fs_devices->device_list_mutex); - - enqueued += reada_start_for_fsdevs(fs_devices); - list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) - enqueued += reada_start_for_fsdevs(seed_devs); - - mutex_unlock(&fs_devices->device_list_mutex); - if (enqueued == 0) - return; - - /* - * If everything is already in the cache, this is effectively single - * threaded. To a) not hold the caller for too long and b) to utilize - * more cores, we broke the loop above after 10000 iterations and now - * enqueue to workers to finish it. This will distribute the load to - * the cores. 
- */ - for (i = 0; i < 2; ++i) { - reada_start_machine(fs_info); - if (atomic_read(&fs_info->reada_works_cnt) > - BTRFS_MAX_MIRRORS * 2) - break; - } -} - -static void reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct reada_machine_work *rmw; - - rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); - if (!rmw) { - /* FIXME we cannot handle this properly right now */ - BUG(); - } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); - rmw->fs_info = fs_info; - - btrfs_queue_work(fs_info->readahead_workers, &rmw->work); - atomic_inc(&fs_info->reada_works_cnt); -} - -#ifdef DEBUG -static void dump_devs(struct btrfs_fs_info *fs_info, int all) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - unsigned long index; - int ret; - int i; - int j; - int cnt; - - spin_lock(&fs_info->reada_lock); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid, - atomic_read(&device->reada_in_flight)); - index = 0; - while (1) { - struct reada_zone *zone; - ret = radix_tree_gang_lookup(&device->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - pr_debug(" zone %llu-%llu elems %llu locked %d devs", - zone->start, zone->end, zone->elems, - zone->locked); - for (j = 0; j < zone->ndevs; ++j) { - pr_cont(" %lld", - zone->devs[j]->devid); - } - if (device->reada_curr_zone == zone) - pr_cont(" curr off %llu", - device->reada_next - zone->start); - pr_cont("\n"); - index = (zone->end >> fs_info->sectorsize_bits) + 1; - } - cnt = 0; - index = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&device->reada_extents, - (void **)&re, index, 1); - if (ret == 0) - break; - pr_debug(" re: logical %llu size %u empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - if (++cnt > 15) - break; - } - } - - index = 0; - cnt = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, - index, 1); - if (ret == 0) - break; - if (!re->scheduled) { - index = (re->logical >> fs_info->sectorsize_bits) + 1; - continue; - } - pr_debug("re: logical %llu size %u list empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - } - spin_unlock(&fs_info->reada_lock); -} -#endif - -/* - * interface - */ -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *key_start, struct btrfs_key *key_end) -{ - struct reada_control *rc; - u64 start; - u64 generation; - int ret; - int level; - struct extent_buffer *node; - static struct btrfs_key max_key = { - .objectid = (u64)-1, - .type = (u8)-1, - .offset = (u64)-1 - }; - - rc = kzalloc(sizeof(*rc), GFP_KERNEL); - if (!rc) - return ERR_PTR(-ENOMEM); - - rc->fs_info = root->fs_info; - rc->key_start = *key_start; - rc->key_end = *key_end; - 
atomic_set(&rc->elems, 0);
-	init_waitqueue_head(&rc->wait);
-	kref_init(&rc->refcnt);
-	kref_get(&rc->refcnt); /* one ref for having elements */
-
-	node = btrfs_root_node(root);
-	start = node->start;
-	generation = btrfs_header_generation(node);
-	level = btrfs_header_level(node);
-	free_extent_buffer(node);
-
-	ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
-			      generation, level);
-	if (ret) {
-		kfree(rc);
-		return ERR_PTR(ret);
-	}
-
-	reada_start_machine(root->fs_info);
-
-	return rc;
-}
-
-#ifdef DEBUG
-int btrfs_reada_wait(void *handle)
-{
-	struct reada_control *rc = handle;
-	struct btrfs_fs_info *fs_info = rc->fs_info;
-
-	while (atomic_read(&rc->elems)) {
-		if (!atomic_read(&fs_info->reada_works_cnt))
-			reada_start_machine(fs_info);
-		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
-				   5 * HZ);
-		dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-	}
-
-	dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-
-	kref_put(&rc->refcnt, reada_control_release);
-
-	return 0;
-}
-#else
-int btrfs_reada_wait(void *handle)
-{
-	struct reada_control *rc = handle;
-	struct btrfs_fs_info *fs_info = rc->fs_info;
-
-	while (atomic_read(&rc->elems)) {
-		if (!atomic_read(&fs_info->reada_works_cnt))
-			reada_start_machine(fs_info);
-		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
-				   (HZ + 9) / 10);
-	}
-
-	kref_put(&rc->refcnt, reada_control_release);
-
-	return 0;
-}
-#endif
-
-void btrfs_reada_detach(void *handle)
-{
-	struct reada_control *rc = handle;
-
-	kref_put(&rc->refcnt, reada_control_release);
-}
-
-/*
- * Before removing a device (device replace or device remove ioctls), call this
- * function to wait for all existing readahead requests on the device and to
- * make sure no one queues more readahead requests for the device.
- *
- * Must be called while holding neither the device list mutex nor the device
- * replace semaphore, otherwise it will deadlock.
- */
-void btrfs_reada_remove_dev(struct btrfs_device *dev)
-{
-	struct btrfs_fs_info *fs_info = dev->fs_info;
-
-	/* Serialize with readahead extent creation at reada_find_extent(). */
-	spin_lock(&fs_info->reada_lock);
-	set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
-	spin_unlock(&fs_info->reada_lock);
-
-	/*
-	 * There might be readahead requests added to the radix trees which
-	 * were not yet added to the readahead work queue. We need to start
-	 * them and wait for their completion, otherwise we can end up with
-	 * use-after-free problems when dropping the last reference on the
-	 * readahead extents and their zones, as they need to access the
-	 * device structure.
-	 */
-	reada_start_machine(fs_info);
-	btrfs_flush_workqueue(fs_info->readahead_workers);
-}
-
-/*
- * If, when removing a device (device replace or device remove ioctls), an
- * error happens after calling btrfs_reada_remove_dev(), call this to undo
- * what that function did. This is safe to call even if
- * btrfs_reada_remove_dev() was not called before.
- */ -void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) -{ - spin_lock(&dev->fs_info->reada_lock); - clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); - spin_unlock(&dev->fs_info->reada_lock); -} diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e2b9f8616501..a248f46cfe72 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct extent_buffer *leaf = path->nodes[0]; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; int type, ret; @@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, /* Walk down all roots and build the ref tree, meant to be called at mount */ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *eb; int tree_block_level = 0; @@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - eb = btrfs_read_lock_root_node(fs_info->extent_root); + extent_root = btrfs_extent_root(fs_info, 0); + eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * would have had to added a ref key item which may appear on a * different leaf from the original extent item. */ - ret = walk_down_tree(fs_info->extent_root, path, level, + ret = walk_down_tree(extent_root, path, level, &bytenr, &num_bytes, &tree_block_level); if (ret) break; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index e0f93b357548..a3930da4eb3f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -439,7 +439,7 @@ process_slot: break; } next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); + size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 33a0ee7ac590..f5465197996d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -26,6 +26,7 @@ #include "misc.h" #include "subpage.h" #include "zoned.h" +#include "inode-item.h" /* * Relocation overview @@ -1736,7 +1737,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, memset(&next_key, 0, sizeof(next_key)); while (1) { - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) goto out; @@ -1855,7 +1857,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; @@ -2323,8 +2325,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * If we get an enospc just kick back -EAGAIN so we know to drop the * transaction and try to refill when we can flush all the things. 
*/ - ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_LIMIT); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; while (tmp <= rc->reserved_bytes) @@ -3149,7 +3151,7 @@ static int add_tree_block(struct reloc_control *rc, u64 owner = 0; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { @@ -3550,7 +3552,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->fs_info->nodesize * RELOCATION_RESERVED_NODES; - ret = btrfs_block_rsv_refill(rc->extent_root, + ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, rc->block_rsv, rc->block_rsv->size, BTRFS_RESERVE_FLUSH_ALL); if (ret) @@ -3598,9 +3600,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { rc->reserved_bytes = 0; - ret = btrfs_block_rsv_refill(rc->extent_root, - rc->block_rsv, rc->block_rsv->size, - BTRFS_RESERVE_FLUSH_ALL); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { err = ret; break; @@ -3858,25 +3860,14 @@ out: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started - * -EAGAIN can not start because there are ongoing send operations */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { - spin_lock(&fs_info->send_reloc_lock); - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run relocation while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - spin_unlock(&fs_info->send_reloc_lock); - return -EAGAIN; - } if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ - spin_unlock(&fs_info->send_reloc_lock); btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } - spin_unlock(&fs_info->send_reloc_lock); if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); @@ -3898,9 +3889,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); - spin_lock(&fs_info->send_reloc_lock); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); - spin_unlock(&fs_info->send_reloc_lock); atomic_set(&fs_info->reloc_cancel_req, 0); } @@ -3963,7 +3952,7 @@ static const char *stage_to_string(int stage) int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { struct btrfs_block_group *bg; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; struct btrfs_path *path; @@ -4214,7 +4203,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out_end; } - rc->extent_root = fs_info->extent_root; + rc->extent_root = btrfs_extent_root(fs_info, 0); set_reloc_control(rc); @@ -4305,6 +4294,7 @@ out: int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *csum_root; struct 
btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; int ret; @@ -4316,7 +4306,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + inode->index_cnt; - ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 12ceb14a1141..3d68d2dcd83e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, u32 len; int need_reset = 0; - len = btrfs_item_size_nr(eb, slot); + len = btrfs_item_size(eb, slot); read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), min_t(u32, len, sizeof(*item))); if (len < sizeof(*item)) @@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); - old_len = btrfs_item_size_nr(l, slot); + old_len = btrfs_item_size(l, slot); /* * If this is the first time we update the root item which originated @@ -334,7 +334,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret < 0); + if (ret < 0) + goto out; if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -501,7 +502,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf82ea6f54fb..2e9a322773f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -39,21 +39,20 @@ struct scrub_block; struct scrub_ctx; /* - * the following three values only influence the performance. + * The following three values only influence the performance. + * * The last one configures the number of parallel and outstanding I/O - * operations. The first two values configure an upper limit for the number + * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ -#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ +#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ /* - * the following value times PAGE_SIZE needs to be large enough to match the + * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. - * Values larger than BTRFS_STRIPE_LEN are not supported. 
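The redefinition of SCRUB_MAX_PAGES_PER_BLOCK just below is value-preserving rather than behavioral: btrfs caps metadata blocks at 64KiB, so with 4KiB pages the new expression still evaluates to the old hard-coded 16. A standalone check of that arithmetic (a sketch, not part of the patch; the SZ_* values are copied from linux/sizes.h so the snippet compiles on its own):

	#define SZ_4K	0x00001000
	#define SZ_64K	0x00010000
	/* btrfs caps metadata block size at 64KiB (BTRFS_MAX_METADATA_BLOCKSIZE). */
	#define BTRFS_MAX_METADATA_BLOCKSIZE	SZ_64K

	_Static_assert(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K == 16,
		       "derived value matches the old hard-coded 16 pages per block");

The same hunk also switches the have_csum/io_error bitfields from int to unsigned int: a signed one-bit field can only represent 0 and -1, so storing 1 in it is implementation-defined and equality tests against 1 are easy to get wrong.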
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ +#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; @@ -73,8 +72,8 @@ struct scrub_page { u64 physical_for_dev_replace; atomic_t refs; u8 mirror_num; - int have_csum:1; - int io_error:1; + unsigned int have_csum:1; + unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; @@ -88,11 +87,7 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; -#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO - struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; -#else - struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; -#endif + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; int page_count; int next_free; struct btrfs_work work; @@ -163,7 +158,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_rd_bio; + int pages_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -174,7 +169,6 @@ struct scrub_ctx { struct scrub_bio *wr_curr_bio; struct mutex wr_lock; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ struct btrfs_device *wr_tgtdev; bool flush_all_writes; @@ -578,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -616,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->wr_curr_bio = NULL; if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); - sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } @@ -758,7 +751,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -852,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; - if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) - return btrfs_repair_one_zone(fs_info, logical); + if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) + return 0; /* * We must use GFP_NOFS because the scrub task might be waiting for a @@ -1313,7 +1306,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); @@ -1675,7 +1668,7 @@ again: sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_wr_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -1708,7 +1701,7 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1755,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx 
*sctx = sbio->sctx; int i; - WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2101,7 +2094,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_rd_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -2135,7 +2128,7 @@ again: scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_rd_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_submit(sctx); return 0; @@ -2297,7 +2290,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); scrub_page_get(spage); sblock->pagev[index] = spage; spage->sblock = sblock; @@ -2369,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2631,7 +2624,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); /* For scrub block */ scrub_page_get(spage); sblock->pagev[index] = spage; @@ -2892,15 +2885,15 @@ static void scrub_parity_put(struct scrub_parity *sparity) static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, - struct btrfs_path *path, u64 logic_start, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); + struct btrfs_root *csum_root; struct btrfs_extent_item *extent; struct btrfs_io_context *bioc = NULL; + struct btrfs_path *path; u64 flags; int ret; int slot; @@ -2919,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + path->search_commit_root = 1; + path->skip_locking = 1; + ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); @@ -2928,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); + btrfs_free_path(path); return -ENOMEM; } @@ -3060,6 +3064,7 @@ again: extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); + csum_root = btrfs_csum_root(fs_info, extent_logical); ret = btrfs_lookup_csums_range(csum_root, extent_logical, extent_logical + extent_len - 1, @@ -3116,7 +3121,7 @@ out: scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); - btrfs_release_path(path); + btrfs_free_path(path); return ret < 0 ? 
ret : 0;
 }
@@ -3161,17 +3166,18 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
 }
 
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+					   struct btrfs_block_group *bg,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length,
-					   struct btrfs_block_group *cache)
+					   int stripe_index, u64 dev_extent_len)
 {
-	struct btrfs_path *path, *ppath;
+	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
-	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_root *root;
+	struct btrfs_root *csum_root;
 	struct btrfs_extent_item *extent;
 	struct blk_plug plug;
+	const u64 chunk_logical = bg->start;
 	u64 flags;
 	int ret;
 	int slot;
@@ -3183,10 +3189,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 physical_end;
 	u64 generation;
 	int mirror_num;
-	struct reada_control *reada1;
-	struct reada_control *reada2;
 	struct btrfs_key key;
-	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
 	u64 extent_logical;
@@ -3202,25 +3205,26 @@
 	int extent_mirror_num;
 	int stop_loop = 0;
 
-	physical = map->stripes[num].physical;
+	physical = map->stripes[stripe_index].physical;
 	offset = 0;
-	nstripes = div64_u64(length, map->stripe_len);
+	nstripes = div64_u64(dev_extent_len, map->stripe_len);
 	mirror_num = 1;
 	increment = map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		offset = map->stripe_len * num;
+		offset = map->stripe_len * stripe_index;
 		increment = map->stripe_len * map->num_stripes;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
-		offset = map->stripe_len * (num / map->sub_stripes);
+		offset = map->stripe_len * (stripe_index / map->sub_stripes);
 		increment = map->stripe_len * factor;
-		mirror_num = num % map->sub_stripes + 1;
+		mirror_num = stripe_index % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
-		mirror_num = num % map->num_stripes + 1;
+		mirror_num = stripe_index % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		mirror_num = num % map->num_stripes + 1;
+		mirror_num = stripe_index % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		get_raid56_logic_offset(physical, num, map, &offset, NULL);
+		get_raid56_logic_offset(physical, stripe_index, map, &offset,
+					NULL);
 		increment = map->stripe_len * nr_data_stripes(map);
 	}
@@ -3228,12 +3232,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
-	ppath = btrfs_alloc_path();
-	if (!ppath) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
 	/*
 	 * work on commit root. The related disk blocks are static as
 	 * long as COW is applied. This means, it is safe to rewrite
 	 * the cached path.
 	 */
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
+	path->reada = READA_FORWARD;
 
-	ppath->search_commit_root = 1;
-	ppath->skip_locking = 1;
-	/*
-	 * trigger the readahead for extent tree csum tree and wait for
-	 * completion.
During readahead, the scrub is officially paused - * to not hold off transaction commits - */ - logical = base + offset; + logical = chunk_logical + offset; physical_end = physical + nstripes * map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, num, + get_raid56_logic_offset(physical_end, stripe_index, map, &logic_end, NULL); - logic_end += base; + logic_end += chunk_logical; } else { logic_end = logical + increment * nstripes; } @@ -3262,32 +3254,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - /* FIXME it might be better to start readahead at commit root */ - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - key_end.objectid = logic_end; - key_end.type = BTRFS_METADATA_ITEM_KEY; - key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key, &key_end); - - if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); - } else { - reada2 = NULL; - } - - if (!IS_ERR(reada1)) - btrfs_reada_wait(reada1); - if (!IS_ERR_OR_NULL(reada2)) - btrfs_reada_wait(reada2); - + root = btrfs_extent_root(fs_info, logical); + csum_root = btrfs_csum_root(fs_info, logical); /* * collect all data csums for the stripe to avoid seeking during @@ -3333,16 +3301,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, map, - &logical, + ret = get_raid56_logic_offset(physical, stripe_index, + map, &logical, &stripe_logical); - logical += base; + logical += chunk_logical; if (ret) { /* it is parity strip */ - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, + stripe_logical, stripe_end); if (ret) goto out; @@ -3419,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Continuing would prevent reusing its device extents * for new block groups for a long time. 
*/ - spin_lock(&cache->lock); - if (cache->removed) { - spin_unlock(&cache->lock); + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); ret = 0; goto out; } - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); @@ -3504,16 +3472,16 @@ again: loop: physical += map->stripe_len; ret = get_raid56_logic_offset(physical, - num, map, &logical, - &stripe_logical); - logical += base; + stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; if (ret && physical < physical_end) { - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, - map, scrub_dev, ppath, + map, scrub_dev, stripe_logical, stripe_end); if (ret) @@ -3543,8 +3511,8 @@ skip: physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[num].physical + - length; + sctx->stat.last_physical = map->stripes[stripe_index].physical + + dev_extent_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3560,14 +3528,14 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); - btrfs_free_path(ppath); if (sctx->is_dev_replace && ret >= 0) { int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, base + offset, - map->stripes[num].physical, - physical_end); + ret2 = sync_write_pointer_for_zoned(sctx, + chunk_logical + offset, + map->stripes[stripe_index].physical, + physical_end); if (ret2) ret = ret2; } @@ -3576,10 +3544,10 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, struct btrfs_device *scrub_dev, - u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group *cache) + u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; @@ -3589,7 +3557,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, int ret = 0; read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, chunk_offset, 1); + em = lookup_extent_mapping(map_tree, bg->start, bg->length); read_unlock(&map_tree->lock); if (!em) { @@ -3597,26 +3565,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * Might have been an unused block group deleted by the cleaner * kthread or relocation. 
*/ - spin_lock(&cache->lock); - if (!cache->removed) + spin_lock(&bg->lock); + if (!bg->removed) ret = -EINVAL; - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); return ret; } - - map = em->map_lookup; - if (em->start != chunk_offset) + if (em->start != bg->start) goto out; - - if (em->len < length) + if (em->len < dev_extent_len) goto out; + map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, cache); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i, + dev_extent_len); if (ret) goto out; } @@ -3654,7 +3620,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; - u64 length; u64 chunk_offset; int ret = 0; int ro_set; @@ -3678,6 +3643,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, key.type = BTRFS_DEV_EXTENT_KEY; while (1) { + u64 dev_extent_len; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) break; @@ -3714,9 +3681,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - length = btrfs_dev_extent_length(l, dev_extent); + dev_extent_len = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) + if (found_key.offset + dev_extent_len <= start) goto skip; chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); @@ -3850,13 +3817,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); - dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_right = found_key.offset + dev_extent_len; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache); + ASSERT(cache->start == chunk_offset); + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); /* * flush, submit all pending read and write bios, afterwards @@ -3937,7 +3905,7 @@ skip_unfreeze: break; } skip: - key.offset = found_key.offset + length; + key.offset = found_key.offset + dev_extent_len; btrfs_release_path(path); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 040324d71118..d8ccb62aa7d2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "compression.h" #include "xattr.h" +#include "print-tree.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -98,6 +99,15 @@ struct send_ctx { struct btrfs_key *cmp_key; /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + + /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. 
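Every consumer of the last_reloc_trans field documented above follows one idiom, added further down in this diff in find_extent_clone(), full_send_tree() and btrfs_compare_trees(): sample fs_info->last_reloc_trans under the commit root semaphore and, if it advanced past the value cached in the send context, resynchronize and redo the current search. A condensed sketch of that idiom (illustrative only; sctx, path and key are the surrounding send-side variables):

	down_read(&fs_info->commit_root_sem);
	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
		/*
		 * A block group was relocated since the last check: leaves and
		 * file extent disk_bytenr values cached so far may be stale.
		 */
		sctx->last_reloc_trans = fs_info->last_reloc_trans;
		up_read(&fs_info->commit_root_sem);
		btrfs_release_path(path);
		ret = search_key_again(sctx, sctx->send_root, path, &key);
	} else {
		up_read(&fs_info->commit_root_sem);
	}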
*/ @@ -898,7 +908,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; @@ -930,12 +939,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(slot); - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*iref); } else { ptr = btrfs_item_ptr_offset(eb, slot); - total = btrfs_item_size_nr(eb, slot); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*extref); } @@ -1004,7 +1012,7 @@ out: typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx); + void *ctx); /* * Helper function to iterate the entries in ONE btrfs_dir_item. @@ -1018,7 +1026,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, { int ret = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; @@ -1030,7 +1037,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, u32 total; int slot; int num; - u8 type; /* * Start with a small buffer (1 page). If later we end up needing more @@ -1047,20 +1053,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); num = 0; while (cur < total) { name_len = btrfs_dir_name_len(eb, di); data_len = btrfs_dir_data_len(eb, di); - type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (type == BTRFS_FT_XATTR) { + if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; @@ -1110,7 +1114,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, cur += len; ret = iterate(num, &di_key, buf, name_len, buf + name_len, - data_len, type, ctx); + data_len, ctx); if (ret < 0) goto out; if (ret) { @@ -1427,6 +1431,26 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + ret = -ENOENT; + goto out; + } + up_read(&fs_info->commit_root_sem); + if (!backref_ctx.found_itself) { /* found a bug in backref code? 
*/ ret = -EIO; @@ -1692,8 +1716,7 @@ out: */ static int lookup_dir_item_inode(struct btrfs_root *root, u64 dir, const char *name, int name_len, - u64 *found_inode, - u8 *found_type) + u64 *found_inode) { int ret = 0; struct btrfs_dir_item *di; @@ -1716,7 +1739,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } *found_inode = key.objectid; - *found_type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); @@ -1839,7 +1861,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; - u8 other_type = 0; if (!sctx->parent_root) goto out; @@ -1867,7 +1888,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, } ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, - &other_inode, &other_type); + &other_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1912,7 +1933,6 @@ static int did_overwrite_ref(struct send_ctx *sctx, int ret = 0; u64 gen; u64 ow_inode; - u8 other_type; if (!sctx->parent_root) goto out; @@ -1936,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, - &ow_inode, &other_type); + &ow_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -3622,7 +3642,7 @@ static int is_ancestor(struct btrfs_root *root, key.type != BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); while (cur_offset < item_size) { u64 parent; u64 parent_gen; @@ -4651,9 +4671,8 @@ out: } static int __process_new_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *name, int name_len, const char *data, + int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4697,8 +4716,7 @@ out: static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *data, int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4743,10 +4761,8 @@ struct find_xattr_ctx { int found_data_len; }; -static int __find_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *vctx) +static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, + int name_len, const char *data, int data_len, void *vctx) { struct find_xattr_ctx *ctx = vctx; @@ -4796,7 +4812,7 @@ static int find_xattr(struct btrfs_root *root, static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4808,12 +4824,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); } else if (ret >= 0) { if (data_len != found_data_len || memcmp(data, found_data, data_len)) { ret = __process_new_xattr(num, di_key, name, name_len, - data, data_len, type, ctx); + data, data_len, ctx); } else { ret = 0; } @@ -4826,7 +4842,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char 
*data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4835,7 +4851,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); else if (ret >= 0) ret = 0; @@ -6566,7 +6582,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *)(ptr + @@ -6597,6 +6613,50 @@ static int changed_cb(struct btrfs_path *left_path, { int ret = 0; + /* + * We can not hold the commit root semaphore here. This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. + */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { @@ -6643,14 +6703,46 @@ out: return ret; } +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. 
+ */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	ASSERT(ret <= 0);
+	if (ret > 0) {
+		btrfs_print_tree(path->nodes[path->lowest_level], false);
+		btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+			  key->objectid, key->type, key->offset,
+			  (root == sctx->parent_root ? "parent" : "send"),
+			  root->root_key.objectid, path->lowest_level,
+			  path->slots[path->lowest_level]);
+		return -EUCLEAN;
+	}
+
+	return ret;
+}
+
 static int full_send_tree(struct send_ctx *sctx)
 {
 	int ret;
 	struct btrfs_root *send_root = sctx->send_root;
 	struct btrfs_key key;
+	struct btrfs_fs_info *fs_info = send_root->fs_info;
 	struct btrfs_path *path;
-	struct extent_buffer *eb;
-	int slot;
 
 	path = alloc_path_for_send();
 	if (!path)
@@ -6661,6 +6753,10 @@ static int full_send_tree(struct send_ctx *sctx)
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
+	down_read(&fs_info->commit_root_sem);
+	sctx->last_reloc_trans = fs_info->last_reloc_trans;
+	up_read(&fs_info->commit_root_sem);
+
 	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
 	if (ret < 0)
 		goto out;
@@ -6668,15 +6764,35 @@ static int full_send_tree(struct send_ctx *sctx)
 		goto out_finish;
 
 	while (1) {
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &key, slot);
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
 		ret = changed_cb(path, NULL, &key,
 				 BTRFS_COMPARE_TREE_NEW, sctx);
 		if (ret < 0)
 			goto out;
 
+		down_read(&fs_info->commit_root_sem);
+		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+			sctx->last_reloc_trans = fs_info->last_reloc_trans;
+			up_read(&fs_info->commit_root_sem);
+			/*
+			 * A transaction used for relocating a block group was
+			 * committed or is about to finish its commit. Release
+			 * our path (leaf) and restart the search, so that we
+			 * avoid operating on any file extent items that are
+			 * stale, with a disk_bytenr that reflects a pre
+			 * relocation value. This way we avoid, as much as
+			 * possible, falling back to regular writes when
+			 * checking if we can clone file ranges.
+ */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; @@ -6694,6 +6810,20 @@ out: return ret; } +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; @@ -6703,6 +6833,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_max; u64 reada_done = 0; + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) @@ -6726,6 +6858,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + return 0; } @@ -6739,8 +6875,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, path->slots[*level]++; while (path->slots[*level] >= nritems) { - if (*level == root_level) + if (*level == root_level) { + path->slots[*level] = nritems - 1; return -1; + } /* move upnext */ path->slots[*level] = 0; @@ -6772,14 +6910,20 @@ static int tree_advance(struct btrfs_path *path, } else { ret = tree_move_down(path, level, reada_min_gen); } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } + + /* + * Even if we have reached the end of a tree, ret is -1, update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + return ret; } @@ -6791,8 +6935,8 @@ static int tree_compare_item(struct btrfs_path *left_path, int len1, len2; unsigned long off1, off2; - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); if (len1 != len2) return 1; @@ -6809,6 +6953,97 @@ static int tree_compare_item(struct btrfs_path *left_path, } /* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exception are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation. 
However they + * can have file extent items with a pre relocation disk_bytenr value, so we + * restart the start from the current commit roots and clone the new leaves so + * that we get the post relocation disk_bytenr values. Not doing so, could + * make us clone the wrong data in case there are new extents using the old + * disk_bytenr that happen to be shared. + */ +static int restart_after_relocation(struct btrfs_path *left_path, + struct btrfs_path *right_path, + const struct btrfs_key *left_key, + const struct btrfs_key *right_key, + int left_level, + int right_level, + const struct send_ctx *sctx) +{ + int root_level; + int ret; + + lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); + + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + /* + * Since keys can not be added or removed to/from our roots because they + * are readonly and we do not allow deduplication to run in parallel + * (which can add, remove or change keys), the layout of the trees should + * not change. + */ + left_path->lowest_level = left_level; + ret = search_key_again(sctx, sctx->send_root, left_path, left_key); + if (ret < 0) + return ret; + + right_path->lowest_level = right_level; + ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); + if (ret < 0) + return ret; + + /* + * If the lowest level nodes are leaves, clone them so that they can be + * safely used by changed_cb() while not under the protection of the + * commit root semaphore, even if relocation and reallocation happens in + * parallel. + */ + if (left_level == 0) { + ret = replace_node_with_clone(left_path, 0); + if (ret < 0) + return ret; + } + + if (right_level == 0) { + ret = replace_node_with_clone(right_path, 0); + if (ret < 0) + return ret; + } + + /* + * Now clone the root nodes (unless they happen to be the leaves we have + * already cloned). This is to protect against concurrent snapshotting of + * the send and parent roots (see the comment at btrfs_compare_trees()). + */ + root_level = btrfs_header_level(sctx->send_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(left_path, root_level); + if (ret < 0) + return ret; + } + + root_level = btrfs_header_level(sctx->parent_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(right_path, root_level); + if (ret < 0) + return ret; + } + + return 0; +} + +/* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. * If shared tree blocks are encountered, whole subtrees are skipped, making @@ -6836,10 +7071,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, int right_root_level; int left_level; int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; @@ -6907,12 +7142,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free. 
+ */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); @@ -6920,9 +7161,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" @@ -6932,7 +7172,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -6947,11 +7186,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; + sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { - cond_resched(); + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, @@ -6960,7 +7214,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -6971,54 +7225,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; - goto out; + goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); - if (ret < 0) - goto out; advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; @@ -7032,11 +7287,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, 
right_path, &left_key, result, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; advance_right = ADVANCE; } + + if (ret < 0) + goto out; + + down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { @@ -7076,6 +7333,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, } } +out_unlock: + up_read(&fs_info->commit_root_sem); out: btrfs_free_path(left_path); btrfs_free_path(right_path); @@ -7425,21 +7684,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - spin_lock(&fs_info->send_reloc_lock); - if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { - spin_unlock(&fs_info->send_reloc_lock); - btrfs_warn_rl(fs_info, - "cannot run send because a relocation operation is in progress"); - ret = -EAGAIN; - goto out; - } - fs_info->send_in_progress++; - spin_unlock(&fs_info->send_reloc_lock); - ret = send_subvol(sctx); - spin_lock(&fs_info->send_reloc_lock); - fs_info->send_in_progress--; - spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 48d77f360a24..294242c194d8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, enum btrfs_flush_state state, bool for_preempt) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -844,6 +844,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + if (!ticket->steal) + return false; + if (global_rsv->space_info != space_info) return false; @@ -899,8 +902,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (!aborted && ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) + if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) return true; if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) @@ -1260,18 +1262,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int states_nr) { u64 to_reclaim; - int flush_state; + int flush_state = 0; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); - if (!to_reclaim) { + /* + * This is the priority reclaim path, so to_reclaim could be >0 still + * because we may have only satisfied the priority tickets and still + * left non-priority tickets on the list. We would then have + * to_reclaim but ->bytes == 0. + */ + if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - flush_state = 0; - do { + while (flush_state < states_nr) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, to_reclaim, states[flush_state], false); flush_state++; @@ -1280,23 +1287,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - } while (flush_state < states_nr); + } + + /* Attempt to steal from the global rsv if we can.
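+ * Note that steal_from_global_rsv() (changed above) now checks by itself
+ * that the ticket was created with ->steal set and that the global rsv
+ * has enough space to spare, so a false return here simply means the
+ * ticket has to fail with -ENOSPC below.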
*/ + if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + } + + /* + * We must run try_granting_tickets here because we could be a large + * ticket in front of a smaller ticket that can now be satisfied with + * the available space. + */ + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + spin_lock(&space_info->lock); + + /* We could have been granted before we got here. */ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + while (!space_info->full) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); } + + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1378,25 +1411,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, break; } - spin_lock(&space_info->lock); ret = ticket->error; - if (ticket->bytes || ticket->error) { - /* - * We were a priority ticket, so we need to delete ourselves - * from the list. Because we could have other priority tickets - * behind us that require less space, run - * btrfs_try_granting_tickets() to see if their reservations can - * now be made. - */ - if (!list_empty(&ticket->list)) { - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); - } - - if (!ret) - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); /* * Check that we can't have an error set if the reservation succeeded, @@ -1438,6 +1453,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, space_info->clamp = min(space_info->clamp + 1, 8); } +static inline bool can_steal(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_EVICT); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1511,7 +1532,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); - ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1567,7 +1588,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, /** * Try to reserve metadata bytes from the block_rsv's space * - * @root: the root we're allocating for + * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1579,22 +1600,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already.
*/ -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index cb5056472e79..d841fed73492 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1c54a2c787c..0ec09fe01be6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, new_pool_size); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f9eff3b0f77c..beb7f72d50b8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1537,6 +1537,16 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); +} +BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); + static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1572,6 +1582,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), + BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 3a4099a2bf05..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; + btrfs_global_root_delete(root); btrfs_put_root(root); } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 2a95f7224e18..51a8b075c259 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -15,7 +15,6 @@ static 
int test_btrfs_split_item(u32 sectorsize, u32 nodesize) struct btrfs_path *path = NULL; struct btrfs_root *root = NULL; struct extent_buffer *eb; - struct btrfs_item *item; char *value = "mary had a little lamb"; char *split1 = "mary had a little"; char *split2 = " lamb"; @@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.offset = 0; btrfs_setup_item_for_insert(root, path, &key, value_len); - item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); @@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split1)) { + if (btrfs_item_size(eb, 0) != strlen(split1)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 1) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split3)) { + if (btrfs_item_size(eb, 0) != strlen(split3)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split4)) { + if (btrfs_item_size(eb, 1) != strlen(split4)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(2); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 2) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index c2e72e7a8ff0..a232b15b8021 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } +#define STATE_FLAG_STR_LEN 256 + +#define PRINT_ONE_FLAG(state, dest, cur, name) \ +({ \ + if (state->state & EXTENT_##name) \ + cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \ + "%s" #name, cur == 0 ? 
"" : "|"); \ +}) + +static void extent_flag_to_str(const struct extent_state *state, char *dest) +{ + int cur = 0; + + dest[0] = 0; + PRINT_ONE_FLAG(state, dest, cur, DIRTY); + PRINT_ONE_FLAG(state, dest, cur, UPTODATE); + PRINT_ONE_FLAG(state, dest, cur, LOCKED); + PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DELALLOC); + PRINT_ONE_FLAG(state, dest, cur, DEFRAG); + PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); + PRINT_ONE_FLAG(state, dest, cur, NODATASUM); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); + PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); + PRINT_ONE_FLAG(state, dest, cur, DAMAGED); + PRINT_ONE_FLAG(state, dest, cur, NORESERVE); + PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); +} + +static void dump_extent_io_tree(const struct extent_io_tree *tree) +{ + struct rb_node *node; + char flags_str[STATE_FLAG_STR_LEN]; + + node = rb_first(&tree->state); + test_msg("io tree content:"); + while (node) { + struct extent_state *state; + + state = rb_entry(node, struct extent_state, rb_node); + extent_flag_to_str(state, flags_str); + test_msg(" start=%llu len=%llu flags=%s", state->start, + state->end + 1 - state->start, flags_str); + node = rb_next(node); + } +} + static int test_find_delalloc(u32 sectorsize) { struct inode *inode; @@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: + if (ret) + dump_extent_io_tree(tmp); clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) @@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void) ret = 0; out: + if (ret) + dump_extent_io_tree(&tree); clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8f05c1eb833f..5930cdcae5cb 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return 0; } +static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return true; +} + +static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) +{ + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = bytes_index_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + u64 offset, max_extent_size, bytes; + int ret, i; + + test_msg("running bytes index tests"); + + /* First just validate that it does everything in order. */ + offset = 0; + for (i = 0; i < 10; i++) { + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 0); + if (ret) { + test_err("couldn't add extent entry %d\n", ret); + return ret; + } + offset += bytes + sectorsize; + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps do the correct thing. 
*/ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + for (i = 0; i < 2; i++) { + offset = i * BITS_PER_BITMAP * sectorsize; + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps with different ->max_extent_size. */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + ret = test_add_free_space_entry(cache, 0, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + + offset = BITS_PER_BITMAP * sectorsize; + ret = test_add_free_space_entry(cache, offset, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap_entry"); + return ret; + } + + /* + * Now set a bunch of sectorsize extents in the first entry so its + * ->bytes is large. + */ + for (i = 2; i < 20; i += 2) { + offset = sectorsize * i; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error populating sparse bitmap %d", ret); + return ret; + } + } + + /* + * Now set a contiguous extent in the second bitmap so its + * ->max_extent_size is larger than the first bitmap's. + */ + offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding contiguous extent %d", ret); + return ret; + } + + /* + * Since we don't set ->max_extent_size unless we search, everything + * should be indexed on bytes. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (10 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, + 0, &max_extent_size); + if (offset != 0) { + test_err("found space to alloc even though we don't have enough space"); + return -EINVAL; + } + + if (max_extent_size != (2 * sectorsize)) { + test_err("got the wrong max_extent size %llu expected %llu", + max_extent_size, (unsigned long long)(2 * sectorsize)); + return -EINVAL; + } + + /* + * The search should have re-arranged the bytes index to use the + * ->max_extent_size, validate it's now what we expect it to be. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (2 * sectorsize)) { + test_err("error, the bytes index wasn't recalculated properly"); + return -EINVAL; + } + + /* Add another sectorsize to re-arrange the tree back to ->bytes.
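+ * The sparse bitmap then covers 11 * sectorsize in total, which again
+ * beats the other bitmap's 2 * sectorsize ->max_extent_size, so the
+ * checks below expect it to win back the first slot of the tree.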
*/ + offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding extent to the sparse entry %d", ret); + return ret; + } + + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (11 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + /* + * Now make sure we find our correct entry after searching that will + * result in a re-arranging of the tree. + */ + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, + 0, &max_extent_size); + if (offset != (BITS_PER_BITMAP * sectorsize)) { + test_err("error, found %llu instead of %llu for our alloc", + offset, + (unsigned long long)(BITS_PER_BITMAP * sectorsize)); + return -EINVAL; + } + + cache->free_space_ctl->op = orig_free_space_ops; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; @@ -858,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; } - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); ret = test_extents(cache); if (ret) @@ -871,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); + if (ret) + goto out; + ret = test_bytes_index(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 2c783d2f5228..13734ed43bfc 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); - root->fs_info->free_space_root = root; + root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 19ba7d5b7d8f..eee1e4459541 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) } /* We are using this root as our extent root */ - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); /* * Some of the paths we test assume we have a filled out fs_info, so we diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1c3a1189c0bd..03de89b45f27 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -162,7 +162,17 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root, *tmp; struct btrfs_caching_control *caching_ctl, *next; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. 
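+ * That makes it safe to take commit_root_sem for writing below and swap
+ * the commit roots of all dirty roots, without racing against readers
+ * such as send, which sample commit roots under the read side of this
+ * semaphore.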
+ */ + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); @@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { - WARN_ON(root == fs_info->extent_root); WARN_ON(!force && root->commit_root != root->node); /* @@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); if (ret) goto reserve_fail; if (delayed_refs_bytes) { @@ -692,7 +701,6 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->root = root; refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; @@ -1236,6 +1244,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct extent_buffer *eb; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, BTRFS_NESTING_COW); @@ -1267,9 +1281,8 @@ again: root = list_entry(next, struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); - if (root != fs_info->extent_root) - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1299,9 +1312,6 @@ again: if (!list_empty(&fs_info->dirty_cowonly_roots)) goto again; - list_add_tail(&fs_info->extent_root->dirty_list, - &trans->transaction->switch_commits); - /* Update dev-replace pointer once everything is committed */ fs_info->dev_replace.committed_cursor_left = fs_info->dev_replace.cursor_left_last_write_of_item; @@ -1327,7 +1337,8 @@ void btrfs_add_dead_root(struct btrfs_root *root) } /* - * update all the cowonly tree roots on disk + * Update each subvolume root and its relocation root, if it exists, in the tree + * of tree roots. Also free log roots if they exist. */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { @@ -1336,6 +1347,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) int i; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, @@ -1348,6 +1365,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root = gang[i]; int ret2; + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. 
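+ * This is what the assertions on log_writers and log_commit below rely
+ * on: all of those counters must already have dropped to zero for every
+ * root.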
+ */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1472,12 +1497,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return ret; } - /* - * We are going to commit transaction, see btrfs_commit_transaction() - * comment for reason locking tree_log_mutex - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) goto out; @@ -1513,8 +1532,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, "Error while writing out transaction for qgroup"); out: - mutex_unlock(&fs_info->tree_log_mutex); - /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. @@ -1578,7 +1595,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { - pending->error = btrfs_block_rsv_add(root, + pending->error = btrfs_block_rsv_add(fs_info, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); @@ -1861,50 +1878,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) return ret; } -/* - * commit transactions asynchronously. once btrfs_commit_transaction_async - * returns, any subsequent transaction will not be allowed to join. - */ -struct btrfs_async_commit { - struct btrfs_trans_handle *newtrans; - struct work_struct work; -}; - -static void do_async_commit(struct work_struct *work) -{ - struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work); - - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); - - current->journal_info = ac->newtrans; - - btrfs_commit_transaction(ac->newtrans); - kfree(ac); -} - -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_async_commit *ac; struct btrfs_transaction *cur_trans; - ac = kmalloc(sizeof(*ac), GFP_NOFS); - if (!ac) - return -ENOMEM; - - INIT_WORK(&ac->work, do_async_commit); - ac->newtrans = btrfs_join_transaction(trans->root); - if (IS_ERR(ac->newtrans)) { - int err = PTR_ERR(ac->newtrans); - kfree(ac); - return err; - } + /* Kick the transaction kthread. */ + set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); /* take transaction reference */ cur_trans = trans->transaction; @@ -1913,28 +1894,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) btrfs_end_transaction(trans); /* - * Tell lockdep we've released the freeze rwsem, since the - * async commit thread will be the one to unlock it. 
- */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_release(fs_info->sb, SB_FREEZE_FS); - - schedule_work(&ac->work); - /* * Wait for the current transaction commit to start and block * subsequent transaction joins */ wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); - if (current->journal_info == trans) - current->journal_info = NULL; - btrfs_put_transaction(cur_trans); - return 0; } - static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1986,7 +1954,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); if (current->journal_info == trans) current->journal_info = NULL; @@ -2200,6 +2168,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* + * We've started the commit, clear the flag in case we were triggered to + * do an async commit but somebody else started before the transaction + * kthread could do the work. + */ + clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; } @@ -2246,24 +2221,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) WARN_ON(cur_trans != trans->transaction); - /* btrfs_commit_tree_roots is responsible for getting the * various roots consistent with each other. Every pointer * in the tree of tree roots has to point to the most up to date * root for every subvolume and other tree. So, we have to keep * the tree logging code from jumping in and changing any * of the trees. * * At this point in the commit, there can't be any tree-log * writers, but a little lower down we drop the trans mutex * and let new people in. By holding the tree_log_mutex * from now until after the super is written, we avoid races * with the tree-log code. - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * Since the transaction is done, we can apply the pending changes @@ -2282,11 +2242,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ ret = btrfs_qgroup_account_extents(trans); if (ret < 0) - goto unlock_tree_log; + goto unlock_reloc; ret = commit_cowonly_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * The tasks which save the space cache and inode cache may also @@ -2294,7 +2254,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - goto unlock_tree_log; + goto unlock_reloc; } cur_trans = fs_info->running_transaction; @@ -2327,6 +2287,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_chunk_metadata(trans); + /* + * Before changing the transaction state to TRANS_STATE_UNBLOCKED and + * setting fs_info->running_transaction to NULL, lock tree_log_mutex so + * that no other task can start a new transaction and commit a log tree + * before we commit our superblock. Anyone trying to commit a log tree + * locks this mutex before writing the superblock.
+ */ + mutex_lock(&fs_info->tree_log_mutex); + spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; fs_info->running_transaction = NULL; @@ -2339,10 +2309,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); - /* - * reloc_mutex has been unlocked, tree_log_mutex is still held - * but we can't jump to unlock_tree_log causing double unlock - */ mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2393,7 +2359,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); btrfs_scrub_continue(fs_info); @@ -2404,8 +2370,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; -unlock_tree_log: - mutex_unlock(&fs_info->tree_log_mutex); unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); scrub_continue: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ba45065f9451..1852ed9de7fd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -135,7 +135,6 @@ struct btrfs_trans_handle { bool removing_chunk; bool reloc_reserved; bool in_fsync; - struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; }; @@ -217,7 +216,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7733e8ac0a69..72e1c942197d 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 extent_end; if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { @@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, key->offset, sectorsize); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { + if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", - btrfs_item_size_nr(leaf, slot), csumsize); + btrfs_item_size(leaf, slot), csumsize); return -EUCLEAN; } if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { u64 prev_csum_end; u32 prev_item_size; - prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_item_size = btrfs_item_size(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; if (unlikely(prev_csum_end > key->offset)) { @@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf, { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = 
btrfs_item_size(leaf, slot); u32 cur = 0; if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) @@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 type; @@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, { int num_stripes; - if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { + if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; } @@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, goto out; if (unlikely(btrfs_chunk_item_size(num_stripes) != - btrfs_item_size_nr(leaf, slot))) { + btrfs_item_size(leaf, slot))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } @@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (unlikely(ret < 0)) return ret; - if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && - btrfs_item_size_nr(leaf, slot) != + if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && + btrfs_item_size(leaf, slot) != btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", - btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_item_size(leaf, slot), sizeof(ri), btrfs_legacy_root_item_size()); return -EUCLEAN; } @@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, * And since we allow generation_v2 as 0, it will still pass the check.
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - btrfs_item_size_nr(leaf, slot)); + btrfs_item_size(leaf, slot)); /* Generation related */ if (unlikely(btrfs_root_generation(&ri) > @@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf, bool is_tree_block = false; unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, if (key->type == BTRFS_SHARED_DATA_REF_KEY) expect_item_size = sizeof(struct btrfs_shared_data_ref); - if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { + if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), expect_item_size, key->type); return -EUCLEAN; } @@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf, { struct btrfs_extent_data_ref *dref; unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); - const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + const unsigned long end = ptr + btrfs_item_size(leaf, slot); - if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { + if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } @@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf, if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ - if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { + if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; } ptr = btrfs_item_ptr_offset(leaf, slot); - end = ptr + btrfs_item_size_nr(leaf, slot); + end = ptr + btrfs_item_size(leaf, slot); while (ptr < end) { u16 namelen; @@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) if (slot == 0) item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); else - item_end_expected = btrfs_item_offset_nr(leaf, + item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { + if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %u expect %u", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), item_end_expected); return -EUCLEAN; } @@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. 
*/ - if (unlikely(btrfs_item_end_nr(leaf, slot) > + if (unlikely(btrfs_item_data_end(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 7c45d960b53c..b6cf39f4e7e4 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int next_key_ret = 0; u64 last_ret = 0; - if (root->fs_info->extent_root == root) { - /* - * there's recursion here right now in the tree locking, - * we can't defrag the extent root without deadlock - */ - goto out; - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ab33caf016f..c1ddbe800897 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,6 +20,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "inode-item.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -386,7 +387,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); /* Our caller must have done a search for the key for us. */ @@ -409,7 +410,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, if (ret == 0) { char *src_copy; char *dst_copy; - u32 dst_size = btrfs_item_size_nr(path->nodes[0], + u32 dst_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (dst_size != item_size) goto insert; @@ -503,7 +504,7 @@ insert: /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; - found_size = btrfs_item_size_nr(path->nodes[0], + found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) btrfs_truncate_item(path, item_size, 1); @@ -872,17 +873,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, + sums->bytenr); if (!ret) - ret = btrfs_del_csums(trans, - fs_info->csum_root, + ret = btrfs_del_csums(trans, csum_root, sums->bytenr, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, - fs_info->csum_root, sums); + csum_root, + sums); list_del(&sums->list); kfree(sums); } @@ -1096,7 +1101,7 @@ again: * otherwise they must be unlinked as a conflict */ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { victim_ref = (struct btrfs_inode_ref *)ptr; victim_name_len = btrfs_inode_ref_name_len(leaf, @@ -1155,7 +1160,7 @@ again: leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { @@ -1181,6 +1186,7 @@ again: parent_objectid, victim_name, victim_name_len); if (ret < 0) { + kfree(victim_name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1317,7 +1323,7 @@ again: eb = path->nodes[0]; 
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { char *name = NULL; int namelen; @@ -1503,7 +1509,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + ref_end = ref_ptr + btrfs_item_size(eb, slot); if (key->type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *r; @@ -1677,7 +1683,7 @@ static int count_inode_extrefs(struct btrfs_root *root, break; leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); cur_offset = 0; @@ -1731,7 +1737,7 @@ process_slot: key.type != BTRFS_INODE_REF_KEY) break; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + ptr_end = ptr + btrfs_item_size(path->nodes[0], path->slots[0]); while (ptr < ptr_end) { struct btrfs_inode_ref *ref; @@ -1949,6 +1955,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } +static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_path *path, + struct btrfs_dir_item *dst_di, + const struct btrfs_key *log_key, + u8 log_type, + bool exists) +{ + struct btrfs_key found_key; + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* The existing dentry points to the same inode, don't delete it. */ + if (found_key.objectid == log_key->objectid && + found_key.type == log_key->type && + found_key.offset == log_key->offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) + return 1; + + /* + * Don't drop the conflicting directory entry if the inode for the new + * entry doesn't exist. + */ + if (!exists) + return 0; + + return drop_one_dir_item(trans, path, dir, dst_di); +} + /* * take a single entry in a log directory item and replay it into * the subvolume. 
@@ -1974,14 +2008,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, { char *name; int name_len; - struct btrfs_dir_item *dst_di; - struct btrfs_key found_key; + struct btrfs_dir_item *dir_dst_di; + struct btrfs_dir_item *index_dst_di; + bool dir_dst_matches = false; + bool index_dst_matches = false; struct btrfs_key log_key; + struct btrfs_key search_key; struct inode *dir; u8 log_type; bool exists; int ret; - bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool update_size = true; bool name_added = false; dir = read_one_inode(root, key->objectid); @@ -2007,76 +2044,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, exists = (ret == 0); ret = 0; - if (key->type == BTRFS_DIR_ITEM_KEY) { - dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); - } else if (key->type == BTRFS_DIR_INDEX_KEY) { - dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, - key->offset, name, - name_len, 1); - } else { - /* Corruption */ - ret = -EINVAL; - goto out; - } - - if (IS_ERR(dst_di)) { - ret = PTR_ERR(dst_di); + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + if (IS_ERR(dir_dst_di)) { + ret = PTR_ERR(dir_dst_di); goto out; - } else if (!dst_di) { - /* we need a sequence number to insert, so we only - * do inserts for the BTRFS_DIR_INDEX_KEY types - */ - if (key->type != BTRFS_DIR_INDEX_KEY) + } else if (dir_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + dir_dst_di, &log_key, log_type, + exists); + if (ret < 0) goto out; - goto insert; + dir_dst_matches = (ret == 1); } - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); - /* the existing item matches the logged item */ - if (found_key.objectid == log_key.objectid && - found_key.type == log_key.type && - found_key.offset == log_key.offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) { - update_size = false; + btrfs_release_path(path); + + index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, key->offset, + name, name_len, 1); + if (IS_ERR(index_dst_di)) { + ret = PTR_ERR(index_dst_di); goto out; + } else if (index_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + index_dst_di, &log_key, + log_type, exists); + if (ret < 0) + goto out; + index_dst_matches = (ret == 1); } - /* - * don't drop the conflicting directory entry if the inode - * for the new entry doesn't exist - */ - if (!exists) - goto out; + btrfs_release_path(path); - ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); - if (ret) + if (dir_dst_matches && index_dst_matches) { + ret = 0; + update_size = false; goto out; - - if (key->type == BTRFS_DIR_INDEX_KEY) - goto insert; -out: - btrfs_release_path(path); - if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); - iput(dir); - if (!ret && name_added) - ret = 1; - return ret; -insert: /* * Check if the inode reference exists in the log for the given name, * inode and parent inode */ - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_REF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); if (ret < 0) { goto 
out; } else if (ret) { @@ -2086,10 +2100,10 @@ insert: goto out; } - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_EXTREF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, key->objectid, name, name_len); if (ret < 0) { goto out; @@ -2108,87 +2122,76 @@ insert: name_added = true; update_size = false; ret = 0; - goto out; + +out: + if (!ret && update_size) { + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + } + kfree(name); + iput(dir); + if (!ret && name_added) + ret = 1; + return ret; } -/* - * find all the names in a directory item and reconcile them into - * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than - * one name in a directory item, but the same code gets used for - * both directory index types - */ +/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + int ret; struct btrfs_dir_item *di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - struct btrfs_path *fixup_path = NULL; - - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - ret = replay_one_name(trans, root, path, eb, di, key); - if (ret < 0) - break; - ptr = (unsigned long)(di + 1); - ptr += name_len; - /* - * If this entry refers to a non-directory (directories can not - * have a link count > 1) and it was added in the transaction - * that was not committed, make sure we fixup the link count of - * the inode it the entry points to. Otherwise something like - * the following would result in a directory pointing to an - * inode with a wrong link that does not account for this dir - * entry: - * - * mkdir testdir - * touch testdir/foo - * touch testdir/bar - * sync - * - * ln testdir/bar testdir/bar_link - * ln testdir/foo testdir/foo_link - * xfs_io -c "fsync" testdir/bar - * - * <power failure> - * - * mount fs, log replay happens - * - * File foo would remain with a link count of 1 when it has two - * entries pointing to it in the directory testdir. This would - * make it impossible to ever delete the parent directory has - * it would result in stale dentries that can never be deleted. - */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { - struct btrfs_key di_key; + /* We only log dir index keys, which only contain a single dir item. 
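+ * (A dir index key's offset is the entry's unique index number, so at
+ * most one btrfs_dir_item fits under it, while a dir item key's offset
+ * is a hash of the name, which several names may collide on.)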
*/ + ASSERT(key->type == BTRFS_DIR_INDEX_KEY); - if (!fixup_path) { - fixup_path = btrfs_alloc_path(); - if (!fixup_path) { - ret = -ENOMEM; - break; - } - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret < 0) + return ret; - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, - di_key.objectid); - if (ret) - break; - } - ret = 0; + /* + * If this entry refers to a non-directory (directories cannot have a + * link count > 1) and it was added in the transaction that was not + * committed, make sure we fixup the link count of the inode the entry + * points to. Otherwise something like the following would result in a + * directory pointing to an inode with a wrong link count that does not + * account for this dir entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two entries + * pointing to it in the directory testdir. This would make it impossible + * to ever delete the parent directory as it would result in stale + * dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_path *fixup_path; + struct btrfs_key di_key; + + fixup_path = btrfs_alloc_path(); + if (!fixup_path) + return -ENOMEM; + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + btrfs_free_path(fixup_path); } - btrfs_free_path(fixup_path); + return ret; } @@ -2205,7 +2208,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, int key_type, + u64 dirid, u64 *start_ret, u64 *end_ret) { struct btrfs_key key; @@ -2218,7 +2221,7 @@ static noinline int find_dir_range(struct btrfs_root *root, return 1; key.objectid = dirid; - key.type = key_type; + key.type = BTRFS_DIR_LOG_INDEX_KEY; key.offset = *start_ret; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2232,7 +2235,7 @@ static noinline int find_dir_range(struct btrfs_root *root, if (ret != 0) btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto next; } @@ -2259,7 +2262,7 @@ next: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto out; } @@ -2290,95 +2293,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *eb; int slot; - u32 item_size; struct btrfs_dir_item *di; - struct btrfs_dir_item *log_di; int name_len; - unsigned long ptr; - unsigned long ptr_end; char *name; - struct inode *inode; + struct inode *inode = NULL; struct btrfs_key location; -again: + /* + * Currently we only log dir index keys. Even if we replay a log created + * by an older kernel that logged both dir index and dir item keys, all + * we need to do is process the dir index keys; we (and our caller) can + * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
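+ * (Each directory entry has both a dir item key and a dir index key in
+ * the subvolume tree, so iterating the index keys alone still visits
+ * every entry exactly once.)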
+ */ + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; - goto out; - } - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { - log_di = btrfs_lookup_dir_item(trans, log, log_path, - dir_key->objectid, - name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { - log_di = btrfs_lookup_dir_index_item(trans, log, - log_path, - dir_key->objectid, - dir_key->offset, - name, name_len, 0); - } - if (!log_di) { - btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); - btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } - ret = link_to_fixup_dir(trans, root, - path, location.objectid); - if (ret) { - kfree(name); - iput(inode); - goto out; - } + read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); - inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(inode), name, name_len); - if (!ret) - ret = btrfs_run_delayed_items(trans); - kfree(name); - iput(inode); - if (ret) - goto out; + if (log) { + struct btrfs_dir_item *log_di; - /* there might still be more names under this key - * check and repeat if required - */ - ret = btrfs_search_slot(NULL, root, dir_key, path, - 0, 0); - if (ret == 0) - goto again; + log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } else if (log_di) { + /* The dentry exists in the log, we have nothing to do. */ ret = 0; goto out; - } else if (IS_ERR(log_di)) { - kfree(name); - return PTR_ERR(log_di); } - btrfs_release_path(log_path); - kfree(name); + } - ptr = (unsigned long)(di + 1); - ptr += name_len; + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; } - ret = 0; + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + inc_nlink(inode); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len); + if (ret) + goto out; + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; + + /* + * Unlike dir item keys, dir index keys can only have one name (entry) in + * them, as there are no key collisions since each key has a unique offset + * (an index number), so we're done. 
+ */ out: btrfs_release_path(path); btrfs_release_path(log_path); + kfree(name); + iput(inode); return ret; } @@ -2421,7 +2411,7 @@ process_leaf: } di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size_nr(path->nodes[0], i); + total_size = btrfs_item_size(path->nodes[0], i); cur = 0; while (cur < total_size) { u16 name_len = btrfs_dir_name_len(path->nodes[0], di); @@ -2498,7 +2488,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, { u64 range_start; u64 range_end; - int key_type = BTRFS_DIR_LOG_ITEM_KEY; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; @@ -2506,7 +2495,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct inode *dir; dir_key.objectid = dirid; - dir_key.type = BTRFS_DIR_ITEM_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM; @@ -2520,14 +2509,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, btrfs_free_path(log_path); return 0; } -again: + range_start = 0; range_end = 0; while (1) { if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, key_type, + ret = find_dir_range(log, path, dirid, &range_start, &range_end); if (ret < 0) goto out; @@ -2554,8 +2543,10 @@ again: btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (found_key.objectid != dirid || - found_key.type != dir_key.type) - goto next_type; + found_key.type != dir_key.type) { + ret = 0; + goto out; + } if (found_key.offset > range_end) break; @@ -2574,15 +2565,7 @@ again: break; range_start = range_end + 1; } - -next_type: ret = 0; - if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { - key_type = BTRFS_DIR_LOG_INDEX_KEY; - dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(path); - goto again; - } out: btrfs_release_path(path); btrfs_free_path(log_path); @@ -2742,12 +2725,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_DIR_ITEM_KEY) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); - if (ret) - break; } + /* + * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the + * BTRFS_DIR_INDEX_KEY items which we use to derive the + * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an + * older kernel with such keys, ignore them. 
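To make "derive" concrete: at replay time the hash-keyed dir item can always be recomputed from the name carried by the logged index item, so nothing is lost by not logging it. A minimal sketch under that assumption (the real work goes through replay_one_name() and the normal directory insertion helpers; btrfs_name_hash() is the existing crc-based name hash, and the helper name here is made up):

static void sketch_derive_dir_item_key(struct btrfs_key *key, u64 dir_ino,
				       const char *name, int name_len)
{
	/* Hypothetical helper: rebuild the dir item key from the name. */
	key->objectid = dir_ino;
	key->type = BTRFS_DIR_ITEM_KEY;
	key->offset = btrfs_name_hash(name, name_len);
}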
+ */ } btrfs_free_path(path); return ret; @@ -2908,6 +2892,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, path->nodes[*level]->len); if (ret) return ret; + btrfs_redirty_list_add(trans->transaction, + next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -2988,6 +2974,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next->start, next->len); if (ret) goto out; + btrfs_redirty_list_add(trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3438,8 +3425,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); - if (trans && log->node) - btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } @@ -3549,20 +3534,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, goto out_unlock; } - di = btrfs_lookup_dir_item(trans, log, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - btrfs_release_path(path); + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. + */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); if (IS_ERR(di)) { @@ -3626,7 +3601,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - int key_type, u64 dirid, + u64 dirid, u64 first_offset, u64 last_offset) { int ret; @@ -3635,10 +3610,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.objectid = dirid; key.offset = first_offset; - if (key_type == BTRFS_DIR_ITEM_KEY) - key.type = BTRFS_DIR_LOG_ITEM_KEY; - else - key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); if (ret) return ret; @@ -3673,7 +3645,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, if (count == 1) { btrfs_item_key_to_cpu(src, &key, start_slot); - item_size = btrfs_item_size_nr(src, start_slot); + item_size = btrfs_item_size(src, start_slot); batch.keys = &key; batch.data_sizes = &item_size; batch.total_data_size = item_size; @@ -3696,7 +3668,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, const int slot = start_slot + i; btrfs_item_key_to_cpu(src, &ins_keys[i], slot); - ins_sizes[i] = btrfs_item_size_nr(src, slot); + ins_sizes[i] = btrfs_item_size(src, slot); batch.total_data_size += ins_sizes[i]; } } @@ -3730,7 +3702,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - int key_type, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; @@ -3738,24 +3709,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); const bool inode_logged_before = inode_logged(trans, inode); - u64 last_logged_key_offset; bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - if (key_type == BTRFS_DIR_ITEM_KEY) - last_logged_key_offset = inode->last_dir_item_offset; - else - 
last_logged_key_offset = inode->last_dir_index_offset; - for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; int ret; btrfs_item_key_to_cpu(src, &key, i); - if (key.objectid != ino || key.type != key_type) { + if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { last_found = true; break; } @@ -3804,7 +3769,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * we logged is in the log tree, saving time and avoiding adding * contention on the log tree. */ - if (key.offset > last_logged_key_offset) + if (key.offset > inode->last_dir_index_offset) goto add_to_batch; /* * Check if the key was already logged before. If not we can add @@ -3863,7 +3828,7 @@ add_to_batch: static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path, int key_type, + struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { @@ -3877,7 +3842,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = min_offset; ret = btrfs_search_forward(root, &min_key, path, trans->transid); @@ -3886,9 +3851,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * we didn't find anything from this transaction, see if there * is anything at all */ - if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + if (ret != 0 || min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = (u64)-1; btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); @@ -3896,7 +3862,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); return ret; } - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. 
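The "range" mentioned here is recorded with a dir-log key whose item body carries the end offset; a minimal sketch of its shape, mirroring insert_dir_log_key() elsewhere in this patch (the helper name is hypothetical):

static void sketch_dir_log_range_key(struct btrfs_key *key, u64 dirid,
				     u64 first_offset)
{
	key->objectid = dirid;
	key->type = BTRFS_DIR_LOG_INDEX_KEY; /* log is authoritative here */
	key->offset = first_offset;	/* last_offset lives in the item body */
}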
@@ -3907,18 +3873,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) + if (tmp.type == BTRFS_DIR_INDEX_KEY) first_offset = max(min_offset, tmp.offset) + 1; } goto done; } /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) { + if (tmp.type == BTRFS_DIR_INDEX_KEY) { first_offset = tmp.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], @@ -3949,8 +3915,7 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, - key_type, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); if (ret != 0) { if (ret < 0) err = ret; @@ -3971,11 +3936,12 @@ search: goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); - if (min_key.objectid != ino || min_key.type != key_type) { + if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ctx->last_dir_item_offset = min_key.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &min_key); @@ -4001,8 +3967,8 @@ done: * insert the log range keys to indicate where the log * is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, - ino, first_offset, last_offset); + ret = insert_dir_log_key(trans, log, path, ino, first_offset, + last_offset); if (ret) err = ret; } @@ -4030,35 +3996,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 min_key; u64 max_key; int ret; - int key_type = BTRFS_DIR_ITEM_KEY; /* * If this is the first time we are being logged in the current * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the values - * of the last logged key offsets. Note that we don't use the helper + * reloaded later, in which case its logged_trans is 0, reset the value + * of the last logged key offset. Note that we don't use the helper * function inode_logged() here - that is because the function returns * true after an inode eviction, assuming the worst case as it can not * know for sure if the inode was logged before. So we can not skip key * searches in the case the inode was evicted, because it may not have * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir item and index offsets - * to (u64)-1. + * transaction, so we need to reset the last dir index offset to (u64)-1. 
*/ - if (inode->logged_trans != trans->transid) { - inode->last_dir_item_offset = (u64)-1; + if (inode->logged_trans != trans->transid) inode->last_dir_index_offset = (u64)-1; - } -again: + min_key = 0; max_key = 0; - if (key_type == BTRFS_DIR_ITEM_KEY) - ctx->last_dir_item_offset = inode->last_dir_item_offset; - else - ctx->last_dir_item_offset = inode->last_dir_index_offset; + ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { - ret = log_dir_items(trans, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, ctx, min_key, &max_key); if (ret) return ret; @@ -4067,13 +4026,8 @@ again: min_key = max_key + 1; } - if (key_type == BTRFS_DIR_ITEM_KEY) { - inode->last_dir_item_offset = ctx->last_dir_item_offset; - key_type = BTRFS_DIR_INDEX_KEY; - goto again; - } else { - inode->last_dir_index_offset = ctx->last_dir_item_offset; - } + inode->last_dir_index_offset = ctx->last_dir_item_offset; + return 0; } @@ -4144,14 +4098,14 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type) { - int ret; - - do { - ret = btrfs_truncate_inode_items(trans, log_root, inode, - new_size, min_type, NULL); - } while (ret == -EAGAIN); + struct btrfs_truncate_control control = { + .new_size = new_size, + .ino = btrfs_ino(inode), + .min_type = min_type, + .skip_ref_updates = true, + }; - return ret; + return btrfs_truncate_inode_items(trans, log_root, &control); } static void fill_inode_item(struct btrfs_trans_handle *trans, @@ -4347,7 +4301,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = nr; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + ins_sizes[i] = btrfs_item_size(src, i + start_slot); batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } @@ -4391,6 +4345,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG) { + struct btrfs_root *csum_root; u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); @@ -4409,8 +4364,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, cl = dl; } - ret = btrfs_lookup_csums_range( - fs_info->csum_root, + csum_root = btrfs_csum_root(fs_info, ds); + ret = btrfs_lookup_csums_range(csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); if (ret) @@ -4462,6 +4417,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *csum_root; u64 csum_offset; u64 csum_len; u64 mod_start = em->mod_start; @@ -4542,7 +4498,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. 
*/ - ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, + csum_root = btrfs_csum_root(trans->fs_info, em->block_start); + ret = btrfs_lookup_csums_range(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -5163,7 +5120,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_path *search_path; char *name = NULL; u32 name_len = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; unsigned long ptr = btrfs_item_ptr_offset(eb, slot); @@ -5896,18 +5853,12 @@ struct btrfs_dir_list { * link_to_fixup_dir()); * * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and - * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item * has a size that doesn't match the sum of the lengths of all the logged - * names. This does not result in a problem because if a dir_item key is - * logged but its matching dir_index key is not logged, at log replay time we - * don't use it to replay the respective name (see replay_one_name()). On the - * other hand if only the dir_index key ends up being logged, the respective - * name is added to the fs/subvol tree with both the dir_item and dir_index - * keys created (see replay_one_name()). - * The directory's inode item with a wrong i_size is not a problem as well, - * since we don't use it at log replay time to set the i_size in the inode - * item of the fs/subvol tree (see overwrite_item()). + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5953,7 +5904,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, goto next_dir_inode; min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = 0; again: btrfs_release_path(path); @@ -5978,7 +5929,7 @@ process_leaf: btrfs_item_key_to_cpu(leaf, &min_key, i); if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_ITEM_KEY) + min_key.type != BTRFS_DIR_INDEX_KEY) goto next_dir_inode; di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); @@ -6090,7 +6041,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; @@ -6792,15 +6743,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * was previously logged, make sure the next log attempt on the directory * is not skipped and logs the inode again. This is because the log may * not currently be authoritative for a range including the old - * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make - * sure after a log replay we do not end up with both the new and old - * dentries around (in case the inode is a directory we would have a - * directory with two hard links and 2 inode references for different - * parents). 
The next log attempt of old_dir will happen at - * btrfs_log_all_parents(), called through btrfs_log_inode_parent() - * below, because we have previously set inode->last_unlink_trans to the - * current transaction ID, either here or at btrfs_record_unlink_dir() in - * case inode is a directory. + * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we + * do not end up with both the new and old dentries around (in case the + * inode is a directory we would have a directory with two hard links and + * 2 inode references for different parents). The next log attempt of + * old_dir will happen at btrfs_log_all_parents(), called through + * btrfs_log_inode_parent() below, because we have previously set + * inode->last_unlink_trans to the current transaction ID, either here or + * at btrfs_record_unlink_dir() in case the inode is a directory. */ if (old_dir) old_dir->logged_trans = 0; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 74023c8a783f..b458452a1aaf 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); offset = btrfs_item_ptr_offset(eb, slot); ret = -ENOENT; @@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + offset += btrfs_item_size(eb, slot) - sizeof(subid_le); } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", @@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); @@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, goto out; } - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size == sizeof(subid)) { ret = btrfs_del_item(trans, uuid_root, path); goto out; @@ -331,7 +331,7 @@ again_search_slot: goto skip; offset = btrfs_item_ptr_offset(leaf, slot); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4968535dfff0..90eb5c2830a9 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (key.objectid != btrfs_ino(inode) || key.type != key_type) break; - item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; if (copied > 0) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 61ac57bcbf1a..b07d382d53a8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,10 @@ #include "discard.h" #include "zoned.h" +#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10 | \ + BTRFS_BLOCK_GROUP_RAID56_MASK) + const struct btrfs_raid_attr 
btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -1162,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); ASSERT(list_empty(&device->dev_alloc_list)); ASSERT(list_empty(&device->post_commit_list)); - ASSERT(atomic_read(&device->reada_in_flight) == 0); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -1370,8 +1373,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bytenr_orig = btrfs_sb_offset(0); ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); - if (ret) - return ERR_PTR(ret); + if (ret) { + device = ERR_PTR(ret); + goto error_bdev_put; + } disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) { @@ -2144,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_shrink_device(device, 0); - if (!ret) - btrfs_reada_remove_dev(device); if (ret) goto error_undo; @@ -2243,7 +2246,6 @@ out: return ret; error_undo: - btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2429,21 +2431,15 @@ struct btrfs_device *btrfs_find_device_by_devspec( return device; } -/* - * does all the dirty work required for changing file system's UUID. - */ -static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) +static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct btrfs_device *device; - u64 super_flags; lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * Private copy of the seed devices, anchored at @@ -2451,7 +2447,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) - return PTR_ERR(seed_devices); + return seed_devices; /* * It's necessary to retain a copy of the original seed fs_devices in @@ -2462,7 +2458,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); - return PTR_ERR(old_devices); + return old_devices; } list_add(&old_devices->fs_list, &fs_uuids); @@ -2473,7 +2469,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - mutex_lock(&fs_devices->device_list_mutex); + return seed_devices; +} + +/* + * Splice seed devices into the sprout fs_devices. + * Generate a new fsid for the sprouted read-write filesystem. + */ +static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *seed_devices) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + /* + * We are updating the fsid, the thread leading to device_list_add() + * could race, so uuid_mutex is needed. + */ + lockdep_assert_held(&uuid_mutex); + + /* + * The threads listed below may traverse dev_list but can do that without + * device_list_mutex: + * - All device ops and balance - as we are in btrfs_exclop_start. 
+ * - Various dev_list readers - are using RCU. + * - btrfs_ioctl_fitrim() - is using RCU. + * + * For-read threads as below are using device_list_mutex: + * - Readonly scrub btrfs_scrub_dev() + * - Readonly scrub btrfs_scrub_progress() + * - btrfs_get_dev_stats() + */ + lockdep_assert_held(&fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) @@ -2489,13 +2519,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - mutex_unlock(&fs_devices->device_list_mutex); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); - - return 0; } /* @@ -2586,10 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct super_block *sb = fs_info->sb; struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; u64 orig_super_num_devices; - int seeding_dev = 0; int ret = 0; + bool seeding_dev = false; bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) @@ -2606,7 +2634,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (fs_devices->seeding) { - seeding_dev = 1; + seeding_dev = true; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); locked = true; @@ -2641,7 +2669,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; @@ -2669,18 +2697,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { btrfs_clear_sb_rdonly(sb); - ret = btrfs_prepare_sprout(fs_info); - if (ret) { + + /* GFP_KERNEL allocation must not be under device_list_mutex */ + seed_devices = btrfs_init_sprout(fs_info); + if (IS_ERR(seed_devices)) { + ret = PTR_ERR(seed_devices); btrfs_abort_transaction(trans, ret); goto error_trans; } + } + + mutex_lock(&fs_devices->device_list_mutex); + if (seeding_dev) { + btrfs_setup_sprout(fs_info, seed_devices); btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, device); } device->fs_devices = fs_devices; - mutex_lock(&fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -2742,7 +2777,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* * fs_devices now represents the newly sprouted filesystem and - * its fsid has been changed by btrfs_prepare_sprout + * its fsid has been changed by btrfs_setup_sprout().
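The split into btrfs_init_sprout() and btrfs_setup_sprout() exists so that the sleeping GFP_KERNEL allocations happen before device_list_mutex is taken. A condensed ordering sketch, with error paths and the surrounding chunk_mutex work omitted (in the real function the mutex stays held for much longer):

static int sketch_sprout_ordering(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *seed_devices;

	seed_devices = btrfs_init_sprout(fs_info);	/* may sleep, no mutex */
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	mutex_lock(&fs_devices->device_list_mutex);
	btrfs_setup_sprout(fs_info, seed_devices);	/* splice lists, new fsid */
	mutex_unlock(&fs_devices->device_list_mutex);
	return 0;
}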
*/ btrfs_sysfs_update_sprout_fsid(fs_devices); } @@ -4355,8 +4390,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + } /* * Balance can be canceled by: * @@ -4432,6 +4469,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); /* * A ro->rw remount sequence should continue with the paused balance * regardless of who pauses it, system or the user as of now, so set @@ -4500,7 +4541,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4641,7 +4682,7 @@ int btrfs_uuid_scan_kthread(void *data) eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size < sizeof(root_item)) goto skip; @@ -5502,7 +5543,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; struct btrfs_chunk *chunk; @@ -5574,7 +5614,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, } btrfs_set_stack_chunk_length(chunk, bg->length); - btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); @@ -6312,7 +6352,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + /* Only stripe based profiles need to check against stripe length.
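In other words, with the new BTRFS_BLOCK_GROUP_STRIPE_MASK only the profiles that rotate data across devices every stripe_len bytes need to cap an I/O at a stripe boundary; SINGLE, DUP and the RAID1 variants keep each copy contiguous. A hypothetical predicate to illustrate the mask:

static bool sketch_needs_stripe_cap(u64 flags)
{
	/* RAID0, RAID10, RAID5 and RAID6 rotate data across stripes. */
	return (flags & BTRFS_BLOCK_GROUP_STRIPE_MASK) != 0;
}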
*/ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { u64 max_len = stripe_len - stripe_offset; /* @@ -6935,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); - atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE, NULL); @@ -7559,6 +7597,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) fs_info->fs_devices->total_rw_bytes = 0; /* + * Lockdep complains about possible circular locking dependency between + * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores + * used for freeze protection of a fs (struct super_block.s_writers), + * which we take when starting a transaction, and extent buffers of the + * chunk tree if we call read_one_dev() while holding a lock on an + * extent buffer of the chunk tree. Since we are mounting the filesystem + * and at this point there can't be any concurrent task modifying the + * chunk tree, to keep it simple, just skip locking on the chunk tree. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + path->skip_locking = 1; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk @@ -7583,10 +7634,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } - /* - * The nodes on level 1 are not locked but we don't need to do - * that during mount time as nothing else can access the tree - */ node = path->nodes[1]; if (node) { if (last_ra_node != node->start) { @@ -7614,7 +7661,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details.
*/ - ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) @@ -7720,7 +7766,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); @@ -7798,7 +7844,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { @@ -8288,23 +8334,26 @@ out: return ret; } -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; + if (!btrfs_is_zoned(fs_info)) + return false; + /* Do not attempt to repair in degraded state */ if (btrfs_test_opt(fs_info, DEGRADED)) - return 0; + return true; cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) - return 0; + return true; spin_lock(&cache->lock); if (cache->relocating_repair) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); - return 0; + return true; } cache->relocating_repair = 1; spin_unlock(&cache->lock); @@ -8312,5 +8361,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); - return 0; + return true; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3b8130680749..005c9e2a491a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -123,13 +123,6 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_ctx; - /* readahead state */ - atomic_t reada_in_flight; - u64 reada_next; - struct reada_zone *reada_curr_zone; - struct radix_tree_root reada_zones; - struct radix_tree_root reada_extents; - /* disk I/O failure stats. 
For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ int dev_stats_valid; @@ -637,6 +630,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 2837b4c8424d..99abf41b89b9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const int slot = path->slots[0]; struct extent_buffer *leaf = path->nodes[0]; const u16 old_data_len = btrfs_dir_data_len(leaf, di); - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); const u32 data_size = sizeof(*di) + name_len + size; - struct btrfs_item *item; unsigned long data_ptr; char *ptr; @@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_extend_item(path, data_size); } - item = btrfs_item_nr(slot); ptr = btrfs_item_ptr(leaf, slot, char); - ptr += btrfs_item_size(leaf, item) - data_size; + ptr += btrfs_item_size(leaf, slot) - data_size; di = (struct btrfs_dir_item *)ptr; btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; @@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next_item; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); cur = 0; while (cur < item_size) { u16 name_len = btrfs_dir_name_len(leaf, di); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 67d932d70798..f559d517c7c4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -5,6 +5,7 @@ #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/atomic.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -213,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos, static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zno; int ret; if (!*nr_zones) return 0; @@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; + /* + * We cannot report zones beyond the zone end. So, it is OK to + * cap *nr_zones at the end of the device.
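A worked example of the cache indexing above, assuming 256 MiB zones (so zone_size_shift is 28; the values are picked for illustration, not taken from the patch): a report starting at device offset 1 GiB lands at zone number 4.

static u32 sketch_zone_number(void)
{
	const u8 zone_size_shift = 28;	/* ilog2(256 MiB), assumed */
	const u64 pos = 1ULL << 30;	/* 1 GiB device offset */

	return pos >> zone_size_shift;	/* 2^30 / 2^28 = 4 */
}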
+ */ + *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); + + for (i = 0; i < *nr_zones; i++) { + struct blk_zone *zone_info; + + zone_info = &zinfo->zone_cache[zno + i]; + if (!zone_info->len) + break; + } + + if (i == *nr_zones) { + /* Cache hit on all the zones */ + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; + } + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; + /* Populate cache */ + if (zinfo->zone_cache) + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; } @@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } @@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_get_dev_zone_info(struct btrfs_device *device) +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; @@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + device->zone_info = zone_info; + if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); @@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + /* + * Enable zone cache only for a zoned device. On a non-zoned device, we + * fill the zone info with emulated CONVENTIONAL zones, so no need to + * use the cache. 
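For reference, the two call sites of the new flag as changed elsewhere in this patch: the mount-time scan in btrfs_get_dev_zone_info_all_devices() passes true and benefits from the cache across repeated reports, while btrfs_init_new_device() passes false for a hot-added device; btrfs_free_zone_cache() later drops the cache once the mount finishes. A trivial wrapper to summarize the convention:

static int sketch_get_zone_info(struct btrfs_device *device, bool at_mount)
{
	/* Cache reported zones only for the mount-time full scan. */
	return btrfs_get_dev_zone_info(device, at_mount /* populate_cache */);
}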
+ */ + if (populate_cache && bdev_is_zoned(device->bdev)) { + zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * + zone_info->nr_zones); + if (!zone_info->zone_cache) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to allocate zone cache for %s", + rcu_str_deref(device->name)); + ret = -ENOMEM; + goto out; + } + } + /* Get zones type */ nactive = 0; while (sector < nr_sectors) { @@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) kfree(zones); - device->zone_info = zone_info; - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: - bitmap_free(zone_info->active_zones); - bitmap_free(zone_info->empty_zones); - bitmap_free(zone_info->seq_zones); - kfree(zone_info); - device->zone_info = NULL; + btrfs_destroy_dev_zone_info(device); return ret; } @@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); + vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -1104,7 +1154,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, u64 *offset_ret) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; @@ -1119,6 +1169,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, key.type = 0; key.offset = 0; + root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ if (!ret) @@ -1586,29 +1637,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (!btrfs_is_zoned(fs_info)) return true; - cache = *cache_ret; + cache = btrfs_lookup_block_group(fs_info, eb->start); + if (!cache) + return true; - if (cache && (eb->start < cache->start || - cache->start + cache->length <= eb->start)) { + if (cache->meta_write_pointer != eb->start) { btrfs_put_block_group(cache); cache = NULL; - *cache_ret = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; } - if (!cache) - cache = btrfs_lookup_block_group(fs_info, eb->start); - - if (cache) { - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } - - *cache_ret = cache; - } + *cache_ret = cache; return ret; } @@ -1860,6 +1901,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, @@ -1883,7 +1925,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) return ret; } -bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index) +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_device *device; bool ret = false; @@ -1892,8 +1934,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index return true; /* Non-single profiles are not supported yet */ - if (raid_index != BTRFS_RAID_SINGLE) - return false; + ASSERT((flags & 
BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); /* Check if there is a device with active zones left */ mutex_lock(&fs_devices->device_list_mutex); @@ -1942,6 +1983,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len ASSERT(block_group->alloc_offset == block_group->zone_capacity); ASSERT(block_group->free_space_ctl->free_space == 0); btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); map = block_group->physical_map; @@ -1973,3 +2015,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); } + +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + if (!btrfs_is_zoned(fs_info)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->zone_info) { + vfree(device->zone_info->zone_cache); + device->zone_info->zone_cache = NULL; + } + } + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e53ab7b96437..cbf016a7bb5d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -8,6 +8,7 @@ #include "volumes.h" #include "disk-io.h" #include "block-group.h" +#include "btrfs_inode.h" /* * Block groups with more than this value (percents) of unusable space will be @@ -28,6 +29,7 @@ struct btrfs_zoned_device_info { unsigned long *seq_zones; unsigned long *empty_zones; unsigned long *active_zones; + struct blk_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -35,7 +37,7 @@ struct btrfs_zoned_device_info { int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); -int btrfs_get_dev_zone_info(struct btrfs_device *device); +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); @@ -71,11 +73,11 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); -bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, - int raid_index); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i return 0; } -static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, + bool populate_cache) { return 0; } @@ -222,7 +225,7 @@ static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) } static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, - int raid_index) + u64 flags) { return true; } @@ -232,6 +235,7 @@ static inline void 
btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -350,4 +354,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->treelog_bg_lock); } +static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_lock(&inode->vfs_inode, 0); +} + +static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_unlock(&inode->vfs_inode, 0); +} + #endif diff --git a/fs/buffer.c b/fs/buffer.c index 46bc589b7a03..8e112b6bd371 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1969,34 +1969,34 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } } -int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, +int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(page, inode, 0); + head = create_page_buffers(&folio->page, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } @@ -2016,20 +2016,20 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, if (buffer_new(bh)) { clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, + folio_zero_segments(folio, to, block_end, block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; @@ -2050,14 +2050,15 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); return err; } int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page, pos, len, get_block, NULL); + return __block_write_begin_int(page_folio(page), pos, len, get_block, + NULL); } EXPORT_SYMBOL(__block_write_begin); diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 6827b40f7ddc..719faeeda168 100644 --- 
a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -19,3 +19,10 @@ config CACHEFILES_DEBUG caching on files module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/cachefiles/parameter/debug or by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_ERROR_INJECTION + bool "Provide error injection for cachefiles" + depends on CACHEFILES && SYSCTL + help + This permits error injection to be enabled in cachefiles whilst a + cache is in service. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 02fd17731769..16d811f1a2fa 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -4,15 +4,17 @@ # cachefiles-y := \ - bind.o \ + cache.o \ daemon.o \ interface.o \ io.o \ key.o \ main.o \ namei.o \ - rdwr.o \ security.o \ + volume.o \ xattr.o +cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o + obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c deleted file mode 100644 index d463d89f5db8..000000000000 --- a/fs/cachefiles/bind.c +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Bind and unbind a cache from the filesystem backing it - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/statfs.h> -#include <linux/ctype.h> -#include <linux/xattr.h> -#include "internal.h" - -static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); - -/* - * bind a directory as a cache - */ -int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) -{ - _enter("{%u,%u,%u,%u,%u,%u},%s", - cache->frun_percent, - cache->fcull_percent, - cache->fstop_percent, - cache->brun_percent, - cache->bcull_percent, - cache->bstop_percent, - args); - - /* start by checking things over */ - ASSERT(cache->fstop_percent >= 0 && - cache->fstop_percent < cache->fcull_percent && - cache->fcull_percent < cache->frun_percent && - cache->frun_percent < 100); - - ASSERT(cache->bstop_percent >= 0 && - cache->bstop_percent < cache->bcull_percent && - cache->bcull_percent < cache->brun_percent && - cache->brun_percent < 100); - - if (*args) { - pr_err("'bind' command doesn't take an argument\n"); - return -EINVAL; - } - - if (!cache->rootdirname) { - pr_err("No cache directory specified\n"); - return -EINVAL; - } - - /* don't permit already bound caches to be re-bound */ - if (test_bit(CACHEFILES_READY, &cache->flags)) { - pr_err("Cache already bound\n"); - return -EBUSY; - } - - /* make sure we have copies of the tag and dirname strings */ - if (!cache->tag) { - /* the tag string is released by the fops->release() - * function, so we don't release it on error here */ - cache->tag = kstrdup("CacheFiles", GFP_KERNEL); - if (!cache->tag) - return -ENOMEM; - } - - /* add the cache */ - return cachefiles_daemon_add_cache(cache); -} - -/* - * add a cache - */ -static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) -{ - struct cachefiles_object *fsdef; - struct path path; - struct kstatfs stats; - struct dentry *graveyard, *cachedir, *root; - const struct cred *saved_cred; - int ret; - - _enter(""); - - /* we want to work under the module's security ID */ - ret = cachefiles_get_security_ID(cache); - if (ret < 0) - 
return ret; - - cachefiles_begin_secure(cache, &saved_cred); - - /* allocate the root index object */ - ret = -ENOMEM; - - fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); - if (!fsdef) - goto error_root_object; - - ASSERTCMP(fsdef->backer, ==, NULL); - - atomic_set(&fsdef->usage, 1); - fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; - - /* look up the directory at the root of the cache */ - ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); - if (ret < 0) - goto error_open_root; - - cache->mnt = path.mnt; - root = path.dentry; - - ret = -EINVAL; - if (mnt_user_ns(path.mnt) != &init_user_ns) { - pr_warn("File cache on idmapped mounts not supported"); - goto error_unsupported; - } - - /* check parameters */ - ret = -EOPNOTSUPP; - if (d_is_negative(root) || - !d_backing_inode(root)->i_op->lookup || - !d_backing_inode(root)->i_op->mkdir || - !(d_backing_inode(root)->i_opflags & IOP_XATTR) || - !root->d_sb->s_op->statfs || - !root->d_sb->s_op->sync_fs) - goto error_unsupported; - - ret = -EROFS; - if (sb_rdonly(root->d_sb)) - goto error_unsupported; - - /* determine the security of the on-disk cache as this governs - * security ID of files we create */ - ret = cachefiles_determine_cache_security(cache, root, &saved_cred); - if (ret < 0) - goto error_unsupported; - - /* get the cache size and blocksize */ - ret = vfs_statfs(&path, &stats); - if (ret < 0) - goto error_unsupported; - - ret = -ERANGE; - if (stats.f_bsize <= 0) - goto error_unsupported; - - ret = -EOPNOTSUPP; - if (stats.f_bsize > PAGE_SIZE) - goto error_unsupported; - - cache->bsize = stats.f_bsize; - cache->bshift = 0; - if (stats.f_bsize < PAGE_SIZE) - cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); - - _debug("blksize %u (shift %u)", - cache->bsize, cache->bshift); - - _debug("size %llu, avail %llu", - (unsigned long long) stats.f_blocks, - (unsigned long long) stats.f_bavail); - - /* set up caching limits */ - do_div(stats.f_files, 100); - cache->fstop = stats.f_files * cache->fstop_percent; - cache->fcull = stats.f_files * cache->fcull_percent; - cache->frun = stats.f_files * cache->frun_percent; - - _debug("limits {%llu,%llu,%llu} files", - (unsigned long long) cache->frun, - (unsigned long long) cache->fcull, - (unsigned long long) cache->fstop); - - stats.f_blocks >>= cache->bshift; - do_div(stats.f_blocks, 100); - cache->bstop = stats.f_blocks * cache->bstop_percent; - cache->bcull = stats.f_blocks * cache->bcull_percent; - cache->brun = stats.f_blocks * cache->brun_percent; - - _debug("limits {%llu,%llu,%llu} blocks", - (unsigned long long) cache->brun, - (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop); - - /* get the cache directory and check its type */ - cachedir = cachefiles_get_directory(cache, root, "cache"); - if (IS_ERR(cachedir)) { - ret = PTR_ERR(cachedir); - goto error_unsupported; - } - - fsdef->dentry = cachedir; - fsdef->fscache.cookie = NULL; - - ret = cachefiles_check_object_type(fsdef); - if (ret < 0) - goto error_unsupported; - - /* get the graveyard directory */ - graveyard = cachefiles_get_directory(cache, root, "graveyard"); - if (IS_ERR(graveyard)) { - ret = PTR_ERR(graveyard); - goto error_unsupported; - } - - cache->graveyard = graveyard; - - /* publish the cache */ - fscache_init_cache(&cache->cache, - &cachefiles_cache_ops, - "%s", - fsdef->dentry->d_sb->s_id); - - fscache_object_init(&fsdef->fscache, &fscache_fsdef_index, - &cache->cache); - - ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); - if (ret < 0) - goto error_add_cache; - 
- /* done */ - set_bit(CACHEFILES_READY, &cache->flags); - dput(root); - - pr_info("File cache on %s registered\n", cache->cache.identifier); - - /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); - cachefiles_end_secure(cache, saved_cred); - return 0; - -error_add_cache: - dput(cache->graveyard); - cache->graveyard = NULL; -error_unsupported: - mntput(cache->mnt); - cache->mnt = NULL; - dput(fsdef->dentry); - fsdef->dentry = NULL; - dput(root); -error_open_root: - kmem_cache_free(cachefiles_object_jar, fsdef); -error_root_object: - cachefiles_end_secure(cache, saved_cred); - pr_err("Failed to register: %d\n", ret); - return ret; -} - -/* - * unbind a cache on fd release - */ -void cachefiles_daemon_unbind(struct cachefiles_cache *cache) -{ - _enter(""); - - if (test_bit(CACHEFILES_READY, &cache->flags)) { - pr_info("File cache on %s unregistering\n", - cache->cache.identifier); - - fscache_withdraw_cache(&cache->cache); - } - - dput(cache->graveyard); - mntput(cache->mnt); - - kfree(cache->rootdirname); - kfree(cache->secctx); - kfree(cache->tag); - - _leave(""); -} diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c new file mode 100644 index 000000000000..ce4d4785003c --- /dev/null +++ b/fs/cachefiles/cache.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Manage high-level VFS aspects of a cache. + * + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/namei.h> +#include "internal.h" + +/* + * Bring a cache online. + */ +int cachefiles_add_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *cache_cookie; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + cache_cookie = fscache_acquire_cache(cache->tag); + if (IS_ERR(cache_cookie)) + return PTR_ERR(cache_cookie); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + goto error_getsec; + + cachefiles_begin_secure(cache, &saved_cred); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + ret = -EINVAL; + if (is_idmapped_mnt(path.mnt)) { + pr_warn("File cache on idmapped mounts not supported"); + goto error_unsupported; + } + + /* check parameters */ + ret = -EOPNOTSUPP; + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !(d_backing_inode(root)->i_opflags & IOP_XATTR) || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs || + root->d_sb->s_blocksize > PAGE_SIZE) + goto error_unsupported; + + ret = -EROFS; + if (sb_rdonly(root->d_sb)) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = 
PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache", NULL); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + cache->store = cachedir; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard", NULL); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + cache->cache = cache_cookie; + + ret = fscache_add_cache(cache_cookie, &cachefiles_cache_ops, cache); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + pr_info("File cache on %s registered\n", cache_cookie->name); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); + cachefiles_end_secure(cache, saved_cred); + _leave(" = 0 [%px]", cache->cache); + return 0; + +error_add_cache: + cachefiles_put_directory(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + cachefiles_put_directory(cache->store); + cache->store = NULL; + mntput(cache->mnt); + cache->mnt = NULL; + dput(root); +error_open_root: + cachefiles_end_secure(cache, saved_cred); +error_getsec: + fscache_relinquish_cache(cache_cookie); + cache->cache = NULL; + pr_err("Failed to register: %d\n", ret); + return ret; +} + +/* + * See if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason) +{ + struct kstatfs stats; + u64 b_avail, b_writing; + int ret; + + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(path.dentry), ret, + cachefiles_trace_statfs_error); + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + b_avail = stats.f_bavail >> cache->bshift; + b_writing = atomic_long_read(&cache->b_writing); + if (b_avail > b_writing) + b_avail -= 
b_writing; + else + b_avail = 0; + + //_debug("avail %llu,%llu", + // (unsigned long long)stats.f_ffree, + // (unsigned long long)b_avail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (b_avail > bnr) + b_avail -= bnr; + else + b_avail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + b_avail < cache->bstop) + goto stop_and_begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + b_avail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + b_avail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +stop_and_begin_cull: + switch (reason) { + case cachefiles_has_space_for_write: + fscache_count_no_write_space(); + break; + case cachefiles_has_space_for_create: + fscache_count_no_create_space(); + break; + default: + break; + } +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Mark all the objects as being out of service and queue them all for cleanup. + */ +static void cachefiles_withdraw_objects(struct cachefiles_cache *cache) +{ + struct cachefiles_object *object; + unsigned int count = 0; + + _enter(""); + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_first_entry(&cache->object_list, + struct cachefiles_object, cache_link); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + fscache_withdraw_cookie(object->cookie); + count++; + if ((count & 63) == 0) { + spin_unlock(&cache->object_list_lock); + cond_resched(); + spin_lock(&cache->object_list_lock); + } + } + + spin_unlock(&cache->object_list_lock); + _leave(" [%u objs]", count); +} + +/* + * Withdraw volumes. + */ +static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) +{ + _enter(""); + + for (;;) { + struct cachefiles_volume *volume = NULL; + + spin_lock(&cache->object_list_lock); + if (!list_empty(&cache->volumes)) { + volume = list_first_entry(&cache->volumes, + struct cachefiles_volume, cache_link); + list_del_init(&volume->cache_link); + } + spin_unlock(&cache->object_list_lock); + if (!volume) + break; + + cachefiles_withdraw_volume(volume); + } + + _leave(""); +} + +/* + * Sync a cache to backing disk. + */ +static void cachefiles_sync_cache(struct cachefiles_cache *cache) +{ + const struct cred *saved_cred; + int ret; + + _enter("%s", cache->cache->name); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock returned error %d", + ret); +} + +/* + * Withdraw cache objects. 
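 *
 * [Editorial note, not part of the patch] The teardown performed by the
 * function below is ordering-sensitive; in outline it runs:
 *
 *	fscache_withdraw_cache(fscache);	// block new lookups
 *	cachefiles_withdraw_objects(cache);	// detach every live object
 *	fscache_wait_for_objects(fscache);	// wait for them to be cleaned up
 *	cachefiles_withdraw_volumes(cache);	// then tear down the volumes
 *	cachefiles_sync_cache(cache);		// flush the backing fs
 *	fscache_relinquish_cache(fscache);	// finally drop the cache cookie
 *
 * Objects must be gone before their volumes, and both must be gone before
 * the cache cookie is relinquished.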
+ */ +void cachefiles_withdraw_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *fscache = cache->cache; + + pr_info("File cache on %s unregistering\n", fscache->name); + + fscache_withdraw_cache(fscache); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + cachefiles_withdraw_objects(cache); + fscache_wait_for_objects(fscache); + + cachefiles_withdraw_volumes(cache); + cachefiles_sync_cache(cache); + cache->cache = NULL; + fscache_relinquish_cache(fscache); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 752c1e43416f..40a792421fc1 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* Daemon interface * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -41,6 +41,8 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bind(struct cachefiles_cache *, char *); +static void cachefiles_daemon_unbind(struct cachefiles_cache *); static unsigned long cachefiles_open; @@ -78,7 +80,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { /* - * do various checks + * Prepare a cache for caching. */ static int cachefiles_daemon_open(struct inode *inode, struct file *file) { @@ -102,9 +104,10 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) } mutex_init(&cache->daemon_mutex); - cache->active_nodes = RB_ROOT; - rwlock_init(&cache->active_lock); init_waitqueue_head(&cache->daemon_pollwq); + INIT_LIST_HEAD(&cache->volumes); + INIT_LIST_HEAD(&cache->object_list); + spin_lock_init(&cache->object_list_lock); /* set default caching limits * - limit at 1% free space and/or free files @@ -124,7 +127,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) } /* - * release a cache + * Release a cache. */ static int cachefiles_daemon_release(struct inode *inode, struct file *file) { @@ -138,8 +141,6 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) cachefiles_daemon_unbind(cache); - ASSERT(!cache->active_nodes.rb_node); - /* clean up the control file interface */ cache->cachefilesd = NULL; file->private_data = NULL; @@ -152,7 +153,7 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) } /* - * read the cache state + * Read the cache state. */ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, size_t buflen, loff_t *pos) @@ -169,7 +170,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, return 0; /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); /* summarise */ f_released = atomic_xchg(&cache->f_released, 0); @@ -206,7 +207,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, } /* - * command the cache + * Take a command from cachefilesd, parse it and act on it. 
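 *
 * [Editorial note, not part of the patch] Each command is delivered as a
 * single write() to the /dev/cachefiles device, and the daemon must keep
 * that file descriptor open, since releasing it unbinds the cache.  A
 * hypothetical minimal configuration sequence (paths and tag invented,
 * error handling omitted) might look like:
 *
 *	int fd = open("/dev/cachefiles", O_RDWR);
 *	write(fd, "dir /var/cache/fscache", 22);
 *	write(fd, "tag mycache", 11);
 *	write(fd, "brun 10%", 8);
 *	write(fd, "bcull 7%", 8);
 *	write(fd, "bstop 3%", 8);
 *	write(fd, "bind", 4);
 *
 * The handler below copies one command in, NUL-terminates it and
 * dispatches it through the cachefiles_daemon_cmds[] table.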
*/ static ssize_t cachefiles_daemon_write(struct file *file, const char __user *_data, @@ -225,7 +226,7 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (test_bit(CACHEFILES_DEAD, &cache->flags)) return -EIO; - if (datalen < 0 || datalen > PAGE_SIZE - 1) + if (datalen > PAGE_SIZE - 1) return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ @@ -284,7 +285,7 @@ found_command: } /* - * poll for culling state + * Poll for culling state * - use EPOLLOUT to indicate culling state */ static __poll_t cachefiles_daemon_poll(struct file *file, @@ -306,7 +307,7 @@ static __poll_t cachefiles_daemon_poll(struct file *file, } /* - * give a range error for cache space constraints + * Give a range error for cache space constraints * - can be tail-called */ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, @@ -318,7 +319,7 @@ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, } /* - * set the percentage of files at which to stop culling + * Set the percentage of files at which to stop culling * - command: "frun <N>%" */ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) @@ -342,7 +343,7 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to start culling + * Set the percentage of files at which to start culling * - command: "fcull <N>%" */ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) @@ -366,7 +367,7 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to stop allocating + * Set the percentage of files at which to stop allocating * - command: "fstop <N>%" */ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) @@ -382,7 +383,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (fstop < 0 || fstop >= cache->fcull_percent) + if (fstop >= cache->fcull_percent) return cachefiles_daemon_range_error(cache, args); cache->fstop_percent = fstop; @@ -390,7 +391,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop culling + * Set the percentage of blocks at which to stop culling * - command: "brun <N>%" */ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) @@ -414,7 +415,7 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to start culling + * Set the percentage of blocks at which to start culling * - command: "bcull <N>%" */ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) @@ -438,7 +439,7 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop allocating + * Set the percentage of blocks at which to stop allocating * - command: "bstop <N>%" */ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) @@ -454,7 +455,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (bstop < 0 || bstop >= cache->bcull_percent) + if (bstop >= cache->bcull_percent) return cachefiles_daemon_range_error(cache, args); cache->bstop_percent = bstop; @@ -462,7 +463,7 @@ static int cachefiles_daemon_bstop(struct 
cachefiles_cache *cache, char *args) } /* - * set the cache directory + * Set the cache directory * - command: "dir <name>" */ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) @@ -490,7 +491,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) } /* - * set the cache security context + * Set the cache security context * - command: "secctx <ctx>" */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) @@ -518,7 +519,7 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) } /* - * set the cache tag + * Set the cache tag * - command: "tag <name>" */ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) @@ -544,7 +545,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) } /* - * request a node in the cache be culled from the current working directory + * Request a node in the cache be culled from the current working directory * - command: "cull <name>" */ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) @@ -568,7 +569,6 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); if (!d_can_lookup(path.dentry)) @@ -593,7 +593,7 @@ inval: } /* - * set debugging mode + * Set debugging mode * - command: "debug <mask>" */ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) @@ -616,7 +616,7 @@ inval: } /* - * find out whether an object in the current working directory is in use or not + * Find out whether an object in the current working directory is in use or not * - command: "inuse <name>" */ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) @@ -640,7 +640,6 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); if (!d_can_lookup(path.dentry)) @@ -665,84 +664,65 @@ inval: } /* - * see if we have space for a number of pages and/or a number of files in the - * cache + * Bind a directory as a cache */ -int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr) +static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) { - struct kstatfs stats; - struct path path = { - .mnt = cache->mnt, - .dentry = cache->mnt->mnt_root, - }; - int ret; + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + if (cache->fstop_percent >= cache->fcull_percent || + cache->fcull_percent >= cache->frun_percent || + cache->frun_percent >= 100) + return -ERANGE; + + if (cache->bstop_percent >= cache->bcull_percent || + cache->bcull_percent >= cache->brun_percent || + cache->brun_percent >= 100) + return -ERANGE; - //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", - // (unsigned long long) cache->frun, - // (unsigned long long) cache->fcull, - // (unsigned long long) cache->fstop, - // (unsigned long long) cache->brun, - // (unsigned long long) cache->bcull, - // (unsigned long long) cache->bstop, - // fnr, bnr); - - /* find out how many pages of blockdev are available */ - memset(&stats, 0, sizeof(stats)); - - ret = vfs_statfs(&path, &stats); - if (ret < 0) { - if (ret == -EIO) - cachefiles_io_error(cache, "statfs failed"); - _leave(" = %d", ret); - return ret; + if 
(*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; } - stats.f_bavail >>= cache->bshift; - - //_debug("avail %llu,%llu", - // (unsigned long long) stats.f_ffree, - // (unsigned long long) stats.f_bavail); - - /* see if there is sufficient space */ - if (stats.f_ffree > fnr) - stats.f_ffree -= fnr; - else - stats.f_ffree = 0; - - if (stats.f_bavail > bnr) - stats.f_bavail -= bnr; - else - stats.f_bavail = 0; - - ret = -ENOBUFS; - if (stats.f_ffree < cache->fstop || - stats.f_bavail < cache->bstop) - goto begin_cull; - - ret = 0; - if (stats.f_ffree < cache->fcull || - stats.f_bavail < cache->bcull) - goto begin_cull; - - if (test_bit(CACHEFILES_CULLING, &cache->flags) && - stats.f_ffree >= cache->frun && - stats.f_bavail >= cache->brun && - test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) - ) { - _debug("cease culling"); - cachefiles_state_changed(cache); + if (!cache->rootdirname) { + pr_err("No cache directory specified\n"); + return -EINVAL; } - //_leave(" = 0"); - return 0; - -begin_cull: - if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { - _debug("### CULL CACHE ###"); - cachefiles_state_changed(cache); + /* Don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("Cache already bound\n"); + return -EBUSY; } - _leave(" = %d", ret); - return ret; + return cachefiles_add_cache(cache); +} + +/* + * Unbind a cache. + */ +static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) + cachefiles_withdraw_cache(cache); + + cachefiles_put_directory(cache->graveyard); + cachefiles_put_directory(cache->store); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); } diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c new file mode 100644 index 000000000000..58f8aec964e4 --- /dev/null +++ b/fs/cachefiles/error_inject.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Error injection handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/sysctl.h> +#include "internal.h" + +unsigned int cachefiles_error_injection_state; + +static struct ctl_table_header *cachefiles_sysctl; +static struct ctl_table cachefiles_sysctls[] = { + { + .procname = "error_injection", + .data = &cachefiles_error_injection_state, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + {} +}; + +static struct ctl_table cachefiles_sysctls_root[] = { + { + .procname = "cachefiles", + .mode = 0555, + .child = cachefiles_sysctls, + }, + {} +}; + +int __init cachefiles_register_error_injection(void) +{ + cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root); + if (!cachefiles_sysctl) + return -ENOMEM; + return 0; + +} + +void cachefiles_unregister_error_injection(void) +{ + unregister_sysctl_table(cachefiles_sysctl); +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index da28ac1fa225..51c968cd00a6 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -1,572 +1,445 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache interface to CacheFiles * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/file.h> +#include <linux/falloc.h> +#include <trace/events/fscache.h> #include "internal.h" -struct cachefiles_lookup_data { - struct cachefiles_xattr *auxdata; /* auxiliary data */ - char *key; /* key path */ -}; - -static int cachefiles_attr_changed(struct fscache_object *_object); +static atomic_t cachefiles_object_debug_id; /* - * allocate an object record for a cookie lookup and prepare the lookup data + * Allocate a cache object record. */ -static struct fscache_object *cachefiles_alloc_object( - struct fscache_cache *_cache, - struct fscache_cookie *cookie) +static +struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie) { - struct cachefiles_lookup_data *lookup_data; + struct fscache_volume *vcookie = cookie->volume; + struct cachefiles_volume *volume = vcookie->cache_priv; struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct cachefiles_xattr *auxdata; - unsigned keylen, auxlen; - void *buffer, *p; - char *key; - cache = container_of(_cache, struct cachefiles_cache, cache); + _enter("{%s},%x,", vcookie->key, cookie->debug_id); - _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id); - - lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); - if (!lookup_data) - goto nomem_lookup_data; - - /* create a new object record and a temporary leaf image */ - object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp); + object = kmem_cache_zalloc(cachefiles_object_jar, GFP_KERNEL); if (!object) - goto nomem_object; - - ASSERTCMP(object->backer, ==, NULL); + return NULL; - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - atomic_set(&object->usage, 1); + refcount_set(&object->ref, 1); - fscache_object_init(&object->fscache, cookie, &cache->cache); + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + object->volume = volume; + object->debug_id = atomic_inc_return(&cachefiles_object_debug_id); + object->cookie = fscache_get_cookie(cookie, fscache_cookie_get_attach_object); - object->type = cookie->def->type; - - /* get hold of the raw key - * - stick the length on the front and leave space on the back for the - * encoder - */ - buffer = kmalloc((2 + 512) + 3, cachefiles_gfp); - if (!buffer) - goto nomem_buffer; - - keylen = cookie->key_len; - if (keylen <= sizeof(cookie->inline_key)) - p = cookie->inline_key; - else - p = cookie->key; - memcpy(buffer + 2, p, keylen); - - *(uint16_t *)buffer = keylen; - ((char *)buffer)[keylen + 2] = 0; - ((char *)buffer)[keylen + 3] = 0; - ((char *)buffer)[keylen + 4] = 0; - - /* turn the raw key into something that can work with as a filename */ - key = cachefiles_cook_key(buffer, keylen + 2, object->type); - if (!key) - goto nomem_key; - - /* get hold of the auxiliary data and prepend the object type */ - auxdata = buffer; - auxlen = cookie->aux_len; - if (auxlen) { - if (auxlen <= sizeof(cookie->inline_aux)) - p = cookie->inline_aux; - else - p = cookie->aux; - memcpy(auxdata->data, p, auxlen); - } - - auxdata->len = auxlen + 1; - auxdata->type = cookie->type; - - lookup_data->auxdata = auxdata; - lookup_data->key = key; - object->lookup_data = lookup_data; - - _leave(" = %x [%p]", object->fscache.debug_id, lookup_data); - return &object->fscache; - -nomem_key: - kfree(buffer); -nomem_buffer: - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - kmem_cache_free(cachefiles_object_jar, object); - 
fscache_object_destroyed(&cache->cache); -nomem_object: - kfree(lookup_data); -nomem_lookup_data: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + fscache_count_object(vcookie->cache); + trace_cachefiles_ref(object->debug_id, cookie->debug_id, 1, + cachefiles_obj_new); + return object; } /* - * attempt to look up the nominated node in this cache - * - return -ETIMEDOUT to be scheduled again + * Note that an object has been seen. */ -static int cachefiles_lookup_object(struct fscache_object *_object) +void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_lookup_data *lookup_data; - struct cachefiles_object *parent, *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - int ret; - - _enter("{OBJ%x}", _object->debug_id); - - cache = container_of(_object->cache, struct cachefiles_cache, cache); - parent = container_of(_object->parent, - struct cachefiles_object, fscache); - object = container_of(_object, struct cachefiles_object, fscache); - lookup_data = object->lookup_data; - - ASSERTCMP(lookup_data, !=, NULL); - - /* look up the key, creating any missing bits */ - cachefiles_begin_secure(cache, &saved_cred); - ret = cachefiles_walk_to_object(parent, object, - lookup_data->key, - lookup_data->auxdata); - cachefiles_end_secure(cache, saved_cred); - - /* polish off by setting the attributes of non-index files */ - if (ret == 0 && - object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) - cachefiles_attr_changed(&object->fscache); - - if (ret < 0 && ret != -ETIMEDOUT) { - if (ret != -ENOBUFS) - pr_warn("Lookup failed error %d\n", ret); - fscache_object_lookup_error(&object->fscache); - } - - _leave(" [%d]", ret); - return ret; + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, + refcount_read(&object->ref), why); } /* - * indication of lookup completion + * Increment the usage count on an object; */ -static void cachefiles_lookup_complete(struct fscache_object *_object) +struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object; - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + int r; - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; - } + __refcount_inc(&object->ref, &r); + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, r, why); + return object; } /* - * increment the usage count on an inode object (may fail if unmounting) + * dispose of a reference to an object */ -static -struct fscache_object *cachefiles_grab_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object = - container_of(_object, struct cachefiles_object, fscache); - int u; + unsigned int object_debug_id = object->debug_id; + unsigned int cookie_debug_id = object->cookie->debug_id; + struct fscache_cache *cache; + bool done; + int r; + + done = __refcount_dec_and_test(&object->ref, &r); + trace_cachefiles_ref(object_debug_id, cookie_debug_id, r, why); + if (done) { + _debug("- kill object OBJ%x", object_debug_id); + + ASSERTCMP(object->file, ==, NULL); - _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + 
kfree(object->d_name); -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + cache = object->volume->cache->cache; + fscache_put_cookie(object->cookie, fscache_cookie_put_object); + object->cookie = NULL; + kmem_cache_free(cachefiles_object_jar, object); + fscache_uncount_object(cache); + } - u = atomic_inc_return(&object->usage); - trace_cachefiles_ref(object, _object->cookie, - (enum cachefiles_obj_ref_trace)why, u); - return &object->fscache; + _leave(""); } /* - * update the auxiliary data for an object object on disk + * Adjust the size of a cache file if necessary to match the DIO size. We keep + * the EOF marker a multiple of DIO blocks so that we don't fall back to doing + * non-DIO for a partial block straddling the EOF, but we also have to be + * careful of someone expanding the file and accidentally accreting the + * padding. */ -static void cachefiles_update_object(struct fscache_object *_object) +static int cachefiles_adjust_size(struct cachefiles_object *object) { - struct cachefiles_object *object; - struct cachefiles_xattr *auxdata; - struct cachefiles_cache *cache; - struct fscache_cookie *cookie; - const struct cred *saved_cred; - const void *aux; - unsigned auxlen; + struct iattr newattrs; + struct file *file = object->file; + uint64_t ni_size; + loff_t oi_size; + int ret; - _enter("{OBJ%x}", _object->debug_id); + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, struct cachefiles_cache, - cache); + _enter("{OBJ%x},[%llu]", + object->debug_id, (unsigned long long) ni_size); - if (!fscache_use_cookie(_object)) { - _leave(" [relinq]"); - return; - } + if (!file) + return -ENOBUFS; - cookie = object->fscache.cookie; - auxlen = cookie->aux_len; + oi_size = i_size_read(file_inode(file)); + if (oi_size == ni_size) + return 0; - if (!auxlen) { - fscache_unuse_cookie(_object); - _leave(" [no aux]"); - return; - } + inode_lock(file_inode(file)); - auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp); - if (!auxdata) { - fscache_unuse_cookie(_object); - _leave(" [nomem]"); - return; + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = notify_change(&init_user_ns, file->f_path.dentry, + &newattrs, NULL); + if (ret < 0) + goto truncate_failed; } - aux = (auxlen <= sizeof(cookie->inline_aux)) ? 
- cookie->inline_aux : cookie->aux; + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = notify_change(&init_user_ns, file->f_path.dentry, + &newattrs, NULL); - memcpy(auxdata->data, aux, auxlen); - fscache_unuse_cookie(_object); +truncate_failed: + inode_unlock(file_inode(file)); - auxdata->len = auxlen + 1; - auxdata->type = cookie->type; + if (ret < 0) + trace_cachefiles_io_error(NULL, file_inode(file), ret, + cachefiles_trace_notify_change_error); + if (ret == -EIO) { + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_update_object_xattr(object, auxdata); - cachefiles_end_secure(cache, saved_cred); - kfree(auxdata); - _leave(""); + _leave(" = %d", ret); + return ret; } /* - * discard the resources pinned by an object and effect retirement if - * requested + * Attempt to look up the nominated node in this cache */ -static void cachefiles_drop_object(struct fscache_object *_object) +static bool cachefiles_lookup_cookie(struct fscache_cookie *cookie) { struct cachefiles_object *object; - struct cachefiles_cache *cache; + struct cachefiles_cache *cache = cookie->volume->cache->cache_priv; const struct cred *saved_cred; - struct inode *inode; - blkcnt_t i_blocks = 0; + bool success; - ASSERT(_object); + object = cachefiles_alloc_object(cookie); + if (!object) + goto fail; - object = container_of(_object, struct cachefiles_object, fscache); + _enter("{OBJ%x}", object->debug_id); - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); + if (!cachefiles_cook_key(object)) + goto fail_put; - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + cookie->cache_priv = object; -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + cachefiles_begin_secure(cache, &saved_cred); - /* We need to tidy the object up if we did in fact manage to open it. - * It's possible for us to get here before the object is fully - * initialised if the parent goes away or the object gets retired - * before we set it up. 
- */ - if (object->dentry) { - /* delete retired objects */ - if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && - _object != cache->cache.fsdef - ) { - _debug("- retire object OBJ%x", object->fscache.debug_id); - inode = d_backing_inode(object->dentry); - if (inode) - i_blocks = inode->i_blocks; - - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_delete_object(cache, object); - cachefiles_end_secure(cache, saved_cred); - } + success = cachefiles_look_up_object(object); + if (!success) + goto fail_withdraw; - /* close the filesystem stuff attached to the object */ - if (object->backer != object->dentry) - dput(object->backer); - object->backer = NULL; - } + cachefiles_see_object(object, cachefiles_obj_see_lookup_cookie); + + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + cachefiles_adjust_size(object); - /* note that the object is now inactive */ - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object, i_blocks); + cachefiles_end_secure(cache, saved_cred); + _leave(" = t"); + return true; - dput(object->dentry); - object->dentry = NULL; +fail_withdraw: + cachefiles_end_secure(cache, saved_cred); + cachefiles_see_object(object, cachefiles_obj_see_lookup_failed); + fscache_caching_failed(cookie); + _debug("failed c=%08x o=%08x", cookie->debug_id, object->debug_id); + /* The caller holds an access count on the cookie, so we need them to + * drop it before we can withdraw the object. + */ + return false; - _leave(""); +fail_put: + cachefiles_put_object(object, cachefiles_obj_put_alloc_fail); +fail: + return false; } /* - * dispose of a reference to an object + * Shorten the backing object to discard any dirty data and free up + * any unused granules. 
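 *
 * [Editorial note, not part of the patch] The backing file's EOF is kept
 * rounded up to CACHEFILES_DIO_BLOCK_SIZE so that direct I/O never has to
 * cope with a partial trailing block.  Illustratively, shrinking to a
 * logical size of 5000 bytes:
 *
 *	dio_size = round_up(5000, CACHEFILES_DIO_BLOCK_SIZE);	// 8192
 *	// truncate the file to 8192, then zero the pad [5000, 8192)
 *	// so that stale data beyond the logical EOF cannot be re-read
 *
 * which is what the vfs_truncate() and FALLOC_FL_ZERO_RANGE fallocate
 * calls in the function below implement.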
*/ -void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +static bool cachefiles_shorten_object(struct cachefiles_object *object, + struct file *file, loff_t new_size) { - struct cachefiles_object *object; - struct fscache_cache *cache; - int u; - - ASSERT(_object); - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); - -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif - - ASSERTIFCMP(object->fscache.parent, - object->fscache.parent->n_children, >, 0); - - u = atomic_dec_return(&object->usage); - trace_cachefiles_ref(object, _object->cookie, - (enum cachefiles_obj_ref_trace)why, u); - ASSERTCMP(u, !=, -1); - if (u == 0) { - _debug("- kill object OBJ%x", object->fscache.debug_id); + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); + loff_t i_size, dio_size; + int ret; - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - ASSERTCMP(object->fscache.parent, ==, NULL); - ASSERTCMP(object->backer, ==, NULL); - ASSERTCMP(object->dentry, ==, NULL); - ASSERTCMP(object->fscache.n_ops, ==, 0); - ASSERTCMP(object->fscache.n_children, ==, 0); + dio_size = round_up(new_size, CACHEFILES_DIO_BLOCK_SIZE); + i_size = i_size_read(inode); + + trace_cachefiles_trunc(object, inode, i_size, dio_size, + cachefiles_trunc_shrink); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_truncate(&file->f_path, dio_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_trunc_error); + cachefiles_io_error_obj(object, "Trunc-to-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; + } - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; + if (new_size < dio_size) { + trace_cachefiles_trunc(object, inode, dio_size, new_size, + cachefiles_trunc_dio_adjust); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, + new_size, dio_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, "Trunc-to-dio-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; } - - cache = object->fscache.cache; - fscache_object_destroy(&object->fscache); - kmem_cache_free(cachefiles_object_jar, object); - fscache_object_destroyed(cache); } - _leave(""); + return true; } /* - * sync a cache + * Resize the backing object. 
 */
-static void cachefiles_sync_cache(struct fscache_cache *_cache)
+static void cachefiles_resize_cookie(struct netfs_cache_resources *cres,
+				     loff_t new_size)
 {
-	struct cachefiles_cache *cache;
+	struct cachefiles_object *object = cachefiles_cres_object(cres);
+	struct cachefiles_cache *cache = object->volume->cache;
+	struct fscache_cookie *cookie = object->cookie;
 	const struct cred *saved_cred;
-	int ret;
+	struct file *file = cachefiles_cres_file(cres);
+	loff_t old_size = cookie->object_size;
 
-	_enter("%s", _cache->tag->name);
+	_enter("%llu->%llu", old_size, new_size);
 
-	cache = container_of(_cache, struct cachefiles_cache, cache);
-
-	/* make sure all pages pinned by operations on behalf of the netfs are
-	 * written to disc */
-	cachefiles_begin_secure(cache, &saved_cred);
-	down_read(&cache->mnt->mnt_sb->s_umount);
-	ret = sync_filesystem(cache->mnt->mnt_sb);
-	up_read(&cache->mnt->mnt_sb->s_umount);
-	cachefiles_end_secure(cache, saved_cred);
+	if (new_size < old_size) {
+		cachefiles_begin_secure(cache, &saved_cred);
+		cachefiles_shorten_object(object, file, new_size);
+		cachefiles_end_secure(cache, saved_cred);
+		object->cookie->object_size = new_size;
+		return;
+	}
 
-	if (ret == -EIO)
-		cachefiles_io_error(cache,
-				    "Attempt to sync backing fs superblock"
-				    " returned error %d",
-				    ret);
+	/* The file is being expanded.  We don't need to do anything in
+	 * particular.  cookie->initial_size doesn't change and so the point
+	 * before which we have to download doesn't change.
+	 */
+	cookie->object_size = new_size;
 }
 
 /*
- * check if the backing cache is updated to FS-Cache
- * - called by FS-Cache when evaluates if need to invalidate the cache
+ * Commit changes to the object as we drop it.
 */
-static int cachefiles_check_consistency(struct fscache_operation *op)
+static void cachefiles_commit_object(struct cachefiles_object *object,
+				     struct cachefiles_cache *cache)
 {
-	struct cachefiles_object *object;
-	struct cachefiles_cache *cache;
-	const struct cred *saved_cred;
-	int ret;
+	bool update = false;
 
-	_enter("{OBJ%x}", op->object->debug_id);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags))
+		update = true;
+	if (test_and_clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags))
+		update = true;
+	if (update)
+		cachefiles_set_object_xattr(object);
 
-	object = container_of(op->object, struct cachefiles_object, fscache);
-	cache = container_of(object->fscache.cache,
-			     struct cachefiles_cache, cache);
+	if (test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags))
+		cachefiles_commit_tmpfile(cache, object);
+}
 
-	cachefiles_begin_secure(cache, &saved_cred);
-	ret = cachefiles_check_auxdata(object);
-	cachefiles_end_secure(cache, saved_cred);
+/*
+ * Finalise an object and close the VFS structs that we have.
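 *
 * [Editorial note, not part of the patch] Disposal takes one of three
 * paths, roughly (mirroring the function below):
 *
 *	if (retired) {
 *		if (!using_tmpfile)
 *			cachefiles_delete_object(object, FSCACHE_OBJECT_WAS_RETIRED);
 *		// else: the unlinked tmpfile dies with its last reference
 *	} else {
 *		cachefiles_commit_object(object, cache);	// write xattrs,
 *	}						// link tmpfile in
 *
 * after which the backing inode's in-use mark is cleared and the file
 * struct is dropped with fput().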
+ */ +static void cachefiles_clean_up_object(struct cachefiles_object *object, + struct cachefiles_cache *cache) +{ + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_see_object(object, cachefiles_obj_see_clean_delete); + _debug("- inval object OBJ%x", object->debug_id); + cachefiles_delete_object(object, FSCACHE_OBJECT_WAS_RETIRED); + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_drop_tmp); + _debug("- inval object OBJ%x tmpfile", object->debug_id); + } + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_commit); + cachefiles_commit_object(object, cache); + } - _leave(" = %d", ret); - return ret; + cachefiles_unmark_inode_in_use(object, object->file); + if (object->file) { + fput(object->file); + object->file = NULL; + } } /* - * notification the attributes on an object have changed - * - called with reads/writes excluded by FS-Cache + * Withdraw caching for a cookie. */ -static int cachefiles_attr_changed(struct fscache_object *_object) +static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; const struct cred *saved_cred; - struct iattr newattrs; - uint64_t ni_size; - loff_t oi_size; - int ret; - - ni_size = _object->store_limit_l; - - _enter("{OBJ%x},[%llu]", - _object->debug_id, (unsigned long long) ni_size); - - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - if (ni_size == object->i_size) - return 0; - - if (!object->backer) - return -ENOBUFS; - ASSERT(d_is_reg(object->backer)); + _enter("o=%x", object->debug_id); + cachefiles_see_object(object, cachefiles_obj_see_withdraw_cookie); - fscache_set_store_limit(&object->fscache, ni_size); - - oi_size = i_size_read(d_backing_inode(object->backer)); - if (oi_size == ni_size) - return 0; - - cachefiles_begin_secure(cache, &saved_cred); - inode_lock(d_inode(object->backer)); - - /* if there's an extension to a partial page at the end of the backing - * file, we need to discard the partial page so that we pick up new - * data after it */ - if (oi_size & ~PAGE_MASK && ni_size > oi_size) { - _debug("discard tail %llx", oi_size); - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); - if (ret < 0) - goto truncate_failed; + if (!list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); } - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = ni_size; - ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); - -truncate_failed: - inode_unlock(d_inode(object->backer)); - cachefiles_end_secure(cache, saved_cred); - - if (ret == -EIO) { - fscache_set_store_limit(&object->fscache, 0); - cachefiles_io_error_obj(object, "Size set failed"); - ret = -ENOBUFS; + if (object->file) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_clean_up_object(object, cache); + cachefiles_end_secure(cache, saved_cred); } - _leave(" = %d", ret); - return ret; + cookie->cache_priv = NULL; + cachefiles_put_object(object, cachefiles_obj_put_detach); } /* - * Invalidate an 
object + * Invalidate the storage associated with a cookie. */ -static void cachefiles_invalidate_object(struct fscache_operation *op) +static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - struct path path; - uint64_t ni_size; - int ret; + struct cachefiles_object *object = cookie->cache_priv; + struct file *new_file, *old_file; + bool old_tmpfile; - object = container_of(op->object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + _enter("o=%x,[%llu]", object->debug_id, object->cookie->object_size); - ni_size = op->object->store_limit_l; + old_tmpfile = test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); - _enter("{OBJ%x},[%llu]", - op->object->debug_id, (unsigned long long)ni_size); + if (!object->file) { + fscache_resume_after_invalidation(cookie); + _leave(" = t [light]"); + return true; + } - if (object->backer) { - ASSERT(d_is_reg(object->backer)); + new_file = cachefiles_create_tmpfile(object); + if (IS_ERR(new_file)) + goto failed; - fscache_set_store_limit(&object->fscache, ni_size); + /* Substitute the VFS target */ + _debug("sub"); + spin_lock(&object->lock); - path.dentry = object->backer; - path.mnt = cache->mnt; + old_file = object->file; + object->file = new_file; + object->content_info = CACHEFILES_CONTENT_NO_DATA; + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); - cachefiles_begin_secure(cache, &saved_cred); - ret = vfs_truncate(&path, 0); - if (ret == 0) - ret = vfs_truncate(&path, ni_size); - cachefiles_end_secure(cache, saved_cred); + spin_unlock(&object->lock); + _debug("subbed"); + + /* Allow I/O to take place again */ + fscache_resume_after_invalidation(cookie); + + if (old_file) { + if (!old_tmpfile) { + struct cachefiles_volume *volume = object->volume; + struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; - if (ret != 0) { - fscache_set_store_limit(&object->fscache, 0); - if (ret == -EIO) - cachefiles_io_error_obj(object, - "Invalidate failed"); + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + cachefiles_bury_object(volume->cache, object, fan, + old_file->f_path.dentry, + FSCACHE_OBJECT_INVALIDATED); } + fput(old_file); } - fscache_op_complete(op, true); - _leave(""); -} + _leave(" = t"); + return true; -/* - * dissociate a cache from all the pages it was backing - */ -static void cachefiles_dissociate_pages(struct fscache_cache *cache) -{ - _enter(""); +failed: + _leave(" = f"); + return false; } const struct fscache_cache_ops cachefiles_cache_ops = { .name = "cachefiles", - .alloc_object = cachefiles_alloc_object, - .lookup_object = cachefiles_lookup_object, - .lookup_complete = cachefiles_lookup_complete, - .grab_object = cachefiles_grab_object, - .update_object = cachefiles_update_object, - .invalidate_object = cachefiles_invalidate_object, - .drop_object = cachefiles_drop_object, - .put_object = cachefiles_put_object, - .sync_cache = cachefiles_sync_cache, - .attr_changed = cachefiles_attr_changed, - .read_or_alloc_page = cachefiles_read_or_alloc_page, - .read_or_alloc_pages = cachefiles_read_or_alloc_pages, - .allocate_page = cachefiles_allocate_page, - .allocate_pages = cachefiles_allocate_pages, - .write_page = cachefiles_write_page, - .uncache_page = cachefiles_uncache_page, - .dissociate_pages = cachefiles_dissociate_pages, - .check_consistency = 
cachefiles_check_consistency, - .begin_read_operation = cachefiles_begin_read_operation, + .acquire_volume = cachefiles_acquire_volume, + .free_volume = cachefiles_free_volume, + .lookup_cookie = cachefiles_lookup_cookie, + .withdraw_cookie = cachefiles_withdraw_cookie, + .invalidate_cookie = cachefiles_invalidate_cookie, + .begin_operation = cachefiles_begin_operation, + .resize_cookie = cachefiles_resize_cookie, + .prepare_to_write = cachefiles_prepare_to_write, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 0a511c36dab8..8dd54d9375b6 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* General netfs cache on cache files internal defs * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -13,58 +13,72 @@ #include <linux/fscache-cache.h> -#include <linux/timer.h> -#include <linux/wait_bit.h> #include <linux/cred.h> -#include <linux/workqueue.h> #include <linux/security.h> +#define CACHEFILES_DIO_BLOCK_SIZE 4096 + struct cachefiles_cache; struct cachefiles_object; -extern unsigned cachefiles_debug; -#define CACHEFILES_DEBUG_KENTER 1 -#define CACHEFILES_DEBUG_KLEAVE 2 -#define CACHEFILES_DEBUG_KDEBUG 4 +enum cachefiles_content { + /* These values are saved on disk */ + CACHEFILES_CONTENT_NO_DATA = 0, /* No content stored */ + CACHEFILES_CONTENT_SINGLE = 1, /* Content is monolithic, all is present */ + CACHEFILES_CONTENT_ALL = 2, /* Content is all present, no map */ + CACHEFILES_CONTENT_BACKFS_MAP = 3, /* Content is piecemeal, mapped through backing fs */ + CACHEFILES_CONTENT_DIRTY = 4, /* Content is dirty (only seen on disk) */ + nr__cachefiles_content +}; -#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) +/* + * Cached volume representation. + */ +struct cachefiles_volume { + struct cachefiles_cache *cache; + struct list_head cache_link; /* Link in cache->volumes */ + struct fscache_volume *vcookie; /* The netfs's representation */ + struct dentry *dentry; /* The volume dentry */ + struct dentry *fanout[256]; /* Fanout subdirs */ +}; /* - * node records + * Backing file state. 
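 *
 * [Editorial note, not part of the patch] An object no longer pins
 * dentries of its own: it keeps an open struct file on the backing inode
 * plus only the final pathname component (d_name).  Its parent directory
 * is always recomputable from the volume, the low eight bits of the
 * cookie's key hash selecting one of the 256 fanout subdirectories:
 *
 *	struct dentry *fan = object->volume->fanout[(u8)cookie->key_hash];
 *
 * which is exactly how cachefiles_invalidate_cookie() in interface.c
 * above locates the directory in which to bury a superseded file.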
*/ struct cachefiles_object { - struct fscache_object fscache; /* fscache handle */ - struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ - struct dentry *dentry; /* the file/dir representing this object */ - struct dentry *backer; /* backing file */ - loff_t i_size; /* object size */ + struct fscache_cookie *cookie; /* Netfs data storage object cookie */ + struct cachefiles_volume *volume; /* Cache volume that holds this object */ + struct list_head cache_link; /* Link in cache->*_list */ + struct file *file; /* The file representing this object */ + char *d_name; /* Backing file name */ + int debug_id; + spinlock_t lock; + refcount_t ref; + u8 d_name_len; /* Length of filename */ + enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; -#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ - atomic_t usage; /* object usage count */ - uint8_t type; /* object type */ - uint8_t new; /* T if object new */ - spinlock_t work_lock; - struct rb_node active_node; /* link in active tree (dentry is key) */ +#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ }; -extern struct kmem_cache *cachefiles_object_jar; - /* * Cache files cache definition */ struct cachefiles_cache { - struct fscache_cache cache; /* FS-Cache record */ + struct fscache_cache *cache; /* Cache cookie */ struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *store; /* Directory into which live objects go */ struct dentry *graveyard; /* directory into which dead objects go */ struct file *cachefilesd; /* manager daemon handle */ + struct list_head volumes; /* List of volume objects */ + struct list_head object_list; /* List of active objects */ + spinlock_t object_list_lock; /* Lock for volumes and object_list */ const struct cred *cache_cred; /* security override for accessing cache */ struct mutex daemon_mutex; /* command serialisation mutex */ wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ - struct rb_root active_nodes; /* active nodes (can't be culled) */ - rwlock_t active_lock; /* lock for active_nodes */ atomic_t gravecounter; /* graveyard uniquifier */ atomic_t f_released; /* number of objects released lately */ atomic_long_t b_released; /* number of blocks released lately */ + atomic_long_t b_writing; /* Number of blocks being written */ unsigned frun_percent; /* when to stop culling (% files) */ unsigned fcull_percent; /* when to start culling (% files) */ unsigned fstop_percent; /* when to stop allocating (% files) */ @@ -89,38 +103,19 @@ struct cachefiles_cache { char *tag; /* cache binding tag */ }; -/* - * backing file read tracking - */ -struct cachefiles_one_read { - wait_queue_entry_t monitor; /* link into monitored waitqueue */ - struct page *back_page; /* backing file page we're waiting for */ - struct page *netfs_page; /* netfs page we're going to fill */ - struct fscache_retrieval *op; /* retrieval op covering this */ - struct list_head op_link; /* link in op's todo list */ -}; - -/* - * backing file write tracking - */ -struct cachefiles_one_write { - struct page *netfs_page; /* netfs page to copy */ - struct cachefiles_object *object; - struct list_head obj_link; /* link in object's lists */ - fscache_rw_complete_t end_io_func; - void *context; -}; +#include <trace/events/cachefiles.h> -/* - * auxiliary data xattr buffer - */ -struct cachefiles_xattr { - uint16_t len; - uint8_t type; - uint8_t data[]; -}; +static inline +struct file *cachefiles_cres_file(struct netfs_cache_resources *cres) 
+{ + return cres->cache_priv2; +} -#include <trace/events/cachefiles.h> +static inline +struct cachefiles_object *cachefiles_cres_object(struct netfs_cache_resources *cres) +{ + return fscache_cres_cookie(cres)->cache_priv; +} /* * note change of state for daemon @@ -132,74 +127,118 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache) } /* - * bind.c + * cache.c */ -extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); -extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); +extern int cachefiles_add_cache(struct cachefiles_cache *cache); +extern void cachefiles_withdraw_cache(struct cachefiles_cache *cache); + +enum cachefiles_has_space_for { + cachefiles_has_space_check, + cachefiles_has_space_for_write, + cachefiles_has_space_for_create, +}; +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason); /* * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; -extern int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr); +/* + * error_inject.c + */ +#ifdef CONFIG_CACHEFILES_ERROR_INJECTION +extern unsigned int cachefiles_error_injection_state; +extern int cachefiles_register_error_injection(void); +extern void cachefiles_unregister_error_injection(void); + +#else +#define cachefiles_error_injection_state 0 + +static inline int cachefiles_register_error_injection(void) +{ + return 0; +} + +static inline void cachefiles_unregister_error_injection(void) +{ +} +#endif + + +static inline int cachefiles_inject_read_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : 0; +} + +static inline int cachefiles_inject_write_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : + cachefiles_error_injection_state & 1 ? -ENOSPC : + 0; +} + +static inline int cachefiles_inject_remove_error(void) +{ + return cachefiles_error_injection_state & 2 ? 
-EIO : 0; +} /* * interface.c */ extern const struct fscache_cache_ops cachefiles_cache_ops; +extern void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); -void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why); +/* + * io.c + */ +extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state); /* * key.c */ -extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); +extern bool cachefiles_cook_key(struct cachefiles_object *object); + +/* + * main.c + */ +extern struct kmem_cache *cachefiles_object_jar; /* * namei.c */ -extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object, - blkcnt_t i_blocks); -extern int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object); -extern int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata); +extern void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file); +extern int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why); +extern int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why); +extern bool cachefiles_look_up_object(struct cachefiles_object *object); extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, struct dentry *dir, - const char *name); + const char *name, + bool *_is_new); +extern void cachefiles_put_directory(struct dentry *dir); extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename); extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); - -/* - * rdwr.c - */ -extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, - struct page *, gfp_t); -extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, - gfp_t); -extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, - gfp_t); -extern int cachefiles_allocate_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, gfp_t); -extern int cachefiles_write_page(struct fscache_storage *, struct page *); -extern void cachefiles_uncache_page(struct fscache_object *, struct page *); - -/* - * rdwr2.c - */ -extern int cachefiles_begin_read_operation(struct netfs_read_request *, - struct fscache_retrieval *); +extern struct file *cachefiles_create_tmpfile(struct cachefiles_object *object); +extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object); /* * security.c @@ -222,28 +261,32 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache, } /* + * volume.c + */ +void cachefiles_acquire_volume(struct fscache_volume *volume); +void cachefiles_free_volume(struct fscache_volume *volume); +void cachefiles_withdraw_volume(struct cachefiles_volume *volume); + +/* * xattr.c */ -extern int cachefiles_check_object_type(struct cachefiles_object *object); -extern int 
cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_update_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_check_auxdata(struct cachefiles_object *object); -extern int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object); +extern int cachefiles_check_auxdata(struct cachefiles_object *object, + struct file *file); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, struct dentry *dentry); - +extern void cachefiles_prepare_to_write(struct fscache_cookie *cookie); +extern bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume); +extern int cachefiles_check_volume_xattr(struct cachefiles_volume *volume); /* - * error handling + * Error handling */ - #define cachefiles_io_error(___cache, FMT, ...) \ do { \ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ - fscache_io_error(&(___cache)->cache); \ + fscache_io_error((___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ } while (0) @@ -251,15 +294,20 @@ do { \ do { \ struct cachefiles_cache *___cache; \ \ - ___cache = container_of((object)->fscache.cache, \ - struct cachefiles_cache, cache); \ - cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ + ___cache = (object)->volume->cache; \ + cachefiles_io_error(___cache, FMT " [o=%08x]", ##__VA_ARGS__, \ + (object)->debug_id); \ } while (0) /* - * debug tracing + * Debug tracing */ +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + #define dbgprintk(FMT, ...) 
\ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index effe37ef8629..60b1eac2ce78 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -9,8 +9,9 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/falloc.h> #include <linux/sched/mm.h> -#include <linux/netfs.h> +#include <trace/events/fscache.h> #include "internal.h" struct cachefiles_kiocb { @@ -21,14 +22,18 @@ struct cachefiles_kiocb { size_t skipped; size_t len; }; + struct cachefiles_object *object; netfs_io_terminated_t term_func; void *term_func_priv; bool was_async; + unsigned int inval_counter; /* Copy of cookie->inval_counter */ + u64 b_writing; }; static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) { if (refcount_dec_and_test(&ki->ki_refcnt)) { + cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq); fput(ki->iocb.ki_filp); kfree(ki); } @@ -40,12 +45,22 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) static void cachefiles_read_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); _enter("%ld", ret); + if (ret < 0) + trace_cachefiles_io_error(ki->object, inode, ret, + cachefiles_trace_read_error); + if (ki->term_func) { - if (ret >= 0) - ret += ki->skipped; + if (ret >= 0) { + if (ki->object->cookie->inval_counter == ki->inval_counter) + ki->skipped += ret; + else + ret = -ESTALE; + } + ki->term_func(ki->term_func_priv, ret, ki->was_async); } @@ -58,16 +73,24 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret) static int cachefiles_read(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, - bool seek_data, + enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv) { + struct cachefiles_object *object; struct cachefiles_kiocb *ki; - struct file *file = cres->cache_priv2; + struct file *file; unsigned int old_nofs; ssize_t ret = -ENOBUFS; size_t len = iov_iter_count(iter), skipped = 0; + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto presubmission_error; + + fscache_count_read(); + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); @@ -75,10 +98,12 @@ static int cachefiles_read(struct netfs_cache_resources *cres, /* If the caller asked us to seek for data before doing the read, then * we should do that now. If we find a gap, we fill it with zeros. */ - if (seek_data) { + if (read_hole != NETFS_READ_HOLE_IGNORE) { loff_t off = start_pos, off2; - off2 = vfs_llseek(file, off, SEEK_DATA); + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_DATA); if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { skipped = 0; ret = off2; @@ -90,6 +115,10 @@ static int cachefiles_read(struct netfs_cache_resources *cres, * in the region, so clear the rest of the buffer and * return success. 
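
The SEEK_DATA probe here is the crux of the read fast path: the seek either lands inside the requested window (cached data exists), lands beyond it (pure hole, so zero-fill and succeed), or fails with ENXIO (hole at EOF). A minimal userspace sketch of the same classification, runnable against any filesystem with hole support; probe_region() is an invented name and the error handling is trimmed to the cases the kernel code distinguishes:

	#define _GNU_SOURCE		/* SEEK_DATA */
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* How many leading bytes of [start, start+len) are hole and must be
	 * zero-filled: 0 (data starts at once), len (all hole or EOF),
	 * something in between, or -errno on a real seek failure.
	 */
	static long probe_region(int fd, off_t start, long len)
	{
		off_t off = lseek(fd, start, SEEK_DATA);

		if (off < 0)
			return errno == ENXIO ? len : -errno;
		if (off >= start + len)
			return len;
		return off - start;
	}

	int main(void)
	{
		int fd = open("sparse.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

		if (fd < 0)
			return 1;
		ftruncate(fd, 1 << 20);			/* 1MiB of hole */
		pwrite(fd, "data", 4, 512 * 1024);	/* one data island */
		printf("%ld\n", probe_region(fd, 0, 1 << 20));
		close(fd);
		unlink("sparse.tmp");
		return 0;
	}
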
*/ + ret = -ENODATA; + if (read_hole == NETFS_READ_HOLE_FAIL) + goto presubmission_error; + iov_iter_zero(len, iter); skipped = len; ret = 0; @@ -100,7 +129,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, iov_iter_zero(skipped, iter); } - ret = -ENOBUFS; + ret = -ENOMEM; ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) goto presubmission_error; @@ -112,6 +141,8 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->skipped = skipped; + ki->object = object; + ki->inval_counter = cres->inval_counter; ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; @@ -120,9 +151,13 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_complete = cachefiles_read_complete; get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped); old_nofs = memalloc_nofs_save(); - ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); memalloc_nofs_restore(old_nofs); switch (ret) { case -EIOCBQUEUED: @@ -162,6 +197,7 @@ presubmission_error: static void cachefiles_write_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct cachefiles_object *object = ki->object; struct inode *inode = file_inode(ki->iocb.ki_filp); _enter("%ld", ret); @@ -170,9 +206,14 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + if (ret < 0) + trace_cachefiles_io_error(object, inode, ret, + cachefiles_trace_write_error); + + atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); + set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); if (ki->term_func) ki->term_func(ki->term_func_priv, ret, ki->was_async); - cachefiles_put_kiocb(ki); } @@ -185,17 +226,27 @@ static int cachefiles_write(struct netfs_cache_resources *cres, netfs_io_terminated_t term_func, void *term_func_priv) { + struct cachefiles_object *object; + struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; struct inode *inode; - struct file *file = cres->cache_priv2; + struct file *file; unsigned int old_nofs; ssize_t ret = -ENOBUFS; size_t len = iov_iter_count(iter); + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + goto presubmission_error; + fscache_count_write(); + object = cachefiles_cres_object(cres); + cache = object->volume->cache; + file = cachefiles_cres_file(cres); + _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); + ret = -ENOMEM; ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) goto presubmission_error; @@ -206,14 +257,18 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); + ki->object = object; + ki->inval_counter = cres->inval_counter; ki->start = start_pos; ki->len = len; ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; + ki->b_writing = (len + (1 << cache->bshift)) >> cache->bshift; if (ki->term_func) ki->iocb.ki_complete = 
cachefiles_write_complete; + atomic_long_add(ki->b_writing, &cache->b_writing); /* Open-code file_start_write here to grab freeze protection, which * will be released by another thread in aio_complete_rw(). Fool @@ -225,9 +280,13 @@ static int cachefiles_write(struct netfs_cache_resources *cres, __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); - ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); memalloc_nofs_restore(old_nofs); switch (ret) { case -EIOCBQUEUED: @@ -257,8 +316,8 @@ in_progress: presubmission_error: if (term_func) - term_func(term_func_priv, -ENOMEM, false); - return -ENOMEM; + term_func(term_func_priv, ret, false); + return ret; } /* @@ -268,47 +327,82 @@ presubmission_error: static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, loff_t i_size) { - struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv; + enum cachefiles_prepare_read_trace why; + struct netfs_read_request *rreq = subreq->rreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; struct cachefiles_object *object; struct cachefiles_cache *cache; + struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; - struct file *file = subreq->rreq->cache_resources.cache_priv2; + struct file *file = cachefiles_cres_file(cres); + enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; + ino_t ino = file ? file_inode(file)->i_ino : 0; _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + if (subreq->start >= i_size) { + ret = NETFS_FILL_WITH_ZEROES; + why = cachefiles_trace_read_after_eof; + goto out_no_object; + } - if (!file) - goto cache_fail_nosec; + if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + why = cachefiles_trace_read_no_data; + goto out_no_object; + } - if (subreq->start >= i_size) - return NETFS_FILL_WITH_ZEROES; + /* The object and the file may be being created in the background. 
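
Taken together, cachefiles_prepare_read() boils down to a short decision ladder: reads at or beyond EOF are synthesised as zeros without touching the cache, a cookie flagged as having no data (or no backing file yet) sends the subrequest to the server and marks it for write-back into the cache, and only a confirmed data extent is read from the cache. A compressed sketch of that ladder with locking, tracing and error injection stripped out; the enum constants loosely follow the netfs names and everything else is invented:

	#include <stdbool.h>
	#include <stdio.h>

	enum source { FILL_WITH_ZEROES, DOWNLOAD_FROM_SERVER, READ_FROM_CACHE };

	static enum source pick_source(long long start, long long i_size,
				       bool no_data_yet, bool have_file,
				       bool window_has_data)
	{
		if (start >= i_size)
			return FILL_WITH_ZEROES;	/* beyond EOF */
		if (no_data_yet || !have_file)
			return DOWNLOAD_FROM_SERVER;	/* and store to cache */
		return window_has_data ? READ_FROM_CACHE
				       : DOWNLOAD_FROM_SERVER;
	}

	int main(void)
	{
		/* A read at 8KiB into a 4KiB file: zeroes, no I/O at all. */
		printf("%d\n", pick_source(8192, 4096, false, true, true));
		return 0;
	}
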
*/ + if (!file) { + why = cachefiles_trace_read_no_file; + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto out_no_object; + file = cachefiles_cres_file(cres); + if (!file) + goto out_no_object; + ino = file_inode(file)->i_ino; + } + object = cachefiles_cres_object(cres); + cache = object->volume->cache; cachefiles_begin_secure(cache, &saved_cred); - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, subreq->start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { - if (off == (loff_t)-ENXIO) + if (off == (loff_t)-ENXIO) { + why = cachefiles_trace_read_seek_nxio; goto download_and_store; - goto cache_fail; + } + trace_cachefiles_io_error(object, file_inode(file), off, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; } - if (off >= subreq->start + subreq->len) + if (off >= subreq->start + subreq->len) { + why = cachefiles_trace_read_found_hole; goto download_and_store; + } if (off > subreq->start) { off = round_up(off, cache->bsize); subreq->len = off - subreq->start; + why = cachefiles_trace_read_found_part; goto download_and_store; } - to = vfs_llseek(file, subreq->start, SEEK_HOLE); - if (to < 0 && to >= (loff_t)-MAX_ERRNO) - goto cache_fail; + to = cachefiles_inject_read_error(); + if (to == 0) + to = vfs_llseek(file, subreq->start, SEEK_HOLE); + if (to < 0 && to >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), to, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; + } if (to < subreq->start + subreq->len) { if (subreq->start + subreq->len >= i_size) @@ -318,32 +412,119 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque subreq->len = to - subreq->start; } - cachefiles_end_secure(cache, saved_cred); - return NETFS_READ_FROM_CACHE; + why = cachefiles_trace_read_have_data; + ret = NETFS_READ_FROM_CACHE; + goto out; download_and_store: - if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0) - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); -cache_fail: + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); +out: cachefiles_end_secure(cache, saved_cred); -cache_fail_nosec: - return NETFS_DOWNLOAD_FROM_SERVER; +out_no_object: + trace_cachefiles_prep_read(subreq, ret, why, ino); + return ret; } /* * Prepare for a write to occur. */ -static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size) +static int __cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size, + bool no_space_allocated_yet) { - loff_t start = *_start; + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + struct file *file = cachefiles_cres_file(cres); + loff_t start = *_start, pos; size_t len = *_len, down; + int ret; /* Round to DIO size */ down = start - round_down(start, PAGE_SIZE); *_start = start - down; *_len = round_up(down + len, PAGE_SIZE); - return 0; + + /* We need to work out whether there's sufficient disk space to perform + * the write - but we can skip that check if we have space already + * allocated. 
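
The rounding at the top of __cachefiles_prepare_write() widens the write to whole pages so it stays direct-I/O aligned: the start is pulled back to a page boundary and the length is grown to cover the displaced head and the tail. A standalone demo of that arithmetic, assuming 4KiB pages:

	#include <stdio.h>

	#define PAGE_SIZE	 4096ULL
	#define round_down(x, y) ((x) / (y) * (y))
	#define round_up(x, y)	 round_down((x) + (y) - 1, (y))

	int main(void)
	{
		unsigned long long start = 5000, len = 100, down;

		down   = start - round_down(start, PAGE_SIZE);	/* 904 */
		start -= down;					/* 4096 */
		len    = round_up(down + len, PAGE_SIZE);	/* 4096 */
		printf("write becomes %llu+%llu\n", start, len);
		return 0;
	}
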
+ */ + if (no_space_allocated_yet) + goto check_space; + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, *_start, SEEK_DATA); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + if (pos == -ENXIO) + goto check_space; /* Unallocated tail */ + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)*_start + *_len) + goto check_space; /* Unallocated region */ + + /* We have a block that's at least partially filled - if we're low on + * space, we need to see if it's fully allocated. If it's not, we may + * want to cull it. + */ + if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_check) == 0) + return 0; /* Enough space to simply overwrite the whole block */ + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, *_start, SEEK_HOLE); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)*_start + *_len) + return 0; /* Fully allocated */ + + /* Partially allocated, but insufficient space: cull. */ + fscache_count_no_write_space(); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + *_start, *_len); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, + "CacheFiles: fallocate failed (%d)\n", ret); + ret = -EIO; + } + + return ret; + +check_space: + return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_for_write); +} + +static int cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size, + bool no_space_allocated_yet) +{ + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + const struct cred *saved_cred; + int ret; + + if (!cachefiles_cres_file(cres)) { + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + return -ENOBUFS; + if (!cachefiles_cres_file(cres)) + return -ENOBUFS; + } + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(cres, _start, _len, i_size, + no_space_allocated_yet); + cachefiles_end_secure(cache, saved_cred); + return ret; } /* @@ -351,19 +532,11 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, */ static void cachefiles_end_operation(struct netfs_cache_resources *cres) { - struct fscache_retrieval *op = cres->cache_priv; - struct file *file = cres->cache_priv2; - - _enter(""); + struct file *file = cachefiles_cres_file(cres); if (file) fput(file); - if (op) { - fscache_op_complete(&op->op, false); - fscache_put_retrieval(op); - } - - _leave(""); + fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end); } static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { @@ -377,44 +550,25 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { /* * Open the cache file when beginning a cache operation. 
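
The replacement operation-begin path that follows pins object->file into cres->cache_priv2 with a double-checked pattern: an unlocked peek skips the common already-pinned case, then the test is repeated under object->lock before a reference is taken. The same shape in a userspace analogue; struct resource, pin() and the mutex are invented stand-ins for struct file, the begin-operation helper and the object spinlock:

	#include <pthread.h>
	#include <stddef.h>

	struct resource { int refs; };

	static struct resource *shared;		/* plays object->file */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static void pin(struct resource **slot)	/* slot plays cache_priv2 */
	{
		if (*slot || !shared)
			return;			/* fast path, no lock */
		pthread_mutex_lock(&lock);
		if (!*slot && shared)		/* recheck under the lock */
			(*slot = shared)->refs++;
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		struct resource r = { .refs = 1 }, *slot = NULL;

		shared = &r;
		pin(&slot);
		pin(&slot);	/* idempotent: second call changes nothing */
		return !(slot == &r && r.refs == 2);
	}
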
*/ -int cachefiles_begin_read_operation(struct netfs_read_request *rreq, - struct fscache_retrieval *op) +bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct path path; - struct file *file; + struct cachefiles_object *object = cachefiles_cres_object(cres); + + if (!cachefiles_cres_file(cres)) { + cres->ops = &cachefiles_netfs_cache_ops; + if (object->file) { + spin_lock(&object->lock); + if (!cres->cache_priv2 && object->file) + cres->cache_priv2 = get_file(object->file); + spin_unlock(&object->lock); + } + } - _enter(""); - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - path.mnt = cache->mnt; - path.dentry = object->backer; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_inode(object->backer), cache->cache_cred); - if (IS_ERR(file)) - return PTR_ERR(file); - if (!S_ISREG(file_inode(file)->i_mode)) - goto error_file; - if (unlikely(!file->f_op->read_iter) || - unlikely(!file->f_op->write_iter)) { - pr_notice("Cache does not support read_iter and write_iter\n"); - goto error_file; + if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) { + pr_err("failed to get cres->file\n"); + return false; } - fscache_get_retrieval(op); - rreq->cache_resources.cache_priv = op; - rreq->cache_resources.cache_priv2 = file; - rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; - rreq->cache_resources.debug_id = object->fscache.debug_id; - _leave(""); - return 0; - -error_file: - fput(file); - return -EIO; + return true; } diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index 7f94efc97e23..bf935e25bdbe 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* Key to pathname encoder * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -22,134 +22,117 @@ static const char cachefiles_filecharmap[256] = { [48 ... 127] = 1, /* '0' -> '~' */ }; +static inline unsigned int how_many_hex_digits(unsigned int x) +{ + return x ? 
round_up(ilog2(x) + 1, 4) / 4 : 0; +} + /* * turn the raw key into something cooked - * - the raw key should include the length in the two bytes at the front - * - the key may be up to 514 bytes in length (including the length word) + * - the key may be up to NAME_MAX in length (including the length word) * - "base64" encode the strange keys, mapping 3 bytes of raw to four of * cooked * - need to cut the cooked key into 252 char lengths (189 raw bytes) */ -char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +bool cachefiles_cook_key(struct cachefiles_object *object) { - unsigned char csum, ch; - unsigned int acc; - char *key; - int loop, len, max, seg, mark, print; + const u8 *key = fscache_get_key(object->cookie), *kend; + unsigned char ch; + unsigned int acc, i, n, nle, nbe, keylen = object->cookie->key_len; + unsigned int b64len, len, print, pad; + char *name, sep; - _enter(",%d", keylen); + _enter(",%u,%*phN", keylen, keylen, key); - BUG_ON(keylen < 2 || keylen > 514); + BUG_ON(keylen > NAME_MAX - 3); - csum = raw[0] + raw[1]; print = 1; - for (loop = 2; loop < keylen; loop++) { - ch = raw[loop]; - csum += ch; + for (i = 0; i < keylen; i++) { + ch = key[i]; print &= cachefiles_filecharmap[ch]; } + /* If the path is usable ASCII, then we render it directly */ if (print) { - /* if the path is usable ASCII, then we render it directly */ - max = keylen - 2; - max += 2; /* two base64'd length chars on the front */ - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 251) / 252) = 3 - */ - max += 1; /* NUL on end */ - } else { - /* calculate the maximum length of the cooked key */ - keylen = (keylen + 2) / 3; - - max = keylen * 4; - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 188) / 189) = 3 - */ - max += 1; /* NUL on end */ + len = 1 + keylen; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'D'; /* Data object type, string encoding */ + memcpy(name + 1, key, keylen); + goto success; } - max += 1; /* 2nd NUL on end */ - - _debug("max: %d", max); - - key = kmalloc(max, cachefiles_gfp); - if (!key) - return NULL; - - len = 0; - - /* build the cooked key */ - sprintf(key, "@%02x%c+", (unsigned) csum, 0); - len = 5; - mark = len - 1; - - if (print) { - acc = *(uint16_t *) raw; - raw += 2; - - key[len + 1] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len] = cachefiles_charmap[acc & 63]; - len += 2; - - seg = 250; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - key[len++] = *raw++; - ASSERT(len < max); - } - - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; - default: type = 'S'; break; - } - } else { - seg = 252; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - acc = *raw++; - acc |= *raw++ << 8; - acc |= *raw++ << 16; - - _debug("acc: %06x", acc); - - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - - ASSERT(len < max); - } + /* See if it makes sense to encode it as "hex,hex,hex" for each 32-bit + * chunk. We rely on the key having been padded out to a whole number + * of 32-bit words. 
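
Three encodings are costed before one is picked: 'D' plus the raw bytes if the key is printable ASCII, one separator plus the significant hex digits per 32-bit word for the big-endian ('S') or little-endian ('T') forms, and 2 + 4 * ceil(keylen / 3) characters for base64 ('E'). The per-word cost comes from how_many_hex_digits(); a userspace check of that arithmetic, where hex_digits() mirrors it using a compiler builtin instead of ilog2():

	#include <stdio.h>

	static unsigned int hex_digits(unsigned int x)
	{
		if (!x)
			return 0;
		unsigned int bits = 32 - __builtin_clz(x); /* ilog2(x) + 1 */
		return (bits + 3) / 4;	/* round_up(bits, 4) / 4 */
	}

	int main(void)
	{
		printf("%u %u %u\n",
		       hex_digits(0xf),			/* 1 */
		       hex_digits(0x10),		/* 2 */
		       hex_digits(0xdeadbeef));		/* 8 */
		return 0;
	}
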
+ */ + n = round_up(keylen, 4); + nbe = nle = 0; + for (i = 0; i < n; i += 4) { + u32 be = be32_to_cpu(*(__be32 *)(key + i)); + u32 le = le32_to_cpu(*(__le32 *)(key + i)); + + nbe += 1 + how_many_hex_digits(be); + nle += 1 + how_many_hex_digits(le); + } - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; - default: type = 'T'; break; + b64len = DIV_ROUND_UP(keylen, 3); + pad = b64len * 3 - keylen; + b64len = 2 + b64len * 4; /* Length if we base64-encode it */ + _debug("len=%u nbe=%u nle=%u b64=%u", keylen, nbe, nle, b64len); + if (nbe < b64len || nle < b64len) { + unsigned int nlen = min(nbe, nle) + 1; + name = kmalloc(nlen, GFP_KERNEL); + if (!name) + return false; + sep = (nbe <= nle) ? 'S' : 'T'; /* Encoding indicator */ + len = 0; + for (i = 0; i < n; i += 4) { + u32 x; + if (nbe <= nle) + x = be32_to_cpu(*(__be32 *)(key + i)); + else + x = le32_to_cpu(*(__le32 *)(key + i)); + name[len++] = sep; + if (x != 0) + len += snprintf(name + len, nlen - len, "%x", x); + sep = ','; } + goto success; } - key[mark] = type; - key[len++] = 0; - key[len] = 0; + /* We need to base64-encode it */ + name = kmalloc(b64len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'E'; + name[1] = '0' + pad; + len = 2; + kend = key + keylen; + do { + acc = *key++; + if (key < kend) { + acc |= *key++ << 8; + if (key < kend) + acc |= *key++ << 16; + } - _leave(" = %s %d", key, len); - return key; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + } while (key < kend); + +success: + name[len] = 0; + object->d_name = name; + object->d_name_len = len; + _leave(" = %s", object->d_name); + return true; } diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 9c8d34c49b12..3f369c6f816d 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -2,7 +2,7 @@ /* Network filesystem caching backend to use cache files on a premounted * filesystem * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
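
The reworked cachefiles_init() further down keeps the usual unwind discipline: the error-injection state is registered first, so it is unregistered last on both the failure path and in cachefiles_exit(). A minimal userspace sketch of that shape; the acquire/release pairs are invented placeholders for the registration calls:

	#include <stdio.h>

	static int  acquire_einj(void) { return 0; }
	static void release_einj(void) { }
	static int  acquire_dev(void)  { return 0; }

	static int init_all(void)
	{
		int ret;

		ret = acquire_einj();
		if (ret < 0)
			goto error_einj;
		ret = acquire_dev();
		if (ret < 0)
			goto error_dev;
		return 0;

	error_dev:
		release_einj();		/* first acquired, last released */
	error_einj:
		fprintf(stderr, "failed to register: %d\n", ret);
		return ret;
	}

	int main(void) { return init_all(); }
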
* Written by David Howells (dhowells@redhat.com) */ @@ -18,6 +18,8 @@ #include <linux/statfs.h> #include <linux/sysctl.h> #include <linux/miscdevice.h> +#include <linux/netfs.h> +#include <trace/events/netfs.h> #define CREATE_TRACE_POINTS #include "internal.h" @@ -37,14 +39,6 @@ static struct miscdevice cachefiles_dev = { .fops = &cachefiles_daemon_fops, }; -static void cachefiles_object_init_once(void *_object) -{ - struct cachefiles_object *object = _object; - - memset(object, 0, sizeof(*object)); - spin_lock_init(&object->work_lock); -} - /* * initialise the fs caching module */ @@ -52,6 +46,9 @@ static int __init cachefiles_init(void) { int ret; + ret = cachefiles_register_error_injection(); + if (ret < 0) + goto error_einj; ret = misc_register(&cachefiles_dev); if (ret < 0) goto error_dev; @@ -61,9 +58,7 @@ static int __init cachefiles_init(void) cachefiles_object_jar = kmem_cache_create("cachefiles_object_jar", sizeof(struct cachefiles_object), - 0, - SLAB_HWCACHE_ALIGN, - cachefiles_object_init_once); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!cachefiles_object_jar) { pr_notice("Failed to allocate an object jar\n"); goto error_object_jar; @@ -75,6 +70,8 @@ static int __init cachefiles_init(void) error_object_jar: misc_deregister(&cachefiles_dev); error_dev: + cachefiles_unregister_error_injection(); +error_einj: pr_err("failed to register: %d\n", ret); return ret; } @@ -90,6 +87,7 @@ static void __exit cachefiles_exit(void) kmem_cache_destroy(cachefiles_object_jar); misc_deregister(&cachefiles_dev); + cachefiles_unregister_error_injection(); } module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index a9aca5ab5970..9bd692870617 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -1,295 +1,268 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles path walking and related routines * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/file.h> #include <linux/fs.h> -#include <linux/fsnotify.h> -#include <linux/quotaops.h> -#include <linux/xattr.h> -#include <linux/mount.h> #include <linux/namei.h> -#include <linux/security.h> -#include <linux/slab.h> #include "internal.h" -#define CACHEFILES_KEYBUF_SIZE 512 - /* - * dump debugging info about an object + * Mark the backing file as being a cache file if it's not already in use. The + * mark tells the culling request command that it's not allowed to cull the + * file or directory. The caller must hold the inode lock. */ -static noinline -void __cachefiles_printk_object(struct cachefiles_object *object, - const char *prefix) +static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - struct fscache_cookie *cookie; - const u8 *k; - unsigned loop; - - pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", - prefix, object->fscache.state->name, - object->fscache.flags, work_busy(&object->fscache.work), - object->fscache.events, object->fscache.event_mask); - pr_err("%sops=%u inp=%u exc=%u\n", - prefix, object->fscache.n_ops, object->fscache.n_in_progress, - object->fscache.n_exclusive); - pr_err("%sparent=%x\n", - prefix, object->fscache.parent ? 
object->fscache.parent->debug_id : 0); - - spin_lock(&object->fscache.lock); - cookie = object->fscache.cookie; - if (cookie) { - pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n", - prefix, - cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, - cookie->netfs_data, - cookie->flags); - pr_err("%skey=[%u] '", prefix, cookie->key_len); - k = (cookie->key_len <= sizeof(cookie->inline_key)) ? - cookie->inline_key : cookie->key; - for (loop = 0; loop < cookie->key_len; loop++) - pr_cont("%02x", k[loop]); - pr_cont("'\n"); + struct inode *inode = d_backing_inode(dentry); + bool can_use = false; + + if (!(inode->i_flags & S_KERNEL_FILE)) { + inode->i_flags |= S_KERNEL_FILE; + trace_cachefiles_mark_active(object, inode); + can_use = true; } else { - pr_err("%scookie=NULL\n", prefix); + pr_notice("cachefiles: Inode already in use: %pd\n", dentry); } - spin_unlock(&object->fscache.lock); + + return can_use; } -/* - * dump debugging info about a pair of objects - */ -static noinline void cachefiles_printk_object(struct cachefiles_object *object, - struct cachefiles_object *xobject) +static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - if (object) - __cachefiles_printk_object(object, ""); - if (xobject) - __cachefiles_printk_object(xobject, "x"); + struct inode *inode = d_backing_inode(dentry); + bool can_use; + + inode_lock(inode); + can_use = __cachefiles_mark_inode_in_use(object, dentry); + inode_unlock(inode); + return can_use; } /* - * mark the owner of a dentry, if there is one, to indicate that that dentry - * has been preemptively deleted - * - the caller must hold the i_mutex on the dentry's parent as required to - * call vfs_unlink(), vfs_rmdir() or vfs_rename() + * Unmark a backing inode. The caller must hold the inode lock. */ -static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry, - enum fscache_why_object_killed why) +static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - struct cachefiles_object *object; - struct rb_node *p; - - _enter(",'%pd'", dentry); + struct inode *inode = d_backing_inode(dentry); - write_lock(&cache->active_lock); + inode->i_flags &= ~S_KERNEL_FILE; + trace_cachefiles_mark_inactive(object, inode); +} - p = cache->active_nodes.rb_node; - while (p) { - object = rb_entry(p, struct cachefiles_object, active_node); - if (object->dentry > dentry) - p = p->rb_left; - else if (object->dentry < dentry) - p = p->rb_right; - else - goto found_dentry; +/* + * Unmark a backing inode and tell cachefilesd that there's something that can + * be culled. 
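
The in-use mark is a plain test-and-set on the inode's S_KERNEL_FILE flag under the inode lock: if the flag is already set, some other kernel user owns the backing file and it must not be shared. The same shape in a self-contained sketch, where struct node and IN_USE are invented stand-ins for the inode and S_KERNEL_FILE:

	#include <stdbool.h>

	#define IN_USE	0x1

	struct node { unsigned int flags; };

	static bool try_mark(struct node *n)	/* caller holds node lock */
	{
		if (n->flags & IN_USE)
			return false;		/* claimed: refuse to share */
		n->flags |= IN_USE;
		return true;
	}

	static void unmark(struct node *n)	/* caller holds node lock */
	{
		n->flags &= ~IN_USE;
	}

	int main(void)
	{
		struct node n = { 0 };

		if (!try_mark(&n) || try_mark(&n))
			return 1;		/* 2nd claim must fail */
		unmark(&n);
		return !try_mark(&n);		/* reusable after unmark */
	}
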
+ */ +void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); + + if (inode) { + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, file->f_path.dentry); + inode_unlock(inode); + + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); + } } +} - write_unlock(&cache->active_lock); - trace_cachefiles_mark_buried(NULL, dentry, why); - _leave(" [no owner]"); - return; +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname, + bool *_is_new) +{ + struct dentry *subdir; + struct path path; + int ret; - /* found the dentry for */ -found_dentry: - kdebug("preemptive burial: OBJ%x [%s] %pd", - object->fscache.debug_id, - object->fscache.state->name, - dentry); + _enter(",,%s", dirname); - trace_cachefiles_mark_buried(object, dentry, why); + /* search the current directory for the element name */ + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - if (fscache_object_is_live(&object->fscache)) { - pr_err("\n"); - pr_err("Error: Can't preemptively bury live object\n"); - cachefiles_printk_object(object, NULL); - } else { - if (why != FSCACHE_OBJECT_IS_STALE) - fscache_object_mark_killed(&object->fscache, why); +retry: + ret = cachefiles_inject_read_error(); + if (ret == 0) + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + else + subdir = ERR_PTR(ret); + if (IS_ERR(subdir)) { + trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), + PTR_ERR(subdir), + cachefiles_trace_lookup_error); + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; } - write_unlock(&cache->active_lock); - _leave(" [owner marked]"); -} + _debug("subdir -> %pd %s", + subdir, d_backing_inode(subdir) ? 
"positive" : "negative"); -/* - * record the fact that an object is now active - */ -static int cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) -{ - struct cachefiles_object *xobject; - struct rb_node **_p, *_parent = NULL; - struct dentry *dentry; + /* we need to create the subdir if it doesn't exist yet */ + if (d_is_negative(subdir)) { + ret = cachefiles_has_space(cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + goto mkdir_error; - _enter(",%x", object->fscache.debug_id); + _debug("attempt mkdir"); -try_again: - write_lock(&cache->active_lock); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, + cachefiles_trace_mkdir_error); + goto mkdir_error; + } - dentry = object->dentry; - trace_cachefiles_mark_active(object, dentry); + if (unlikely(d_unhashed(subdir))) { + cachefiles_put_directory(subdir); + goto retry; + } + ASSERT(d_backing_inode(subdir)); - if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - pr_err("Error: Object already active\n"); - cachefiles_printk_object(object, NULL); - BUG(); + _debug("mkdir -> %pd{ino=%lu}", + subdir, d_backing_inode(subdir)->i_ino); + if (_is_new) + *_is_new = true; } - _p = &cache->active_nodes.rb_node; - while (*_p) { - _parent = *_p; - xobject = rb_entry(_parent, - struct cachefiles_object, active_node); + /* Tell rmdir() it's not allowed to delete the subdir */ + inode_lock(d_inode(subdir)); + inode_unlock(d_inode(dir)); - ASSERT(xobject != object); + if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + goto mark_error; - if (xobject->dentry > dentry) - _p = &(*_p)->rb_left; - else if (xobject->dentry < dentry) - _p = &(*_p)->rb_right; - else - goto wait_for_old_object; - } + inode_unlock(d_inode(subdir)); - rb_link_node(&object->active_node, _parent, _p); - rb_insert_color(&object->active_node, &cache->active_nodes); + /* we need to make sure the subdir is a directory */ + ASSERT(d_backing_inode(subdir)); - write_unlock(&cache->active_lock); - _leave(" = 0"); - return 0; + if (!d_can_lookup(subdir)) { + pr_err("%s is not a directory\n", dirname); + ret = -EIO; + goto check_error; + } - /* an old object from a previous incarnation is hogging the slot - we - * need to wait for it to be destroyed */ -wait_for_old_object: - trace_cachefiles_wait_active(object, dentry, xobject); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - - if (fscache_object_is_live(&xobject->fscache)) { - pr_err("\n"); - pr_err("Error: Unexpected object collision\n"); - cachefiles_printk_object(object, xobject); - } - atomic_inc(&xobject->usage); - write_unlock(&cache->active_lock); - - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - wait_queue_head_t *wq; - - signed long timeout = 60 * HZ; - wait_queue_entry_t wait; - bool requeue; - - /* if the object we're waiting for is queued for processing, - * then just put ourselves on the queue behind it */ - if (work_pending(&xobject->fscache.work)) { - _debug("queue OBJ%x behind OBJ%x immediately", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } + ret = -EPERM; + if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + 
!d_backing_inode(subdir)->i_op->rename || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) + goto check_error; - /* otherwise we sleep until either the object we're waiting for - * is done, or the fscache_object is congested */ - wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); - init_wait(&wait); - requeue = false; - do { - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) - break; - - requeue = fscache_object_sleep_till_congested(&timeout); - } while (timeout > 0 && !requeue); - finish_wait(wq, &wait); - - if (requeue && - test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - _debug("queue OBJ%x behind OBJ%x after wait", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + return subdir; - if (timeout <= 0) { - pr_err("\n"); - pr_err("Error: Overlong wait for old active object to go away\n"); - cachefiles_printk_object(object, xobject); - goto requeue; - } - } +check_error: + cachefiles_put_directory(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); +mark_error: + inode_unlock(d_inode(subdir)); + dput(subdir); + return ERR_PTR(-EBUSY); - cache->cache.ops->put_object(&xobject->fscache, - (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry); - goto try_again; +mkdir_error: + inode_unlock(d_inode(dir)); + dput(subdir); + pr_err("mkdir %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + inode_unlock(d_inode(dir)); + ret = PTR_ERR(subdir); + pr_err("Lookup %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); -requeue: - cache->cache.ops->put_object(&xobject->fscache, - (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo); - _leave(" = -ETIMEDOUT"); - return -ETIMEDOUT; +nomem_d_alloc: + inode_unlock(d_inode(dir)); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); } /* - * Mark an object as being inactive. + * Put a subdirectory. */ -void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object, - blkcnt_t i_blocks) +void cachefiles_put_directory(struct dentry *dir) { - struct dentry *dentry = object->dentry; - struct inode *inode = d_backing_inode(dentry); - - trace_cachefiles_mark_inactive(object, dentry, inode); + if (dir) { + inode_lock(dir->d_inode); + __cachefiles_unmark_inode_in_use(NULL, dir); + inode_unlock(dir->d_inode); + dput(dir); + } +} - write_lock(&cache->active_lock); - rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - write_unlock(&cache->active_lock); +/* + * Remove a regular file from the cache. + */ +static int cachefiles_unlink(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, struct dentry *dentry, + enum fscache_why_object_killed why) +{ + struct path path = { + .mnt = cache->mnt, + .dentry = dir, + }; + int ret; - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + trace_cachefiles_unlink(object, dentry, why); + ret = security_path_unlink(&path, dentry); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + return ret; + } - /* This object can now be culled, so we need to let the daemon know - * that there is something it can remove if it needs to. 
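
That bookkeeping survives the rewrite: cachefiles_unmark_inode_in_use() above adds the freed file's blocks to cache->b_released, bumps cache->f_released and pokes the daemon so it can cull in the background. A compressed userspace analogue of the accounting, with kick() as an invented stand-in for cachefiles_state_changed():

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_long blocks_released;
	static atomic_int  files_released;

	static void kick(void) { puts("wake cachefilesd"); }

	static void account_release(long i_blocks)
	{
		atomic_fetch_add(&blocks_released, i_blocks);
		if (atomic_fetch_add(&files_released, 1) + 1)
			kick();	/* nonzero count: work for the daemon */
	}

	int main(void) { account_release(8); return 0; }
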
- */ - atomic_long_add(i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); + ret = cachefiles_inject_remove_error(); + if (ret == 0) { + ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL); + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + } + if (ret != 0) + trace_cachefiles_vfs_error(object, d_backing_inode(dir), ret, + cachefiles_trace_unlink_error); + return ret; } /* - * delete an object representation from the cache - * - file backed objects are unlinked - * - directory backed objects are stuffed into the graveyard for userspace to + * Delete an object representation from the cache + * - File backed objects are unlinked + * - Directory backed objects are stuffed into the graveyard for userspace to * delete - * - unlocks the directory mutex */ -static int cachefiles_bury_object(struct cachefiles_cache *cache, - struct cachefiles_object *object, - struct dentry *dir, - struct dentry *rep, - bool preemptive, - enum fscache_why_object_killed why) +int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -298,29 +271,21 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); + if (rep->d_parent != dir) { + inode_unlock(d_inode(dir)); + _leave(" = -ESTALE"); + return -ESTALE; + } + /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { - _debug("unlink stale object"); - - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_unlink(&path, rep); - if (ret < 0) { - cachefiles_io_error(cache, "Unlink security error"); - } else { - trace_cachefiles_unlink(object, rep, why); - ret = vfs_unlink(&init_user_ns, d_inode(dir), rep, - NULL); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep, why); - } + dget(rep); /* Stop the dentry being negated if it's only pinned + * by a file struct. + */ + ret = cachefiles_unlink(cache, object, dir, rep, why); + dput(rep); inode_unlock(d_inode(dir)); - - if (ret == -EIO) - cachefiles_io_error(cache, "Unlink failed"); - _leave(" = %d", ret); return ret; } @@ -368,14 +333,16 @@ try_again: grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); if (IS_ERR(grave)) { unlock_rename(cache->graveyard, dir); + trace_cachefiles_vfs_error(object, d_inode(cache->graveyard), + PTR_ERR(grave), + cachefiles_trace_lookup_error); if (PTR_ERR(grave) == -ENOMEM) { _leave(" = -ENOMEM"); return -ENOMEM; } - cachefiles_io_error(cache, "Lookup error %ld", - PTR_ERR(grave)); + cachefiles_io_error(cache, "Lookup error %ld", PTR_ERR(grave)); return -EIO; } @@ -420,15 +387,18 @@ try_again: .new_dentry = grave, }; trace_cachefiles_rename(object, rep, grave, why); - ret = vfs_rename(&rd); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_rename(&rd); + if (ret != 0) + trace_cachefiles_vfs_error(object, d_inode(dir), ret, + cachefiles_trace_rename_error); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep, why); } + __cachefiles_unmark_inode_in_use(object, rep); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -436,493 +406,358 @@ try_again: } /* - * delete an object representation from the cache + * Delete a cache file. 
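
cachefiles_unlink() above also shows the pattern used for every injectable fault in this series: consult the injection state first and only issue the real VFS call if that returned 0. A state of 1 injects ENOSPC on writes and creates, 2 injects EIO everywhere. A userspace rendition of the write-side helper; injection_state here is a plain variable standing in for the knob registered by cachefiles_register_error_injection():

	#include <errno.h>
	#include <stdio.h>

	static unsigned int injection_state;

	static int inject_write_error(void)
	{
		return injection_state & 2 ? -EIO :
		       injection_state & 1 ? -ENOSPC : 0;
	}

	static int do_write(void)
	{
		int ret = inject_write_error();

		if (ret == 0) {
			/* the real vfs_iocb_iter_write() call goes here */
		}
		return ret;
	}

	int main(void)
	{
		injection_state = 1;
		printf("%d\n", do_write());	/* -ENOSPC: -28 on Linux */
		return 0;
	}
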
*/ -int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object) +int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why) { - struct dentry *dir; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry = object->file->f_path.dentry; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry); - - ASSERT(object->dentry); - ASSERT(d_backing_inode(object->dentry)); - ASSERT(object->dentry->d_parent); + _enter(",OBJ%x{%pD}", object->debug_id, object->file); - dir = dget_parent(object->dentry); + /* Stop the dentry being negated if it's only pinned by a file struct. */ + dget(dentry); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - - if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { - /* object allocation for the same key preemptively deleted this - * object's file so that it could create its own file */ - _debug("object preemptively buried"); - inode_unlock(d_inode(dir)); - ret = 0; - } else { - /* we need to check that our parent is _still_ our parent - it - * may have been renamed */ - if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, object, dir, - object->dentry, false, - FSCACHE_OBJECT_WAS_RETIRED); - } else { - /* it got moved, presumably by cachefilesd culling it, - * so it's no longer in the key path and we can ignore - * it */ - inode_unlock(d_inode(dir)); - ret = 0; - } - } - - dput(dir); - _leave(" = %d", ret); + inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + inode_unlock(d_backing_inode(fan)); + dput(dentry); return ret; } /* - * walk from the parent object to the child object through the backing - * filesystem, creating directories as we go + * Create a temporary file and leave it unattached and un-xattr'd until the + * time comes to discard the object from memory. 
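
The kernel's vfs_tmpfile() call below has a direct userspace counterpart: opening a directory with O_TMPFILE yields an anonymous, unlinked file that can be sized up front, much as cachefiles_create_tmpfile() rounds the object size up to CACHEFILES_DIO_BLOCK_SIZE and truncates. A sketch, assuming the backing filesystem supports O_TMPFILE:

	#define _GNU_SOURCE		/* O_TMPFILE */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

		if (fd < 0) {
			perror("O_TMPFILE");
			return 1;
		}
		/* Expand to the DIO-rounded object size up front. */
		if (ftruncate(fd, 1 << 18) < 0)
			perror("ftruncate");
		close(fd);	/* never linked in: it just evaporates */
		return 0;
	}
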
*/ -int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata) +struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) { - struct cachefiles_cache *cache; - struct dentry *dir, *next = NULL; - struct inode *inode; + struct cachefiles_volume *volume = object->volume; + struct cachefiles_cache *cache = volume->cache; + const struct cred *saved_cred; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; + struct file *file; struct path path; - const char *name; - int ret, nlen; - - _enter("OBJ%x{%pd},OBJ%x,%s,", - parent->fscache.debug_id, parent->dentry, - object->fscache.debug_id, key); - - cache = container_of(parent->fscache.cache, - struct cachefiles_cache, cache); - path.mnt = cache->mnt; - - ASSERT(parent->dentry); - ASSERT(d_backing_inode(parent->dentry)); - - if (!(d_is_dir(parent->dentry))) { - // TODO: convert file to dir - _leave("looking up in none directory"); - return -ENOBUFS; - } - - dir = dget(parent->dentry); - -advance: - /* attempt to transit the first directory component */ - name = key; - nlen = strlen(key); - - /* key ends in a double NUL */ - key = key + nlen + 1; - if (!*key) - key = NULL; - -lookup_again: - /* search the current directory for the element name */ - _debug("lookup '%s'", name); - - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + uint64_t ni_size = object->cookie->object_size; + long ret; - next = lookup_one_len(name, dir, nlen); - if (IS_ERR(next)) { - trace_cachefiles_lookup(object, next, NULL); - goto lookup_error; - } + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - inode = d_backing_inode(next); - trace_cachefiles_lookup(object, next, inode); - _debug("next -> %pd %s", next, inode ? 
"positive" : "negative"); - - if (!key) - object->new = !inode; - - /* if this element of the path doesn't exist, then the lookup phase - * failed, and we can release any readers in the certain knowledge that - * there's nothing for them to actually read */ - if (d_is_negative(next)) - fscache_object_lookup_negative(&object->fscache); - - /* we need to create the object if it's negative */ - if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { - /* index objects and intervening tree levels must be subdirs */ - if (d_is_negative(next)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto no_space_error; - - path.dentry = dir; - ret = security_path_mkdir(&path, next, 0); - if (ret < 0) - goto create_error; - ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); - if (!key) - trace_cachefiles_mkdir(object, next, ret); - if (ret < 0) - goto create_error; - - if (unlikely(d_unhashed(next))) { - dput(next); - inode_unlock(d_inode(dir)); - goto lookup_again; - } - ASSERT(d_backing_inode(next)); - - _debug("mkdir -> %pd{ino=%lu}", - next, d_backing_inode(next)->i_ino); - - } else if (!d_can_lookup(next)) { - pr_err("inode %lu is not a directory\n", - d_backing_inode(next)->i_ino); - ret = -ENOBUFS; - goto error; - } + cachefiles_begin_secure(cache, &saved_cred); - } else { - /* non-index objects start out life as files */ - if (d_is_negative(next)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto no_space_error; - - path.dentry = dir; - ret = security_path_mknod(&path, next, S_IFREG, 0); - if (ret < 0) - goto create_error; - ret = vfs_create(&init_user_ns, d_inode(dir), next, - S_IFREG, true); - trace_cachefiles_create(object, next, ret); - if (ret < 0) - goto create_error; - - ASSERT(d_backing_inode(next)); - - _debug("create -> %pd{ino=%lu}", - next, d_backing_inode(next)->i_ino); - - } else if (!d_can_lookup(next) && - !d_is_reg(next) - ) { - pr_err("inode %lu is not a file or directory\n", - d_backing_inode(next)->i_ino); - ret = -ENOBUFS; - goto error; + path.mnt = cache->mnt; + ret = cachefiles_inject_write_error(); + if (ret == 0) + path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); + else + path.dentry = ERR_PTR(ret); + if (IS_ERR(path.dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + cachefiles_trace_tmpfile_error); + if (PTR_ERR(path.dentry) == -EIO) + cachefiles_io_error_obj(object, "Failed to create tmpfile"); + file = ERR_CAST(path.dentry); + goto out; + } + + trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + + if (!cachefiles_mark_inode_in_use(object, path.dentry)) { + file = ERR_PTR(-EBUSY); + goto out_dput; + } + + if (ni_size > 0) { + trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, + cachefiles_trunc_expand_tmpfile); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_truncate(&path, ni_size); + if (ret < 0) { + trace_cachefiles_vfs_error( + object, d_backing_inode(path.dentry), ret, + cachefiles_trace_trunc_error); + file = ERR_PTR(ret); + goto out_dput; } } - /* process the next component */ - if (key) { - _debug("advance"); - inode_unlock(d_inode(dir)); - dput(dir); - dir = next; - next = NULL; - goto advance; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(path.dentry), cache->cache_cred); + if (IS_ERR(file)) { + trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), + PTR_ERR(file), + cachefiles_trace_open_error); + goto out_dput; + } + if (unlikely(!file->f_op->read_iter) || + 
unlikely(!file->f_op->write_iter)) { + fput(file); + pr_notice("Cache does not support read_iter and write_iter\n"); + file = ERR_PTR(-EINVAL); } - /* we've found the object we were looking for */ - object->dentry = next; - - /* if we've found that the terminal object exists, then we need to - * check its attributes and delete it if it's out of date */ - if (!object->new) { - _debug("validate '%pd'", next); - - ret = cachefiles_check_object_xattr(object, auxdata); - if (ret == -ESTALE) { - /* delete the object (the deleter drops the directory - * mutex) */ - object->dentry = NULL; +out_dput: + dput(path.dentry); +out: + cachefiles_end_secure(cache, saved_cred); + return file; +} - ret = cachefiles_bury_object(cache, object, dir, next, - true, - FSCACHE_OBJECT_IS_STALE); - dput(next); - next = NULL; +/* + * Create a new file. + */ +static bool cachefiles_create_file(struct cachefiles_object *object) +{ + struct file *file; + int ret; - if (ret < 0) - goto delete_error; + ret = cachefiles_has_space(object->volume->cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + return false; - _debug("redo lookup"); - fscache_object_retrying_stale(&object->fscache); - goto lookup_again; - } - } + file = cachefiles_create_tmpfile(object); + if (IS_ERR(file)) + return false; - /* note that we're now using this object */ - ret = cachefiles_mark_object_active(cache, object); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); + object->file = file; + return true; +} - inode_unlock(d_inode(dir)); - dput(dir); - dir = NULL; +/* + * Open an existing file, checking its attributes and replacing it if it is + * stale. + */ +static bool cachefiles_open_file(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct file *file; + struct path path; + int ret; - if (ret == -ETIMEDOUT) - goto mark_active_timed_out; + _enter("%pd", dentry); - _debug("=== OBTAINED_OBJECT ==="); + if (!cachefiles_mark_inode_in_use(object, dentry)) + return false; - if (object->new) { - /* attach data to a newly constructed terminal object */ - ret = cachefiles_set_object_xattr(object, auxdata); - if (ret < 0) - goto check_error; - } else { - /* always update the atime on an object we've just looked up - * (this is used to keep track of culling, and atimes are only - * updated by read, write and readdir but not lookup or - * open) */ - path.dentry = next; - touch_atime(&path); - } - - /* open a file interface onto a data file */ - if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (d_is_reg(object->dentry)) { - const struct address_space_operations *aops; - - ret = -EPERM; - aops = d_backing_inode(object->dentry)->i_mapping->a_ops; - if (!aops->bmap) - goto check_error; - if (object->dentry->d_sb->s_blocksize > PAGE_SIZE) - goto check_error; - - object->backer = object->dentry; - } else { - BUG(); // TODO: open file in data-class subdir - } + /* We need to open a file interface onto a data file now as we can't do + * it on demand because writeback called from do_exit() sees + * current->fs == NULL - which breaks d_path() called from ext4 open. 
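
The backing file is always opened O_DIRECT and must supply read_iter/write_iter, which is why both open paths carry the same capability check. A userspace probe for the first requirement on a candidate cache directory; the default cachefilesd path is only an assumption, and alignment rules for actual direct I/O vary by filesystem:

	#define _GNU_SOURCE		/* O_TMPFILE, O_DIRECT */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char *argv[])
	{
		const char *dir = argc > 1 ? argv[1] : "/var/cache/fscache";
		int fd = open(dir, O_TMPFILE | O_RDWR | O_DIRECT, 0600);

		if (fd < 0) {
			perror("O_TMPFILE|O_DIRECT");
			return 1;
		}
		puts("backing fs accepts direct-I/O cache files");
		close(fd);
		return 0;
	}
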
+ */ + path.mnt = cache->mnt; + path.dentry = dentry; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); + if (IS_ERR(file)) { + trace_cachefiles_vfs_error(object, d_backing_inode(dentry), + PTR_ERR(file), + cachefiles_trace_open_error); + goto error; } - object->new = 0; - fscache_obtained_object(&object->fscache); - - _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); - return 0; - -no_space_error: - fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); -create_error: - _debug("create error %d", ret); - if (ret == -EIO) - cachefiles_io_error(cache, "Create/mkdir failed"); - goto error; + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + pr_notice("Cache does not support read_iter and write_iter\n"); + goto error_fput; + } + _debug("file -> %pd positive", dentry); -mark_active_timed_out: - _debug("mark active timed out"); - goto release_dentry; + ret = cachefiles_check_auxdata(object, file); + if (ret < 0) + goto check_failed; -check_error: - _debug("check error %d", ret); - cachefiles_mark_object_inactive( - cache, object, d_backing_inode(object->dentry)->i_blocks); -release_dentry: - dput(object->dentry); - object->dentry = NULL; - goto error_out; - -delete_error: - _debug("delete error %d", ret); - goto error_out2; + object->file = file; -lookup_error: - _debug("lookup error %ld", PTR_ERR(next)); - ret = PTR_ERR(next); - if (ret == -EIO) - cachefiles_io_error(cache, "Lookup failed"); - next = NULL; + /* Always update the atime on an object we've just looked up (this is + * used to keep track of culling, and atimes are only updated by read, + * write and readdir but not lookup or open). + */ + touch_atime(&file->f_path); + dput(dentry); + return true; + +check_failed: + fscache_cookie_lookup_negative(object->cookie); + cachefiles_unmark_inode_in_use(object, file); + if (ret == -ESTALE) { + fput(file); + dput(dentry); + return cachefiles_create_file(object); + } +error_fput: + fput(file); error: - inode_unlock(d_inode(dir)); - dput(next); -error_out2: - dput(dir); -error_out: - _leave(" = error %d", -ret); - return ret; + dput(dentry); + return false; } /* - * get a subdirectory + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go */ -struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, - struct dentry *dir, - const char *dirname) +bool cachefiles_look_up_object(struct cachefiles_object *object) { - struct dentry *subdir; - struct path path; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",,%s", dirname); - - /* search the current directory for the element name */ - inode_lock(d_inode(dir)); - -retry: - subdir = lookup_one_len(dirname, dir, strlen(dirname)); - if (IS_ERR(subdir)) { - if (PTR_ERR(subdir) == -ENOMEM) - goto nomem_d_alloc; - goto lookup_error; + _enter("OBJ%x,%s,", object->debug_id, object->d_name); + + /* Look up path "cache/vol/fanout/file". 
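
The fanout level keeps any single cache directory small: the low byte of the cookie's key hash picks one of the volume's 256 fanout subdirectories, volume->fanout[(u8)key_hash], as used throughout this file. Bucket selection is a plain truncation:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t key_hash = 0x9e3779b9;	/* example hash value */
		uint8_t  bucket	  = (uint8_t)key_hash;

		printf("fanout bucket %u of 256\n", bucket);	/* 185 */
		return 0;
	}
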
*/ + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_positive_unlocked(object->d_name, fan, + object->d_name_len); + else + dentry = ERR_PTR(ret); + trace_cachefiles_lookup(object, dentry); + if (IS_ERR(dentry)) { + if (dentry == ERR_PTR(-ENOENT)) + goto new_file; + if (dentry == ERR_PTR(-EIO)) + cachefiles_io_error_obj(object, "Lookup failed"); + return false; + } + + if (!d_is_reg(dentry)) { + pr_err("%pd is not a file\n", dentry); + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_bury_object(volume->cache, object, fan, dentry, + FSCACHE_OBJECT_IS_WEIRD); + dput(dentry); + if (ret < 0) + return false; + goto new_file; } - _debug("subdir -> %pd %s", - subdir, d_backing_inode(subdir) ? "positive" : "negative"); + if (!cachefiles_open_file(object, dentry)) + return false; - /* we need to create the subdir if it doesn't exist yet */ - if (d_is_negative(subdir)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto mkdir_error; + _leave(" = t [%lu]", file_inode(object->file)->i_ino); + return true; - _debug("attempt mkdir"); +new_file: + fscache_cookie_lookup_negative(object->cookie); + return cachefiles_create_file(object); +} - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_mkdir(&path, subdir, 0700); - if (ret < 0) - goto mkdir_error; - ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); - if (ret < 0) - goto mkdir_error; +/* + * Attempt to link a temporary file into its rightful place in the cache. + */ +bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; + bool success = false; + int ret; - if (unlikely(d_unhashed(subdir))) { - dput(subdir); - goto retry; + _enter(",%pD", object->file); + + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out_unlock; + } + + if (!d_is_negative(dentry)) { + if (d_backing_inode(dentry) == file_inode(object->file)) { + success = true; + goto out_dput; } - ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %pd{ino=%lu}", - subdir, d_backing_inode(subdir)->i_ino); - } - - inode_unlock(d_inode(dir)); - - /* we need to make sure the subdir is a directory */ - ASSERT(d_backing_inode(subdir)); + ret = cachefiles_unlink(volume->cache, object, fan, dentry, + FSCACHE_OBJECT_IS_STALE); + if (ret < 0) + goto out_dput; - if (!d_can_lookup(subdir)) { - pr_err("%s is not a directory\n", dirname); - ret = -EIO; - goto check_error; + dput(dentry); + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out_unlock; + } } - ret = -EPERM; - if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || - !d_backing_inode(subdir)->i_op->lookup || - !d_backing_inode(subdir)->i_op->mkdir || - !d_backing_inode(subdir)->i_op->create || - !d_backing_inode(subdir)->i_op->rename || - !d_backing_inode(subdir)->i_op->rmdir || - 
!d_backing_inode(subdir)->i_op->unlink) - goto check_error; - - _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); - return subdir; - -check_error: - dput(subdir); - _leave(" = %d [check]", ret); - return ERR_PTR(ret); - -mkdir_error: - inode_unlock(d_inode(dir)); - dput(subdir); - pr_err("mkdir %s failed with error %d\n", dirname, ret); - return ERR_PTR(ret); - -lookup_error: - inode_unlock(d_inode(dir)); - ret = PTR_ERR(subdir); - pr_err("Lookup %s failed with error %d\n", dirname, ret); - return ERR_PTR(ret); - -nomem_d_alloc: - inode_unlock(d_inode(dir)); - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_link(object->file->f_path.dentry, &init_user_ns, + d_inode(fan), dentry, NULL); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, + cachefiles_trace_link_error); + _debug("link fail %d", ret); + } else { + trace_cachefiles_link(object, file_inode(object->file)); + spin_lock(&object->lock); + /* TODO: Do we want to switch the file pointer to the new dentry? */ + clear_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + spin_unlock(&object->lock); + success = true; + } + +out_dput: + dput(dentry); +out_unlock: + inode_unlock(d_inode(fan)); + _leave(" = %u", success); + return success; } /* - * find out if an object is in use or not - * - if finds object and it's not in use: - * - returns a pointer to the object and a reference on it - * - returns with the directory locked + * Look up an inode to be checked or culled. Return -EBUSY if the inode is + * marked in use. */ -static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, - struct dentry *dir, - char *filename) +static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) { - struct cachefiles_object *object; - struct rb_node *_n; struct dentry *victim; - int ret; - - //_enter(",%pd/,%s", - // dir, filename); + int ret = -ENOENT; - /* look up the victim */ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); victim = lookup_one_len(filename, dir, strlen(filename)); if (IS_ERR(victim)) goto lookup_error; - - //_debug("victim -> %pd %s", - // victim, d_backing_inode(victim) ? 
"positive" : "negative"); - - /* if the object is no longer there then we probably retired the object - * at the netfs's request whilst the cull was in progress - */ - if (d_is_negative(victim)) { - inode_unlock(d_inode(dir)); - dput(victim); - _leave(" = -ENOENT [absent]"); - return ERR_PTR(-ENOENT); - } - - /* check to see if we're using this object */ - read_lock(&cache->active_lock); - - _n = cache->active_nodes.rb_node; - - while (_n) { - object = rb_entry(_n, struct cachefiles_object, active_node); - - if (object->dentry > victim) - _n = _n->rb_left; - else if (object->dentry < victim) - _n = _n->rb_right; - else - goto object_in_use; - } - - read_unlock(&cache->active_lock); - - //_leave(" = %pd", victim); + if (d_is_negative(victim)) + goto lookup_put; + if (d_inode(victim)->i_flags & S_KERNEL_FILE) + goto lookup_busy; return victim; -object_in_use: - read_unlock(&cache->active_lock); +lookup_busy: + ret = -EBUSY; +lookup_put: inode_unlock(d_inode(dir)); dput(victim); - //_leave(" = -EBUSY [in use]"); - return ERR_PTR(-EBUSY); + return ERR_PTR(ret); lookup_error: inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return ERR_PTR(-ESTALE); - } + if (ret == -ENOENT) + return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); @@ -931,46 +766,46 @@ lookup_error: ret = -EIO; } - _leave(" = %d", ret); return ERR_PTR(ret); } /* - * cull an object if it's not in use + * Cull an object if it's not in use * - called only by cache manager daemon */ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + struct inode *inode; int ret; _enter(",%pd/,%s", dir, filename); - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); - _debug("victim -> %pd %s", - victim, d_backing_inode(victim) ? "positive" : "negative"); - - /* okay... 
the victim is not being used so we can cull it - * - start by marking it as stale - */ - _debug("victim is cullable"); - - ret = cachefiles_remove_object_xattr(cache, victim); + /* check to see if someone is using this object */ + inode = d_inode(victim); + inode_lock(inode); + if (inode->i_flags & S_KERNEL_FILE) { + ret = -EBUSY; + } else { + /* Stop the cache from picking it back up */ + inode->i_flags |= S_KERNEL_FILE; + ret = 0; + } + inode_unlock(inode); if (ret < 0) goto error_unlock; - /* actually remove the victim (drops the dir mutex) */ - _debug("bury"); - - ret = cachefiles_bury_object(cache, NULL, dir, victim, false, + ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; + fscache_count_culled(); dput(victim); _leave(" = 0"); return 0; @@ -979,11 +814,8 @@ error_unlock: inode_unlock(d_inode(dir)); error: dput(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return -ESTALE; - } + if (ret == -ENOENT) + return -ESTALE; /* Probably got retired by the netfs */ if (ret != -ENOMEM) { pr_err("Internal error: %d\n", ret); @@ -995,7 +827,7 @@ error: } /* - * find out if an object is in use or not + * Find out if an object is in use or not * - called only by cache manager daemon * - returns -EBUSY or 0 to indicate whether an object is in use or not */ @@ -1003,16 +835,13 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + int ret = 0; - //_enter(",%pd/,%s", - // dir, filename); - - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); inode_unlock(d_inode(dir)); dput(victim); - //_leave(" = 0"); - return 0; + return ret; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c deleted file mode 100644 index fcf4f3b72923..000000000000 --- a/fs/cachefiles/rdwr.c +++ /dev/null @@ -1,972 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Storage object read/write - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/file.h> -#include <linux/swap.h> -#include "internal.h" - -/* - * detect wake up events generated by the unlocking of pages in which we're - * interested - * - we use this to detect read completion of backing pages - * - the caller holds the waitqueue lock - */ -static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, - int sync, void *_key) -{ - struct cachefiles_one_read *monitor = - container_of(wait, struct cachefiles_one_read, monitor); - struct cachefiles_object *object; - struct fscache_retrieval *op = monitor->op; - struct wait_page_key *key = _key; - struct folio *folio = wait->private; - - ASSERT(key); - - _enter("{%lu},%u,%d,{%p,%u}", - monitor->netfs_page->index, mode, sync, - key->folio, key->bit_nr); - - if (key->folio != folio || key->bit_nr != PG_locked) - return 0; - - _debug("--- monitor %p %lx ---", folio, folio->flags); - - if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { - /* unlocked, not uptodate and not erronous? 
*/ - _debug("page probably truncated"); - } - - /* remove from the waitqueue */ - list_del(&wait->entry); - - /* move onto the action list and queue for FS-Cache thread pool */ - ASSERT(op); - - /* We need to temporarily bump the usage count as we don't own a ref - * here otherwise cachefiles_read_copier() may free the op between the - * monitor being enqueued on the op->to_do list and the op getting - * enqueued on the work queue. - */ - fscache_get_retrieval(op); - - object = container_of(op->op.object, struct cachefiles_object, fscache); - spin_lock(&object->work_lock); - list_add_tail(&monitor->op_link, &op->to_do); - fscache_enqueue_retrieval(op); - spin_unlock(&object->work_lock); - - fscache_put_retrieval(op); - return 0; -} - -/* - * handle a probably truncated page - * - check to see if the page is still relevant and reissue the read if - * possible - * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we - * must wait again and 0 if successful - */ -static int cachefiles_read_reissue(struct cachefiles_object *object, - struct cachefiles_one_read *monitor) -{ - struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; - struct page *backpage = monitor->back_page, *backpage2; - int ret; - - _enter("{ino=%lx},{%lx,%lx}", - d_backing_inode(object->backer)->i_ino, - backpage->index, backpage->flags); - - /* skip if the page was truncated away completely */ - if (backpage->mapping != bmapping) { - _leave(" = -ENODATA [mapping]"); - return -ENODATA; - } - - backpage2 = find_get_page(bmapping, backpage->index); - if (!backpage2) { - _leave(" = -ENODATA [gone]"); - return -ENODATA; - } - - if (backpage != backpage2) { - put_page(backpage2); - _leave(" = -ENODATA [different]"); - return -ENODATA; - } - - /* the page is still there and we already have a ref on it, so we don't - * need a second */ - put_page(backpage2); - - INIT_LIST_HEAD(&monitor->op_link); - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - - if (trylock_page(backpage)) { - ret = -EIO; - if (PageError(backpage)) - goto unlock_discard; - ret = 0; - if (PageUptodate(backpage)) - goto unlock_discard; - - _debug("reissue read"); - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto discard; - } - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - /* it'll reappear on the todo list */ - _leave(" = -EINPROGRESS"); - return -EINPROGRESS; - -unlock_discard: - unlock_page(backpage); -discard: - spin_lock_irq(&object->work_lock); - list_del(&monitor->op_link); - spin_unlock_irq(&object->work_lock); - _leave(" = %d", ret); - return ret; -} - -/* - * copy data from backing pages to netfs pages to complete a read operation - * - driven by FS-Cache's thread pool - */ -static void cachefiles_read_copier(struct fscache_operation *_op) -{ - struct cachefiles_one_read *monitor; - struct cachefiles_object *object; - struct fscache_retrieval *op; - int error, max; - - op = container_of(_op, struct fscache_retrieval, op); - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); - - max = 8; - spin_lock_irq(&object->work_lock); - - while (!list_empty(&op->to_do)) { - monitor = list_entry(op->to_do.next, - struct cachefiles_one_read, op_link); - 
list_del(&monitor->op_link); - - spin_unlock_irq(&object->work_lock); - - _debug("- copy {%lu}", monitor->back_page->index); - - recheck: - if (test_bit(FSCACHE_COOKIE_INVALIDATING, - &object->fscache.cookie->flags)) { - error = -ESTALE; - } else if (PageUptodate(monitor->back_page)) { - copy_highpage(monitor->netfs_page, monitor->back_page); - fscache_mark_page_cached(monitor->op, - monitor->netfs_page); - error = 0; - } else if (!PageError(monitor->back_page)) { - /* the page has probably been truncated */ - error = cachefiles_read_reissue(object, monitor); - if (error == -EINPROGRESS) - goto next; - goto recheck; - } else { - cachefiles_io_error_obj( - object, - "Readpage failed on backing file %lx", - (unsigned long) monitor->back_page->flags); - error = -EIO; - } - - put_page(monitor->back_page); - - fscache_end_io(op, monitor->netfs_page, error); - put_page(monitor->netfs_page); - fscache_retrieval_complete(op, 1); - fscache_put_retrieval(op); - kfree(monitor); - - next: - /* let the thread pool have some air occasionally */ - max--; - if (max < 0 || need_resched()) { - if (!list_empty(&op->to_do)) - fscache_enqueue_retrieval(op); - _leave(" [maxed out]"); - return; - } - - spin_lock_irq(&object->work_lock); - } - - spin_unlock_irq(&object->work_lock); - _leave(""); -} - -/* - * read the corresponding page to the given set from the backing file - * - an uncertain page is simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file_one(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct page *netpage) -{ - struct cachefiles_one_read *monitor; - struct address_space *bmapping; - struct page *newpage, *backpage; - int ret; - - _enter(""); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->netfs_page = netpage; - monitor->op = fscache_get_retrieval(op); - - init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); - - /* attempt to get hold of the backing page */ - bmapping = d_backing_inode(object->backer)->i_mapping; - newpage = NULL; - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp); - if (!newpage) - goto nomem_monitor; - } - - ret = add_to_page_cache_lru(newpage, bmapping, - netpage->index, cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem_page; - } - - /* we've installed a new backing page, so now we need to start - * it reading */ -installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - -read_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* set the monitor to transfer the data across */ -monitor_backing_page: - _debug("- monitor add"); - - /* install the monitor */ - get_page(monitor->netfs_page); - get_page(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - goto success; - - /* if the 
backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ -backing_page_already_present: - _debug("- present"); - - if (newpage) { - put_page(newpage); - newpage = NULL; - } - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - if (!trylock_page(backpage)) - goto monitor_backing_page; - _debug("read %p {%lx}", backpage, backpage->flags); - goto read_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ -backing_page_already_uptodate: - _debug("- uptodate"); - - fscache_mark_page_cached(op, netpage); - - copy_highpage(netpage, backpage); - fscache_end_io(op, netpage, 0); - fscache_retrieval_complete(op, 1); - -success: - _debug("success"); - ret = 0; - -out: - if (backpage) - put_page(backpage); - if (monitor) { - fscache_put_retrieval(monitor->op); - kfree(monitor); - } - _leave(" = %d", ret); - return ret; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) { - fscache_retrieval_complete(op, 1); - goto out; - } -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - fscache_retrieval_complete(op, 1); - ret = -ENOBUFS; - goto out; - -nomem_page: - put_page(newpage); -nomem_monitor: - fscache_put_retrieval(monitor->op); - kfree(monitor); -nomem: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * read a page from the cache or allocate a block in which to store it - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - if the page is backed by a block in the cache: - * - a read will be started which will call the callback on completion - * - 0 will be returned - * - else if the page is unbacked: - * - the metadata will be retained - * - -ENODATA will be returned - */ -int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct inode *inode; - sector_t block; - unsigned shift; - int ret, ret2; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{%p},{%lx},,,", object, page->index); - - if (!object->backer) - goto enobufs; - - inode = d_backing_inode(object->backer); - ASSERT(S_ISREG(inode->i_mode)); - - /* calculate the shift required to use bmap */ - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - /* we assume the absence or presence of the first block is a good - * enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually good - * enough for this as it doesn't indicate errors, but it's all we've - * got for the moment - */ - block = page->index; - block <<= shift; - - ret2 = bmap(inode, &block); - ASSERT(ret2 == 0); - - _debug("%llx -> %llx", - (unsigned long long) (page->index << shift), - (unsigned long long) block); - - if (block) { - /* submit the apparently valid page to the backing fs to be - * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page); - } else if 
(cachefiles_has_space(cache, 0, 1) == 0) { - /* there's space in the cache we can use */ - fscache_mark_page_cached(op, page); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - goto enobufs; - } - - _leave(" = %d", ret); - return ret; - -enobufs: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} - -/* - * read the corresponding pages to the given set from the backing file - * - any uncertain pages are simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct list_head *list) -{ - struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; - struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; - int ret = 0; - - _enter(""); - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - if (!monitor) { - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->op = fscache_get_retrieval(op); - init_waitqueue_func_entry(&monitor->monitor, - cachefiles_read_waiter); - } - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp); - if (!newpage) - goto nomem; - } - - ret = add_to_page_cache_lru(newpage, bmapping, - netpage->index, - cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem; - } - - /* we've installed a new backing page, so now we need - * to start it reading */ - installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - - reread_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* add the netfs page to the pagecache and LRU, and set the - * monitor to transfer the data across */ - monitor_backing_page: - _debug("- monitor add"); - - ret = add_to_page_cache_lru(netpage, op->mapping, - netpage->index, cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - put_page(backpage); - backpage = NULL; - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - /* install a monitor */ - get_page(netpage); - monitor->netfs_page = netpage; - - get_page(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was - * installed, so the monitor may miss the event - so we have to - * ensure that we do get one in such a case */ - if (trylock_page(backpage)) { - _debug("2unlock %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - put_page(backpage); - backpage = NULL; - - put_page(netpage); - netpage = NULL; - continue; - - /* if the backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ - backing_page_already_present: - _debug("- present %p", backpage); - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - _debug("- not ready %p{%lx}", backpage, backpage->flags); - - if (!trylock_page(backpage)) - goto monitor_backing_page; - - if (PageError(backpage)) { - _debug("error %lx", 
backpage->flags); - unlock_page(backpage); - goto io_error; - } - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate_unlock; - - /* we've locked a page that's neither up to date nor erroneous, - * so we need to attempt to read it again */ - goto reread_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ - backing_page_already_uptodate_unlock: - _debug("uptodate %lx", backpage->flags); - unlock_page(backpage); - backing_page_already_uptodate: - _debug("- uptodate"); - - ret = add_to_page_cache_lru(netpage, op->mapping, - netpage->index, cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - put_page(backpage); - backpage = NULL; - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - copy_highpage(netpage, backpage); - - put_page(backpage); - backpage = NULL; - - fscache_mark_page_cached(op, netpage); - - /* the netpage is unlocked and marked up to date here */ - fscache_end_io(op, netpage, 0); - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - - netpage = NULL; - - _debug("out"); - -out: - /* tidy up */ - if (newpage) - put_page(newpage); - if (netpage) - put_page(netpage); - if (backpage) - put_page(backpage); - if (monitor) { - fscache_put_retrieval(op); - kfree(monitor); - } - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - put_page(netpage); - fscache_retrieval_complete(op, 1); - } - - _leave(" = %d", ret); - return ret; - -nomem: - _debug("nomem"); - ret = -ENOMEM; - goto record_page_complete; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) - goto record_page_complete; -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - ret = -ENOBUFS; -record_page_complete: - fscache_retrieval_complete(op, 1); - goto out; -} - -/* - * read a list of pages from the cache or allocate blocks in which to store - * them - */ -int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct list_head backpages; - struct pagevec pagevec; - struct inode *inode; - struct page *page, *_n; - unsigned shift, nrbackpages; - int ret, ret2, space; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{OBJ%x,%d},,%d,,", - object->fscache.debug_id, atomic_read(&op->op.usage), - *nr_pages); - - if (!object->backer) - goto all_enobufs; - - space = 1; - if (cachefiles_has_space(cache, 0, *nr_pages) < 0) - space = 0; - - inode = d_backing_inode(object->backer); - ASSERT(S_ISREG(inode->i_mode)); - - /* calculate the shift required to use bmap */ - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - pagevec_init(&pagevec); - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - INIT_LIST_HEAD(&backpages); - nrbackpages = 0; - - ret = space ? 
-ENODATA : -ENOBUFS; - list_for_each_entry_safe(page, _n, pages, lru) { - sector_t block; - - /* we assume the absence or presence of the first block is a - * good enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually - * good enough for this as it doesn't indicate errors, but - * it's all we've got for the moment - */ - block = page->index; - block <<= shift; - - ret2 = bmap(inode, &block); - ASSERT(ret2 == 0); - - _debug("%llx -> %llx", - (unsigned long long) (page->index << shift), - (unsigned long long) block); - - if (block) { - /* we have data - add it to the list to give to the - * backing fs */ - list_move(&page->lru, &backpages); - (*nr_pages)--; - nrbackpages++; - } else if (space && pagevec_add(&pagevec, page) == 0) { - fscache_mark_pages_cached(op, &pagevec); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - fscache_retrieval_complete(op, 1); - } - } - - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - - if (list_empty(pages)) - ret = 0; - - /* submit the apparently valid pages to the backing fs to be read from - * disk */ - if (nrbackpages > 0) { - ret2 = cachefiles_read_backing_file(object, op, &backpages); - if (ret2 == -ENOMEM || ret2 == -EINTR) - ret = ret2; - } - - _leave(" = %d [nr=%u%s]", - ret, *nr_pages, list_empty(pages) ? " empty" : ""); - return ret; - -all_enobufs: - fscache_retrieval_complete(op, *nr_pages); - return -ENOBUFS; -} - -/* - * allocate a block in the cache in which to store a page - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - otherwise: - * - the metadata will be retained - * - 0 will be returned - */ -int cachefiles_allocate_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,{%lx},", object, page->index); - - ret = cachefiles_has_space(cache, 0, 1); - if (ret == 0) - fscache_mark_page_cached(op, page); - else - ret = -ENOBUFS; - - fscache_retrieval_complete(op, 1); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate blocks in the cache in which to store a set of pages - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if some buffers couldn't be made available - * - returns -ENOBUFS if some pages are beyond EOF - * - otherwise: - * - -ENODATA will be returned - * - metadata will be retained for any page marked - */ -int cachefiles_allocate_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct pagevec pagevec; - struct page *page; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,,,%d,", object, *nr_pages); - - ret = cachefiles_has_space(cache, 0, *nr_pages); - if (ret == 0) { - pagevec_init(&pagevec); - - list_for_each_entry(page, pages, lru) { - if (pagevec_add(&pagevec, page) == 0) - fscache_mark_pages_cached(op, &pagevec); - } 
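[Editorial note: the read paths being deleted above decide whether the cache holds data for a page by probing the first backing block with bmap(), a shortcut the in-code TODO already concedes is imperfect. As a rough illustration of that probe, here is a hypothetical user-space analogue built on the FIBMAP/FIGETBSZ ioctls; the 4 KiB page-size assumption and the program itself are mine, FIBMAP generally needs root, and this is a sketch of the idea rather than code from this patch.

	/* Probe whether the first block backing a page offset is allocated,
	 * mirroring the old bmap()-based check: 0 means a hole, i.e. no
	 * cached data for that page. */
	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd, bsz, block;
		long pgoff;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <cache-file> <page-index>\n", argv[0]);
			return 2;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, FIGETBSZ, &bsz) < 0) {
			perror("FIGETBSZ");
			return 1;
		}
		if (bsz > 4096) {
			/* The kernel code likewise refused blocksize > PAGE_SIZE. */
			fprintf(stderr, "block size %d exceeds assumed page size\n", bsz);
			return 1;
		}
		pgoff = atol(argv[2]);
		block = pgoff * (4096 / bsz);	/* same shift the kernel computed */
		if (ioctl(fd, FIBMAP, &block) < 0) {
			perror("FIBMAP");
			return 1;
		}
		printf("page %ld -> %s\n", pgoff,
		       block ? "backed by a disk block" : "hole (no data cached)");
		close(fd);
		return 0;
	}

End of editorial note.]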
- - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - ret = -ENODATA; - } else { - ret = -ENOBUFS; - } - - fscache_retrieval_complete(op, *nr_pages); - _leave(" = %d", ret); - return ret; -} - -/* - * request a page be stored in the cache - * - cache withdrawal is prevented by the caller - * - this request may be ignored if there's no cache block available, in which - * case -ENOBUFS will be returned - * - if the op is in progress, 0 will be returned - */ -int cachefiles_write_page(struct fscache_storage *op, struct page *page) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct file *file; - struct path path; - loff_t pos, eof; - size_t len; - void *data; - int ret = -ENOBUFS; - - ASSERT(op != NULL); - ASSERT(page != NULL); - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("%p,%p{%lx},,,", object, page, page->index); - - if (!object->backer) { - _leave(" = -ENOBUFS"); - return -ENOBUFS; - } - - ASSERT(d_is_reg(object->backer)); - - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - pos = (loff_t)page->index << PAGE_SHIFT; - - /* We mustn't write more data than we have, so we have to beware of a - * partial page at EOF. - */ - eof = object->fscache.store_limit_l; - if (pos >= eof) - goto error; - - /* write the page to the backing filesystem and let it store it in its - * own time */ - path.mnt = cache->mnt; - path.dentry = object->backer; - file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto error_2; - } - - len = PAGE_SIZE; - if (eof & ~PAGE_MASK) { - if (eof - pos < PAGE_SIZE) { - _debug("cut short %llx to %llx", - pos, eof); - len = eof - pos; - ASSERTCMP(pos + len, ==, eof); - } - } - - data = kmap(page); - ret = kernel_write(file, data, len, &pos); - kunmap(page); - fput(file); - if (ret != len) - goto error_eio; - - _leave(" = 0"); - return 0; - -error_eio: - ret = -EIO; -error_2: - if (ret == -EIO) - cachefiles_io_error_obj(object, - "Write page to backing file failed"); -error: - _leave(" = -ENOBUFS [%d]", ret); - return -ENOBUFS; -} - -/* - * detach a backing block from a page - * - cache withdrawal is prevented by the caller - */ -void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) - __releases(&object->fscache.cookie->lock) -{ - struct cachefiles_object *object; - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("%p,{%lu}", object, page->index); - - spin_unlock(&object->fscache.cookie->lock); -} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index aec13fd94692..fe777164f1d8 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles security management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c new file mode 100644 index 000000000000..89df0ba8ba5e --- /dev/null +++ b/fs/cachefiles/volume.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Volume handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include "internal.h" +#include <trace/events/fscache.h> + +/* + * Allocate and set up a volume representation. We make sure all the fanout + * directories are created and pinned. + */ +void cachefiles_acquire_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume; + struct cachefiles_cache *cache = vcookie->cache->cache_priv; + const struct cred *saved_cred; + struct dentry *vdentry, *fan; + size_t len; + char *name; + bool is_new = false; + int ret, n_accesses, i; + + _enter(""); + + volume = kzalloc(sizeof(struct cachefiles_volume), GFP_KERNEL); + if (!volume) + return; + volume->vcookie = vcookie; + volume->cache = cache; + INIT_LIST_HEAD(&volume->cache_link); + + cachefiles_begin_secure(cache, &saved_cred); + + len = vcookie->key[0]; + name = kmalloc(len + 3, GFP_NOFS); + if (!name) + goto error_vol; + name[0] = 'I'; + memcpy(name + 1, vcookie->key + 1, len); + name[len + 1] = 0; + +retry: + vdentry = cachefiles_get_directory(cache, cache->store, name, &is_new); + if (IS_ERR(vdentry)) + goto error_name; + volume->dentry = vdentry; + + if (is_new) { + if (!cachefiles_set_volume_xattr(volume)) + goto error_dir; + } else { + ret = cachefiles_check_volume_xattr(volume); + if (ret < 0) { + if (ret != -ESTALE) + goto error_dir; + inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT); + cachefiles_bury_object(cache, NULL, cache->store, vdentry, + FSCACHE_VOLUME_IS_WEIRD); + cachefiles_put_directory(volume->dentry); + cond_resched(); + goto retry; + } + } + + for (i = 0; i < 256; i++) { + sprintf(name, "@%02x", i); + fan = cachefiles_get_directory(cache, vdentry, name, NULL); + if (IS_ERR(fan)) + goto error_fan; + volume->fanout[i] = fan; + } + + cachefiles_end_secure(cache, saved_cred); + + vcookie->cache_priv = volume; + n_accesses = atomic_inc_return(&vcookie->n_accesses); /* Stop wakeups on dec-to-0 */ + trace_fscache_access_volume(vcookie->debug_id, 0, + refcount_read(&vcookie->ref), + n_accesses, fscache_access_cache_pin); + + spin_lock(&cache->object_list_lock); + list_add(&volume->cache_link, &volume->cache->volumes); + spin_unlock(&cache->object_list_lock); + + kfree(name); + return; + +error_fan: + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); +error_dir: + cachefiles_put_directory(volume->dentry); +error_name: + kfree(name); +error_vol: + kfree(volume); + cachefiles_end_secure(cache, saved_cred); +} + +/* + * Release a volume representation. 
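+ * The cookie's cache_priv pointer is cleared, the 256 pinned fanout
+ * subdirectories and the volume directory itself are released, and the
+ * representation is freed.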
+ */ +static void __cachefiles_free_volume(struct cachefiles_volume *volume) +{ + int i; + + _enter(""); + + volume->vcookie->cache_priv = NULL; + + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); + cachefiles_put_directory(volume->dentry); + kfree(volume); +} + +void cachefiles_free_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume = vcookie->cache_priv; + + if (volume) { + spin_lock(&volume->cache->object_list_lock); + list_del_init(&volume->cache_link); + spin_unlock(&volume->cache->object_list_lock); + __cachefiles_free_volume(volume); + } +} + +void cachefiles_withdraw_volume(struct cachefiles_volume *volume) +{ + fscache_withdraw_volume(volume->vcookie); + cachefiles_set_volume_xattr(volume); + __cachefiles_free_volume(volume); +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 9e82de668595..83f41bd0c3a9 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles extended attribute management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -15,310 +15,245 @@ #include <linux/slab.h> #include "internal.h" +#define CACHEFILES_COOKIE_TYPE_DATA 1 + +struct cachefiles_xattr { + __be64 object_size; /* Actual size of the object */ + __be64 zero_point; /* Size after which server has no data not written by us */ + __u8 type; /* Type of object */ + __u8 content; /* Content presence (enum cachefiles_content) */ + __u8 data[]; /* netfs coherency data */ +} __packed; + static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; /* - * check the type label on an object - * - done using xattrs + * set the state xattr on a cache file */ -int cachefiles_check_object_type(struct cachefiles_object *object) +int cachefiles_set_object_xattr(struct cachefiles_object *object) { - struct dentry *dentry = object->dentry; - char type[3], xtype[3]; + struct cachefiles_xattr *buf; + struct dentry *dentry; + struct file *file = object->file; + unsigned int len = object->cookie->aux_len; int ret; - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - - if (!object->fscache.cookie) - strcpy(type, "C3"); - else - snprintf(type, 3, "%02x", object->fscache.cookie->def->type); - - _enter("%x{%s}", object->fscache.debug_id, type); + if (!file) + return -ESTALE; + dentry = file->f_path.dentry; - /* attempt to install a type label directly */ - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, - 2, XATTR_CREATE); - if (ret == 0) { - _debug("SET"); /* we succeeded */ - goto error; - } + _enter("%x,#%d", object->debug_id, len); - if (ret != -EEXIST) { - pr_err("Can't set xattr on %pd [%lu] (err %d)\n", - dentry, d_backing_inode(dentry)->i_ino, - -ret); - goto error; - } + buf = kmalloc(sizeof(struct cachefiles_xattr) + len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - /* read the current type label */ - ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype, - 3); + buf->object_size = cpu_to_be64(object->cookie->object_size); + buf->zero_point = 0; + buf->type = CACHEFILES_COOKIE_TYPE_DATA; + buf->content = object->content_info; + if (test_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags)) + buf->content = CACHEFILES_CONTENT_DIRTY; + if (len > 0) + memcpy(buf->data, fscache_get_aux(object->cookie), len); + + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = 
vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + buf, sizeof(struct cachefiles_xattr) + len, 0); if (ret < 0) { - if (ret == -ERANGE) - goto bad_type_length; - - pr_err("Can't read xattr on %pd [%lu] (err %d)\n", - dentry, d_backing_inode(dentry)->i_ino, - -ret); - goto error; + trace_cachefiles_vfs_error(object, file_inode(file), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, + cachefiles_coherency_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + } else { + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, + cachefiles_coherency_set_ok); } - /* check the type is what we're expecting */ - if (ret != 2) - goto bad_type_length; - - if (xtype[0] != type[0] || xtype[1] != type[1]) - goto bad_type; - - ret = 0; - -error: + kfree(buf); _leave(" = %d", ret); return ret; - -bad_type_length: - pr_err("Cache object %lu type xattr length incorrect\n", - d_backing_inode(dentry)->i_ino); - ret = -EIO; - goto error; - -bad_type: - xtype[2] = 0; - pr_err("Cache object %pd [%lu] type %s not %s\n", - dentry, d_backing_inode(dentry)->i_ino, - xtype, type); - ret = -EIO; - goto error; } /* - * set the state xattr on a cache file + * check the consistency between the backing cache and the FS-Cache cookie */ -int cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file) { - struct dentry *dentry = object->dentry; - int ret; - - ASSERT(dentry); - - _enter("%p,#%d", object, auxdata->len); + struct cachefiles_xattr *buf; + struct dentry *dentry = file->f_path.dentry; + unsigned int len = object->cookie->aux_len, tlen; + const void *p = fscache_get_aux(object->cookie); + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); + tlen = sizeof(struct cachefiles_xattr) + len; + buf = kmalloc(tlen, GFP_KERNEL); + if (!buf) + return -ENOMEM; - clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, XATTR_CREATE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to set xattr with error %d", ret); + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, tlen); + if (xlen != tlen) { + if (xlen < 0) + trace_cachefiles_vfs_error(object, file_inode(file), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) + cachefiles_io_error_obj( + object, + "Failed to read aux with error %zd", xlen); + why = cachefiles_coherency_check_xattr; + } else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { + why = cachefiles_coherency_check_type; + } else if (memcmp(buf->data, p, len) != 0) { + why = cachefiles_coherency_check_aux; + } else if (be64_to_cpu(buf->object_size) != object->cookie->object_size) { + why = cachefiles_coherency_check_objsize; + } else if (buf->content == CACHEFILES_CONTENT_DIRTY) { + // TODO: Begin conflict resolution + pr_warn("Dirty object in cache\n"); + why = cachefiles_coherency_check_dirty; + } else { + why = cachefiles_coherency_check_ok; + ret = 0; + } - _leave(" = %d", ret); + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, why); + kfree(buf); return ret; } 
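[Editorial note: since cachefiles_check_auxdata() above validates an object entirely from the "user.CacheFiles.cache" xattr, the marker can also be inspected offline. The following hypothetical user-space decoder is a minimal sketch assuming the __packed layout of struct cachefiles_xattr introduced in this patch (two big-endian 64-bit sizes, a type byte, a content byte, then the raw netfs coherency data); it is illustrative, not part of the patch.

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	/* Mirror of the on-disk header defined in fs/cachefiles/xattr.c. */
	struct cachefiles_xattr_hdr {
		uint64_t object_size;		/* big-endian on disk */
		uint64_t zero_point;		/* big-endian on disk */
		uint8_t  type;			/* 1 == CACHEFILES_COOKIE_TYPE_DATA */
		uint8_t  content;		/* enum cachefiles_content */
	} __attribute__((packed));

	int main(int argc, char **argv)
	{
		unsigned char buf[512];
		struct cachefiles_xattr_hdr hdr;
		ssize_t len;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <cache-file>\n", argv[0]);
			return 2;
		}
		len = getxattr(argv[1], "user.CacheFiles.cache", buf, sizeof(buf));
		if (len < 0) {
			perror("getxattr");
			return 1;
		}
		if ((size_t)len < sizeof(hdr)) {
			fprintf(stderr, "xattr too short (%zd bytes)\n", len);
			return 1;
		}
		memcpy(&hdr, buf, sizeof(hdr));
		printf("object_size=%llu zero_point=%llu type=%u content=%u aux=%zu bytes\n",
		       (unsigned long long)be64toh(hdr.object_size),
		       (unsigned long long)be64toh(hdr.zero_point),
		       hdr.type, hdr.content, (size_t)len - sizeof(hdr));
		return 0;
	}

Folding the object size and the coherency data into a single xattr is what lets the new open path validate an object with one vfs_getxattr() plus a memcmp(), replacing the old multi-step type-label and check_aux round trips. End of editorial note.]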
/* - * update the state xattr on a cache file + * remove the object's xattr to mark it stale */ -int cachefiles_update_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dentry) { - struct dentry *dentry = object->dentry; int ret; - if (!dentry) - return -ESTALE; - - _enter("%x,#%d", object->fscache.debug_id, auxdata->len); - - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); - - clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, XATTR_REPLACE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to update xattr with error %d", ret); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(dentry), ret, + cachefiles_trace_remxattr_error); + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + d_backing_inode(dentry)->i_ino, -ret); + } _leave(" = %d", ret); return ret; } /* - * check the consistency between the backing cache and the FS-Cache cookie + * Stick a marker on the cache object to indicate that it's dirty. */ -int cachefiles_check_auxdata(struct cachefiles_object *object) +void cachefiles_prepare_to_write(struct fscache_cookie *cookie) { - struct cachefiles_xattr *auxbuf; - enum fscache_checkaux validity; - struct dentry *dentry = object->dentry; - ssize_t xlen; - int ret; - - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - ASSERT(object->fscache.cookie->def->check_aux); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); - if (!auxbuf) - return -ENOMEM; + const struct cred *saved_cred; + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxbuf->type, 512 + 1); - ret = -ESTALE; - if (xlen < 1 || - auxbuf->type != object->fscache.cookie->def->type) - goto error; + _enter("c=%08x", object->cookie->debug_id); - xlen--; - validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen, - i_size_read(d_backing_inode(dentry))); - if (validity != FSCACHE_CHECKAUX_OKAY) - goto error; - - ret = 0; -error: - kfree(auxbuf); - return ret; + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_set_object_xattr(object); + cachefiles_end_secure(cache, saved_cred); + } } /* - * check the state xattr on a cache file - * - return -ESTALE if the object should be deleted + * Set the state xattr on a volume directory. 
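+ * The volume cookie's coherency blob is written verbatim - unlike file
+ * objects, no struct cachefiles_xattr header is prepended.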
*/ -int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) { - struct cachefiles_xattr *auxbuf; - struct dentry *dentry = object->dentry; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + struct dentry *dentry = volume->dentry; int ret; - _enter("%p,#%d", object, auxdata->len); - - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); - if (!auxbuf) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } + _enter("%x,#%d", volume->vcookie->debug_id, len); - /* read the current type label */ - ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxbuf->type, 512 + 1); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + p, len, 0); if (ret < 0) { - if (ret == -ENODATA) - goto stale; /* no attribute - power went off - * mid-cull? */ - - if (ret == -ERANGE) - goto bad_type_length; - - cachefiles_io_error_obj(object, - "Can't read xattr on %lu (err %d)", - d_backing_inode(dentry)->i_ino, -ret); - goto error; + trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error( + volume->cache, "Failed to set xattr with error %d", ret); + } else { + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_ok); } - /* check the on-disk object */ - if (ret < 1) - goto bad_type_length; - - if (auxbuf->type != auxdata->type) - goto stale; - - auxbuf->len = ret; - - /* consult the netfs */ - if (object->fscache.cookie->def->check_aux) { - enum fscache_checkaux result; - unsigned int dlen; - - dlen = auxbuf->len - 1; - - _debug("checkaux %s #%u", - object->fscache.cookie->def->name, dlen); - - result = fscache_check_aux(&object->fscache, - &auxbuf->data, dlen, - i_size_read(d_backing_inode(dentry))); - - switch (result) { - /* entry okay as is */ - case FSCACHE_CHECKAUX_OKAY: - goto okay; - - /* entry requires update */ - case FSCACHE_CHECKAUX_NEEDS_UPDATE: - break; - - /* entry requires deletion */ - case FSCACHE_CHECKAUX_OBSOLETE: - goto stale; - - default: - BUG(); - } - - /* update the current label */ - ret = vfs_setxattr(&init_user_ns, dentry, - cachefiles_xattr_cache, &auxdata->type, - auxdata->len, XATTR_REPLACE); - if (ret < 0) { - cachefiles_io_error_obj(object, - "Can't update xattr on %lu" - " (error %d)", - d_backing_inode(dentry)->i_ino, -ret); - goto error; - } - } - -okay: - ret = 0; - -error: - kfree(auxbuf); _leave(" = %d", ret); - return ret; - -bad_type_length: - pr_err("Cache object %lu xattr length incorrect\n", - d_backing_inode(dentry)->i_ino); - ret = -EIO; - goto error; - -stale: - ret = -ESTALE; - goto error; + return ret == 0; } /* - * remove the object's xattr to mark it stale + * Check the consistency between the backing cache and the volume cookie. 
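+ * Returns 0 if the stored blob matches the cookie's coherency data and
+ * -ESTALE if it does not match or cannot be read.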
*/ -int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, - struct dentry *dentry) +int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - int ret; + struct cachefiles_xattr *buf; + struct dentry *dentry = volume->dentry; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; - ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); - if (ret < 0) { - if (ret == -ENOENT || ret == -ENODATA) - ret = 0; - else if (ret != -ENOMEM) - cachefiles_io_error(cache, - "Can't remove xattr from %lu" - " (error %d)", - d_backing_inode(dentry)->i_ino, -ret); + _enter(""); + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + if (xlen != len) { + if (xlen < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) + cachefiles_io_error( + volume->cache, + "Failed to read xattr with error %zd", xlen); + } + why = cachefiles_coherency_vol_check_xattr; + } else if (memcmp(buf->data, p, len) != 0) { + why = cachefiles_coherency_vol_check_cmp; + } else { + why = cachefiles_coherency_vol_check_ok; + ret = 0; } + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, why); + kfree(buf); _leave(" = %d", ret); return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e53c8541f5b2..b3d9459c9bbd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -4,8 +4,8 @@ #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/writeback.h> /* generic_writepages */ #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> @@ -126,7 +126,7 @@ static int ceph_set_page_dirty(struct page *page) BUG_ON(PagePrivate(page)); attach_page_private(page, snapc); - return __set_page_dirty_nobuffers(page); + return ceph_fscache_set_page_dirty(page); } /* @@ -141,8 +141,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - wait_on_page_fscache(page); - inode = page->mapping->host; ci = ceph_inode(inode); @@ -153,28 +151,36 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, } WARN_ON(!PageLocked(page)); - if (!PagePrivate(page)) - return; + if (PagePrivate(page)) { + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); + snapc = detach_page_private(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + } - snapc = detach_page_private(page); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + wait_on_page_fscache(page); } static int ceph_releasepage(struct page *page, gfp_t gfp) { - dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, - page, page->index, PageDirty(page) ? "" : "not "); + struct inode *inode = page->mapping->host; + + dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n", + ceph_vinop(inode), page, + page->index, PageDirty(page) ? 
"" : "not "); + + if (PagePrivate(page)) + return 0; if (PageFsCache(page)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return 0; wait_on_page_fscache(page); } - return !PagePrivate(page); + ceph_fscache_note_page_release(inode); + return 1; } static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) @@ -378,6 +384,38 @@ static void ceph_readahead(struct readahead_control *ractl) netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); } +#ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + set_page_fscache(page); +} + +static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) +{ + struct inode *inode = priv; + + if (IS_ERR_VALUE(error) && error != -ENOBUFS) + ceph_fscache_invalidate(inode, false); +} + +static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), + ceph_fscache_write_terminated, inode, caching); +} +#else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + +static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ +} +#endif /* CONFIG_CEPH_FSCACHE */ + struct ceph_writeback_ctl { loff_t i_size; @@ -493,6 +531,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; + bool caching = ceph_is_cache_enabled(inode); dout("writepage %p idx %lu\n", page, page->index); @@ -531,16 +570,17 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - set_page_writeback(page); req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + if (IS_ERR(req)) return PTR_ERR(req); - } + + set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); + ceph_fscache_write_to_cache(inode, page_off, len, caching); /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); @@ -599,6 +639,9 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; BUG_ON(!inode); ihold(inode); + + wait_on_page_fscache(page); + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 @@ -720,6 +763,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; bool done = false; + bool caching = ceph_is_cache_enabled(inode); dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? 
"NONE" : @@ -843,7 +887,7 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || PageFsCache(page)) { if (wbc->sync_mode == WB_SYNC_NONE) { dout("%p under writeback\n", page); unlock_page(page); @@ -851,6 +895,7 @@ get_more_pages: } dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); + wait_on_page_fscache(page); } if (!clear_page_dirty_for_io(page)) { @@ -983,9 +1028,19 @@ new_request: op_idx = 0; for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ if (offset + len != cur_offset) { + /* If it's full, stop here */ if (op_idx + 1 == req->r_num_ops) break; + + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); dout("writepages got pages at %llu~%llu\n", @@ -996,14 +1051,17 @@ new_request: osd_req_op_extent_update(req, op_idx, len); len = 0; - offset = cur_offset; + offset = cur_offset; data_pages = pages + i; op_idx++; } set_page_writeback(pages[i]); + if (caching) + ceph_set_page_fscache(pages[i]); len += thp_size(page); } + ceph_fscache_write_to_cache(inode, offset, len, caching); if (ceph_wbc.size_stable) { len = min(len, ceph_wbc.i_size - offset); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 457afda5498a..7d22850623ef 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -12,199 +12,99 @@ #include "super.h" #include "cache.h" -struct fscache_netfs ceph_cache_netfs = { - .name = "ceph", - .version = 0, -}; - -static DEFINE_MUTEX(ceph_fscache_lock); -static LIST_HEAD(ceph_fscache_list); - -struct ceph_fscache_entry { - struct list_head list; - struct fscache_cookie *fscache; - size_t uniq_len; - /* The following members must be last */ - struct ceph_fsid fsid; - char uniquifier[]; -}; - -static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { - .name = "CEPH.fsid", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -int __init ceph_fscache_register(void) -{ - return fscache_register_netfs(&ceph_cache_netfs); -} - -void ceph_fscache_unregister(void) -{ - fscache_unregister_netfs(&ceph_cache_netfs); -} - -int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - const struct ceph_fsid *fsid = &fsc->client->fsid; - const char *fscache_uniq = fsc->mount_options->fscache_uniq; - size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; - struct ceph_fscache_entry *ent; - int err = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (memcmp(&ent->fsid, fsid, sizeof(*fsid))) - continue; - if (ent->uniq_len != uniq_len) - continue; - if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) - continue; - - errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", - fsid); - err = -EBUSY; - goto out_unlock; - } + /* No caching for filesystem? 
*/ + if (!fsc->fscache) + return; - ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL); - if (!ent) { - err = -ENOMEM; - goto out_unlock; - } + /* Regular files only */ + if (!S_ISREG(inode->i_mode)) + return; - memcpy(&ent->fsid, fsid, sizeof(*fsid)); - if (uniq_len > 0) { - memcpy(&ent->uniquifier, fscache_uniq, uniq_len); - ent->uniq_len = uniq_len; - } + /* Only new inodes! */ + if (!(inode->i_state & I_NEW)) + return; - fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, - &ceph_fscache_fsid_object_def, - &ent->fsid, sizeof(ent->fsid) + uniq_len, - NULL, 0, - fsc, 0, true); + WARN_ON_ONCE(ci->fscache); - if (fsc->fscache) { - ent->fscache = fsc->fscache; - list_add_tail(&ent->list, &ceph_fscache_list); - } else { - kfree(ent); - errorfc(fc, "unable to register fscache cookie for fsid %pU", - fsid); - /* all other fs ignore this error */ - } -out_unlock: - mutex_unlock(&ceph_fscache_lock); - return err; + ci->fscache = fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); } -static enum fscache_checkaux ceph_fscache_inode_check_aux( - void *cookie_netfs_data, const void *data, uint16_t dlen, - loff_t object_size) +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) { - struct ceph_inode_info* ci = cookie_netfs_data; - struct inode* inode = &ci->vfs_inode; + struct fscache_cookie *cookie = ci->fscache; - if (dlen != sizeof(ci->i_version) || - i_size_read(inode) != object_size) - return FSCACHE_CHECKAUX_OBSOLETE; + fscache_relinquish_cookie(cookie, false); +} - if (*(u64 *)data != ci->i_version) - return FSCACHE_CHECKAUX_OBSOLETE; +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) +{ + struct ceph_inode_info *ci = ceph_inode(inode); - dout("ceph inode 0x%p cached okay\n", ci); - return FSCACHE_CHECKAUX_OKAY; + fscache_use_cookie(ci->fscache, will_modify); } -static const struct fscache_cookie_def ceph_fscache_inode_object_def = { - .name = "CEPH.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = ceph_fscache_inode_check_aux, -}; - -void ceph_fscache_register_inode_cookie(struct inode *inode) +void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - - /* No caching for filesystem */ - if (!fsc->fscache) - return; - /* Only cache for regular files that are read only */ - if (!S_ISREG(inode->i_mode)) - return; + if (update) { + loff_t i_size = i_size_read(inode); - inode_lock_nested(inode, I_MUTEX_CHILD); - if (!ci->fscache) { - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - &ci->i_vino, sizeof(ci->i_vino), - &ci->i_version, sizeof(ci->i_version), - ci, i_size_read(inode), false); + fscache_unuse_cookie(ci->fscache, &ci->i_version, &i_size); + } else { + fscache_unuse_cookie(ci->fscache, NULL, NULL); } - inode_unlock(inode); } -void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +void ceph_fscache_update(struct inode *inode) { - struct fscache_cookie* cookie; - - if ((cookie = ci->fscache) == NULL) - return; - - ci->fscache = NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + loff_t i_size = i_size_read(inode); - fscache_relinquish_cookie(cookie, &ci->i_vino, false); + fscache_update_cookie(ci->fscache, &ci->i_version, &i_size); } -static bool ceph_fscache_can_enable(void *data) +void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { - struct inode 
*inode = data; - return !inode_is_open_for_write(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_invalidate(ceph_inode(inode)->fscache, + &ci->i_version, i_size_read(inode), + dio_write ? FSCACHE_INVAL_DIO_WRITE : 0); } -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { - struct ceph_inode_info *ci = ceph_inode(inode); + const struct ceph_fsid *fsid = &fsc->client->fsid; + const char *fscache_uniq = fsc->mount_options->fscache_uniq; + size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; + char *name; + int err = 0; - if (!fscache_cookie_valid(ci->fscache)) - return; + name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "", + uniq_len ? fscache_uniq : ""); + if (!name) + return -ENOMEM; - if (inode_is_open_for_write(inode)) { - dout("fscache_file_set_cookie %p %p disabling cache\n", - inode, filp); - fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - } else { - fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), - ceph_fscache_can_enable, inode); - if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabling cache\n", - inode, filp); - } + fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(fsc->fscache)) { + errorfc(fc, "Unable to register fscache cookie for %s", name); + err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP; + fsc->fscache = NULL; } + kfree(name); + return err; } void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fscache_cookie_valid(fsc->fscache)) { - struct ceph_fscache_entry *ent; - bool found = false; - - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (ent->fscache == fsc->fscache) { - list_del(&ent->list); - kfree(ent); - found = true; - break; - } - } - WARN_ON_ONCE(!found); - mutex_unlock(&ceph_fscache_lock); - - __fscache_relinquish_cookie(fsc->fscache, NULL, false); - } - fsc->fscache = NULL; + fscache_relinquish_volume(fsc->fscache, NULL, false); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 058ea2a04376..09164389fa66 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -12,19 +12,19 @@ #include <linux/netfs.h> #ifdef CONFIG_CEPH_FSCACHE - -extern struct fscache_netfs ceph_cache_netfs; - -int ceph_fscache_register(void); -void ceph_fscache_unregister(void); +#include <linux/fscache.h> int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); + +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify); +void ceph_fscache_unuse_cookie(struct inode *inode, bool update); + +void ceph_fscache_update(struct inode *inode); +void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { @@ -36,37 +36,51 @@ static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info return ci->fscache; } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { - fscache_invalidate(ceph_inode(inode)->fscache); + struct ceph_inode_info *ci = 
ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + if (cookie) { + ceph_fscache_use_cookie(inode, true); + fscache_resize_cookie(cookie, to); + ceph_fscache_unuse_cookie(inode, true); + } } -static inline bool ceph_is_cache_enabled(struct inode *inode) +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); + fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); +} + +static inline int ceph_fscache_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); - if (!cookie) - return false; - return fscache_cookie_enabled(cookie); + return fscache_set_page_dirty(page, ceph_fscache_cookie(ci)); } static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); } -#else -static inline int ceph_fscache_register(void) +static inline bool ceph_is_cache_enabled(struct inode *inode) { - return 0; + return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_unregister(void) +static inline void ceph_fscache_note_page_release(struct inode *inode) { -} + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_note_page_release(ceph_fscache_cookie(ci)); +} +#else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { @@ -81,28 +95,49 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { - return NULL; } -static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) { } -static inline void ceph_fscache_file_set_cookie(struct inode *inode, - struct file *filp) +static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_update(struct inode *inode) { } +static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write) +{ +} + +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) +{ +} + +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) +{ +} + +static inline int ceph_fscache_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; @@ -112,6 +147,10 @@ static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { return -ENOBUFS; } -#endif -#endif /* _CEPH_CACHE_H */ +static inline void ceph_fscache_note_page_release(struct inode *inode) +{ +} +#endif /* CONFIG_CEPH_FSCACHE */ + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b9460b6fb76f..7d2c33cdbac6 
100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1856,7 +1856,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); + ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); @@ -2388,6 +2388,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); dout("write_inode %p wait=%d\n", inode, wait); + ceph_fscache_unpin_writeback(inode, wbc); if (wait) { dirty = try_flush_caps(inode, &flush_tid); if (dirty) @@ -4350,7 +4351,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); int bits = (fmode << 1) | 1; - bool is_opened = false; + bool already_opened = false; int i; if (count == 1) @@ -4358,19 +4359,19 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { - if (bits & (1 << i)) - ci->i_nr_by_mode[i] += count; - /* - * If any of the mode ref is larger than 1, + * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. */ - if (i && ci->i_nr_by_mode[i] > 1) - is_opened = true; + if (i && ci->i_nr_by_mode[i]) + already_opened = true; + + if (bits & (1 << i)) + ci->i_nr_by_mode[i] += count; } - if (!is_opened) + if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 02a0a0fd9ccd..9d9304e712d9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -248,8 +248,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) switch (inode->i_mode & S_IFMT) { case S_IFREG: - ceph_fscache_register_inode_cookie(inode); - ceph_fscache_file_set_cookie(inode, file); + ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE); fallthrough; case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, @@ -605,13 +604,25 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); in.cap.flags = CEPH_CAP_FLAG_AUTH; in.ctime = in.mtime = in.atime = iinfo.btime; - in.mode = cpu_to_le32((u32)mode); in.truncate_seq = cpu_to_le32(1); in.truncate_size = cpu_to_le64(-1ULL); in.xattr_version = cpu_to_le64(1); in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); - in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? - dir->i_gid : current_fsgid())); + if (dir->i_mode & S_ISGID) { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); + + /* Directories always inherit the setgid bit. 
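For other new files, S_ISGID is conditionally stripped just below, much like the generic inode_init_owner() logic: the bit survives only if the creator is in the directory's group or has CAP_FSETID.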
*/ + if (S_ISDIR(mode)) + mode |= S_ISGID; + else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && + !in_group_p(dir->i_gid) && + !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID)) + mode &= ~S_ISGID; + } else { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); + } + in.mode = cpu_to_le32((u32)mode); + in.nlink = cpu_to_le32(1); in.max_size = cpu_to_le64(lo->stripe_unit); @@ -810,6 +821,7 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p regular file %p\n", inode, file); WARN_ON(!list_empty(&fi->rw_contexts)); + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); ceph_put_fmode(ci, fi->fmode, 1); kmem_cache_free(ceph_file_cachep, fi); @@ -847,7 +859,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; u64 off = iocb->ki_pos; u64 len = iov_iter_count(to); - u64 i_size; + u64 i_size = i_size_read(inode); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -1206,7 +1218,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, snapc, snapc ? snapc->seq : 0); if (write) { - int ret2 = invalidate_inode_pages2_range(inode->i_mapping, + int ret2; + + ceph_fscache_invalidate(inode, true); + + ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) @@ -1417,6 +1433,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) return ret; + ceph_fscache_invalidate(inode, false); ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); @@ -2101,6 +2118,7 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; filemap_invalidate_lock(inode->i_mapping); + ceph_fscache_invalidate(inode, false); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2425,6 +2443,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out_caps; /* Drop dst file cached pages */ + ceph_fscache_invalidate(dst_inode, false); ret = invalidate_inode_pages2_range(dst_inode->i_mapping, dst_off >> PAGE_SHIFT, (dst_off + len) >> PAGE_SHIFT); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e3322fcb2e8d..ef4a980a7bf3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -564,6 +564,8 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); @@ -634,6 +636,12 @@ int ceph_fill_file_size(struct inode *inode, int issued, } i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); + /* + * If we're expanding, then we should be able to just update + * the existing cookie. 
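+ * (A shrink goes through the truncate path instead, which calls
+ * ceph_fscache_resize() before truncating the pagecache.)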
+ */ + if (size > isize) + ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -666,10 +674,6 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } - - if (queue_trunc) - ceph_fscache_invalidate(inode); - return queue_trunc; } @@ -1053,6 +1057,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); + ceph_fscache_register_inode_cookie(inode); + if (fill_inline) ceph_fill_inline_data(inode, locked_page, iinfo->inline_data, iinfo->inline_len); @@ -1814,11 +1820,13 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); + ceph_fscache_update(inode); inode->i_blocks = calc_inode_blocks(size); ret = __ceph_should_report_size(ci); spin_unlock(&ci->i_ceph_lock); + return ret; } @@ -1844,6 +1852,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) u32 orig_gen; int check = 0; + ceph_fscache_invalidate(inode, false); + mutex_lock(&ci->i_truncate_mutex); if (ceph_inode_is_shutdown(inode)) { @@ -1868,7 +1878,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); + ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -1937,6 +1947,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); + ceph_fscache_resize(inode, to); truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); @@ -2184,7 +2195,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (mask) { req->r_inode = inode; ihold(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 250aad330a10..c30eefc0ac19 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3683,7 +3683,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; char *path; - int pathlen, err; + int pathlen = 0, err; u64 pathbase; u64 snap_follows; @@ -3703,7 +3703,6 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, } } else { path = NULL; - pathlen = 0; pathbase = 0; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..bea89bdb534a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -787,16 +787,10 @@ static int __init init_caches(void) if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; - error = ceph_fscache_register(); - if (error) - goto bad_fscache; - return 0; -bad_fscache: - kmem_cache_destroy(ceph_mds_request_cachep); bad_pagevec_pool: - mempool_destroy(ceph_wb_pagevec_pool); + kmem_cache_destroy(ceph_mds_request_cachep); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: @@ -828,8 +822,6 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); mempool_destroy(ceph_wb_pagevec_pool); - - ceph_fscache_unregister(); } static void __ceph_umount_begin(struct ceph_fs_client *fsc) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..d0142cc5c41b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,7 +21,6 @@ #include <linux/ceph/libceph.h> #ifdef CONFIG_CEPH_FSCACHE -#define 
FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #endif @@ -135,7 +134,7 @@ struct ceph_fs_client { #endif #ifdef CONFIG_CEPH_FSCACHE - struct fscache_cookie *fscache; + struct fscache_volume *fscache; #endif }; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 3b7e3b9e4fd2..346ae8716deb 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -188,7 +188,7 @@ config CIFS_SMB_DIRECT config CIFS_FSCACHE bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + depends on CIFS=m && FSCACHE_OLD_API || CIFS=y && FSCACHE_OLD_API=y help Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data to be cached locally on disk through the general filesystem cache diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d282caf9f037..ea00e1a91250 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -416,11 +416,17 @@ skip_rdma: from_kuid(&init_user_ns, ses->cred_uid)); spin_lock(&ses->chan_lock); + if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0)) + seq_puts(m, "\tPrimary channel: DISCONNECTED "); + if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); - for (j = 1; j < ses->chan_count; j++) + for (j = 1; j < ses->chan_count; j++) { cifs_dump_channel(m, j, &ses->chans[j]); + if (CIFS_CHAN_NEEDS_RECONNECT(ses, j)) + seq_puts(m, "\tDISCONNECTED "); + } } spin_unlock(&ses->chan_lock); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 353bd0dd7026..342717bf1dc2 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -84,9 +84,9 @@ struct key_type cifs_spnego_key_type = { /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * -cifs_get_spnego_key(struct cifs_ses *sesInfo) +cifs_get_spnego_key(struct cifs_ses *sesInfo, + struct TCP_Server_Info *server) { - struct TCP_Server_Info *server = cifs_ses_server(sesInfo); struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index e6a0451877d4..7f102ffeb675 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -29,7 +29,8 @@ struct cifs_spnego_msg { #ifdef __KERNEL__ extern struct key_type cifs_spnego_key_type; -extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo); +extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo, + struct TCP_Server_Info *server); #endif /* KERNEL */ #endif /* _CIFS_SPNEGO_H */ diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 12bde7bfda86..8f386dd9939e 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -393,26 +393,14 @@ static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg) static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state) { - int i; - switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - for (i = 0; i < swnreg->tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting) - swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(swnreg->tcon->ses); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - for (i = 0; i < swnreg->tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if 
(swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting) - swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(swnreg->tcon->ses); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -510,10 +498,10 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (tcon->ses->server->tcpStatus != CifsExiting) tcon->ses->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index d118282071b3..0912d8bbbac1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -141,9 +141,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; + spin_lock(&cifs_tcp_ses_lock); if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) + server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return rc; + } + spin_unlock(&cifs_tcp_ses_lock); if (!server->session_estab) { memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dca42aa87d30..36b2e0cb9736 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -26,6 +26,7 @@ #include <linux/random.h> #include <linux/uuid.h> #include <linux/xattr.h> +#include <uapi/linux/magic.h> #include <net/ipv6.h> #include "cifsfs.h" #include "cifspdu.h" @@ -202,7 +203,7 @@ cifs_read_super(struct super_block *sb) sb->s_time_max = ts.tv_sec; } - sb->s_magic = CIFS_MAGIC_NUMBER; + sb->s_magic = CIFS_SUPER_MAGIC; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; rc = super_setup_bdi(sb); @@ -773,7 +774,7 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb) sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); - p = s = full_path; + s = full_path; do { struct inode *dir = d_inode(dentry); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b50da1901ebd..9e5d9e192ef0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.33" +#define CIFS_VERSION "2.34" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index be74606724c7..f84978b76bb6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -24,8 +24,6 @@ #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" -#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ - #define SMB_PATH_MAX 260 #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -113,7 +111,13 @@ enum statusEnum { CifsGood, CifsExiting, CifsNeedReconnect, - CifsNeedNegotiate + CifsNeedNegotiate, + CifsInNegotiate, + CifsNeedSessSetup, + CifsInSessSetup, + CifsNeedTcon, + CifsInTcon, + CifsInFilesInvalidate }; enum securityEnum { @@ -263,13 +267,16 @@ struct smb_version_operations { /* check if we need to negotiate */ bool (*need_neg)(struct TCP_Server_Info *); /* negotiate to the server */ - int (*negotiate)(const unsigned int, struct cifs_ses *); + int (*negotiate)(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info 
*server); /* set negotiated write size */ unsigned int (*negotiate_wsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx); /* set negotiated read size */ unsigned int (*negotiate_rsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx); /* setup smb session */ int (*sess_setup)(const unsigned int, struct cifs_ses *, + struct TCP_Server_Info *server, const struct nls_table *); /* close smb session */ int (*logoff)(const unsigned int, struct cifs_ses *); @@ -414,7 +421,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *); - int (*generate_signingkey)(struct cifs_ses *); + int (*generate_signingkey)(struct cifs_ses *ses, + struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *, bool allocate_crypto); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, @@ -582,7 +590,7 @@ struct TCP_Server_Info { char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; struct smb_version_operations *ops; struct smb_version_values *vals; - /* updates to tcpStatus protected by GlobalMid_Lock */ + /* updates to tcpStatus protected by cifs_tcp_ses_lock */ enum statusEnum tcpStatus; /* what we think the status is */ char *hostname; /* hostname portion of UNC string */ struct socket *ssocket; @@ -920,7 +928,7 @@ struct cifs_ses { struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; /* updates protected by GlobalMid_Lock */ + enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -939,17 +947,13 @@ struct cifs_ses { struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ - bool need_reconnect:1; /* connection reset, uid now invalid */ bool domainAuto:1; - bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; - __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; - /* * Network interfaces available on the server this session is * connected to. @@ -969,45 +973,34 @@ struct cifs_ses { spinlock_t chan_lock; /* ========= begin: protected by chan_lock ======== */ #define CIFS_MAX_CHANNELS 16 +#define CIFS_ALL_CHANNELS_SET(ses) \ + ((1UL << (ses)->chan_count) - 1) +#define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \ + ((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses)) +#define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \ + ((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses)) +#define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \ + test_bit((index), &(ses)->chans_need_reconnect) + struct cifs_chan chans[CIFS_MAX_CHANNELS]; - struct cifs_chan *binding_chan; size_t chan_count; size_t chan_max; atomic_t chan_seq; /* round robin state */ + + /* + * chans_need_reconnect is a bitmap indicating which of the channels + * under this smb session needs to be reconnected. + * If this is not a multichannel session, only one bit is used.
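+ * Bit n corresponds to channel ses->chans[n], as encoded by the
+ * CIFS_CHAN_NEEDS_RECONNECT() macro above.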
+ * + * We will ask for sess and tcon reconnection only if all the + * channels are marked as needing reconnection. This lets the + * sessions on top continue to live as long as at least one of + * the channels below is still active. + */ + unsigned long chans_need_reconnect; /* ========= end: protected by chan_lock ======== */ }; -/* - * When binding a new channel, we need to access the channel which isn't fully - * established yet. - */ - -static inline -struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses) -{ - if (ses->binding) - return ses->binding_chan; - else - return NULL; -} - -/* - * Returns the server pointer of the session. When binding a new - * channel this returns the last channel which isn't fully established - * yet. - * - * This function should be use for negprot/sess.setup codepaths. For - * the other requests see cifs_pick_channel(). - */ -static inline -struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses) -{ - if (ses->binding) - return ses->binding_chan->server; - else - return ses->server; -} - static inline bool cap_unix(struct cifs_ses *ses) { diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index d2ff438fd31f..68b9a436af4b 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2560,7 +2560,7 @@ typedef struct { __le32 EaSize; /* length of the xattrs */ __u8 ShortNameLength; __u8 Reserved; - __u8 ShortName[12]; + __u8 ShortName[24]; char FileName[1]; } __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f3073a62ce57..e0dc147e69a8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -131,7 +131,8 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf , struct smb_hdr *out_buf, int *bytes_returned); -extern int cifs_reconnect(struct TCP_Server_Info *server); +extern int cifs_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session); extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); @@ -164,6 +165,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct, extern enum securityEnum select_sectype(struct TCP_Server_Info *server, enum securityEnum requested); extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); extern struct timespec64 cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec64); @@ -293,11 +295,15 @@ extern int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc); extern int cifs_negotiate_protocol(const unsigned int xid, - struct cifs_ses *ses); + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct nls_table *nls_info); extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required); -extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); +extern int CIFSSMBNegotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, @@ -504,8 +510,10 @@ extern int cifs_verify_signature(struct smb_rqst *rqst, extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void
cifs_crypto_secmech_release(struct TCP_Server_Info *server); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb30signingkey(struct cifs_ses *); -extern int generate_smb311signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server); +extern int generate_smb311signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, @@ -599,6 +607,20 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) bool is_server_using_iface(struct TCP_Server_Info *server, struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); +void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); + +unsigned int +cifs_ses_get_chan_index(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_set_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_clear_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +bool +cifs_chan_needs_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 243d17696f06..071e2f21a7db 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -73,6 +73,16 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp; struct list_head *tmp1; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + tcon->tidStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + tcon->tidStatus = CifsInFilesInvalidate; + spin_unlock(&cifs_tcp_ses_lock); + /* list all files open on tree connection and mark them invalid */ spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { @@ -89,6 +99,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInFilesInvalidate) + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. @@ -120,15 +135,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * only tree disconnect, open, and write, (and ulogoff which does not * have tcon) are allowed as we start force umount */ + spin_lock(&cifs_tcp_ses_lock); if (tcon->tidStatus == CifsExiting) { if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb_command); return -ENODEV; } } + spin_unlock(&cifs_tcp_ses_lock); retries = server->nr_targets; @@ -148,8 +166,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } /* are we still trying to reconnect? 
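tcpStatus moves from CifsNeedReconnect to CifsNeedNegotiate once the demux thread has re-established the socket.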
*/ - if (server->tcpStatus != CifsNeedReconnect) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); break; + } + spin_unlock(&cifs_tcp_ses_lock); if (retries && --retries) continue; @@ -166,31 +188,49 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) retries = server->nr_targets; } - if (!ses->need_reconnect && !tcon->need_reconnect) + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); nls_codepage = load_nls_default(); /* - * need to prevent multiple threads trying to simultaneously - * reconnect the same SMB session - */ - mutex_lock(&ses->session_mutex); - - /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EHOSTDOWN; - mutex_unlock(&ses->session_mutex); goto out; } + spin_unlock(&cifs_tcp_ses_lock); - rc = cifs_negotiate_protocol(0, ses); - if (rc == 0 && ses->need_reconnect) - rc = cifs_setup_session(0, ses, nls_codepage); + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + + rc = -EHOSTDOWN; + goto out; + } + spin_unlock(&ses->chan_lock); + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) + rc = cifs_setup_session(0, ses, server, nls_codepage); /* do we need to reconnect tcon? */ if (rc || !tcon->need_reconnect) { @@ -198,6 +238,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) goto out; } +skip_sess_setup: cifs_mark_open_files_invalid(tcon); rc = cifs_tree_connect(0, tcon, nls_codepage); mutex_unlock(&ses->session_mutex); @@ -337,8 +378,13 @@ static int smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon, void **request_buf, void **response_buf) { - if (tcon->ses->need_reconnect || tcon->need_reconnect) + spin_lock(&tcon->ses->chan_lock); + if (cifs_chan_needs_reconnect(tcon->ses, tcon->ses->server) || + tcon->need_reconnect) { + spin_unlock(&tcon->ses->chan_lock); return -EHOSTDOWN; + } + spin_unlock(&tcon->ses->chan_lock); return __smb_init(smb_command, wct, tcon, request_buf, response_buf); } @@ -476,14 +522,15 @@ should_set_ext_sec_flag(enum securityEnum sectype) } int -CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) +CIFSSMBNegotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { NEGOTIATE_REQ *pSMB; NEGOTIATE_RSP *pSMBr; int rc = 0; int bytes_returned; int i; - struct TCP_Server_Info *server = ses->server; u16 count; if (!server) { @@ -600,8 +647,12 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) * the tcon is no longer on the list, so no need to take lock before * checking this. 
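* The session's channel bitmap is a different matter: it is
* protected by chan_lock, which the check below takes.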
*/ - if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) - return 0; + spin_lock(&tcon->ses->chan_lock); + if ((tcon->need_reconnect) || CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses)) { + spin_unlock(&tcon->ses->chan_lock); + return -EIO; + } + spin_unlock(&tcon->ses->chan_lock); rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, (void **)&smb_buffer); @@ -696,9 +747,14 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) return -EIO; mutex_lock(&ses->session_mutex); - if (ses->need_reconnect) + spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + spin_unlock(&ses->chan_lock); goto session_already_dead; /* no need to send SMBlogoff if uid already closed due to reconnect */ + } + spin_unlock(&ses->chan_lock); + rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); if (rc) { mutex_unlock(&ses->session_mutex); @@ -1401,7 +1457,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 82577a7a5bb1..0f36deff790e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -166,14 +166,17 @@ static void cifs_resolve_server(struct work_struct *work) * Mark all sessions and tcons for reconnect. * * @server needs to be previously set to CifsNeedReconnect. + * */ -static void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server) +static void +cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct mid_q_entry *mid, *nmid; struct list_head retry_list; - struct TCP_Server_Info *pserver; server->maxBuf = 0; server->max_read = 0; @@ -191,16 +194,37 @@ static void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - ses->need_reconnect = true; - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) + spin_lock(&ses->chan_lock); + if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) + goto next_session; + + cifs_chan_set_need_reconnect(ses, server); + + /* If all channels need reconnect, then tcon needs reconnect */ + if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) + goto next_session; + + ses->status = CifsNeedReconnect; + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; + tcon->tidStatus = CifsNeedReconnect; + } if (ses->tcon_ipc) ses->tcon_ipc->need_reconnect = true; + +next_session: + spin_unlock(&ses->chan_lock); } spin_unlock(&cifs_tcp_ses_lock); + /* + * before reconnecting the tcp session, mark the smb session (uid) + * and the tid bad so they are not used until reconnected + */ + cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect and tearing down socket\n", + __func__); /* do not want to be sending data on a socket we are freeing */ - cifs_dbg(FYI, "%s: tearing down socket\n", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state, @@ -248,16 +272,16 @@ static void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) { - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->nr_targets = num_targets; if 
(server->tcpStatus == CifsExiting) { /* the demux thread will exit normally next time through the loop */ - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); wake_up(&server->response_q); return false; } server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); return true; } @@ -268,15 +292,21 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num * mark all smb sessions as reconnecting for tcp session * reconnect tcp session * wake up waiters on reconnection? - (not needed currently) + * + * if mark_smb_session is passed as true, unconditionally mark + * the smb session (and tcon) for reconnect as well. This value + * doesn't really matter for non-multichannel scenario. + * */ -static int __cifs_reconnect(struct TCP_Server_Info *server) +static int __cifs_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session) { int rc = 0; if (!cifs_tcp_ses_needs_reconnect(server, 1)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); do { try_to_freeze(); @@ -299,10 +329,10 @@ static int __cifs_reconnect(struct TCP_Server_Info *server) } else { atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); } @@ -371,7 +401,9 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ return rc; } -static int reconnect_dfs_server(struct TCP_Server_Info *server) +static int +reconnect_dfs_server(struct TCP_Server_Info *server, + bool mark_smb_session) { int rc = 0; const char *refpath = server->current_fullpath + 1; @@ -395,7 +427,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) if (!cifs_tcp_ses_needs_reconnect(server, num_targets)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); do { try_to_freeze(); @@ -416,10 +448,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) */ atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); @@ -430,29 +462,32 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + spin_unlock(&cifs_tcp_ses_lock); + wake_up(&server->response_q); return rc; } -int cifs_reconnect(struct TCP_Server_Info *server) +int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { spin_unlock(&cifs_tcp_ses_lock); - return __cifs_reconnect(server); + return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); - return 
reconnect_dfs_server(server); + return reconnect_dfs_server(server, mark_smb_session); } #else -int cifs_reconnect(struct TCP_Server_Info *server) +int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { - return __cifs_reconnect(server); + return __cifs_reconnect(server, mark_smb_session); } #endif @@ -534,15 +569,18 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ + spin_lock(&cifs_tcp_ses_lock); if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { + spin_unlock(&cifs_tcp_ses_lock); cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); - cifs_reconnect(server); + cifs_reconnect(server, false); return true; } + spin_unlock(&cifs_tcp_ses_lock); return false; } @@ -576,7 +614,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) /* reconnect if no credits and no requests in flight */ if (zero_credits(server)) { - cifs_reconnect(server); + cifs_reconnect(server, false); return -ECONNABORTED; } @@ -587,13 +625,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) else length = sock_recvmsg(server->ssocket, smb_msg, 0); - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ESHUTDOWN; + } if (server->tcpStatus == CifsNeedReconnect) { - cifs_reconnect(server); + spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(server, false); return -ECONNABORTED; } + spin_unlock(&cifs_tcp_ses_lock); if (length == -ERESTARTSYS || length == -EAGAIN || @@ -610,7 +653,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (length <= 0) { cifs_dbg(FYI, "Received no data or error: %d\n", length); - cifs_reconnect(server); + cifs_reconnect(server, false); return -ECONNABORTED; } } @@ -689,11 +732,11 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) * initialize frame). 
*/ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); - cifs_reconnect(server); + cifs_reconnect(server, true); break; default: cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); - cifs_reconnect(server); + cifs_reconnect(server, true); } return false; @@ -771,9 +814,9 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); cancel_delayed_work_sync(&server->resolve); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->tcpStatus = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); wake_up_all(&server->response_q); /* check if we have blocked requests that need to free */ @@ -866,7 +909,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - server->vals->header_preamble_size) { cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } @@ -913,7 +956,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } @@ -1017,7 +1060,7 @@ next_pdu: server->vals->header_preamble_size) { cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n", server->pdu_size); - cifs_reconnect(server); + cifs_reconnect(server, true); continue; } @@ -1069,7 +1112,7 @@ next_pdu: server->ops->is_status_io_timeout(buf)) { num_io_timeout++; if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { - cifs_reconnect(server); + cifs_reconnect(server, false); num_io_timeout = 0; continue; } @@ -1139,7 +1182,7 @@ next_pdu: } memalloc_noreclaim_restore(noreclaim_flag); - module_put_and_exit(0); + module_put_and_kthread_exit(0); } /* @@ -1271,10 +1314,8 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * { struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; - if (ctx->nosharesock) { - server->nosharesock = true; + if (ctx->nosharesock) return 0; - } /* this server does not share socket */ if (server->nosharesock) @@ -1392,9 +1433,9 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) else cancel_delayed_work_sync(&server->reconnect); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->tcpStatus = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_crypto_secmech_release(server); @@ -1438,6 +1479,9 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, goto out_err; } + if (ctx->nosharesock) + tcp_ses->nosharesock = true; + tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); @@ -1452,8 +1496,10 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->max_in_flight = 0; tcp_ses->credits = 1; if (primary_server) { + spin_lock(&cifs_tcp_ses_lock); ++primary_server->srv_count; tcp_ses->primary_server = primary_server; + spin_unlock(&cifs_tcp_ses_lock); } init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); @@ -1542,7 +1588,9 @@ smbd_connected: * to the struct since the kernel thread not created yet * no need to spinlock this update of tcpStatus */ + spin_lock(&cifs_tcp_ses_lock); tcp_ses->tcpStatus = CifsNeedNegotiate; + spin_unlock(&cifs_tcp_ses_lock); if ((ctx->max_credits < 20) || (ctx->max_credits > 60000)) tcp_ses->max_credits = 
SMB2_MAX_CREDITS_AVAILABLE; @@ -1559,6 +1607,10 @@ smbd_connected: /* fscache server cookies are based on primary channel only */ if (!CIFS_SERVER_IS_CHAN(tcp_ses)) cifs_fscache_get_client_cookie(tcp_ses); +#ifdef CONFIG_CIFS_FSCACHE + else + tcp_ses->fscache = tcp_ses->primary_server->fscache; +#endif /* CONFIG_CIFS_FSCACHE */ /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); @@ -1755,15 +1807,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); return; } - spin_unlock(&cifs_tcp_ses_lock); /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); - spin_lock(&GlobalMid_Lock); if (ses->status == CifsGood) ses->status = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_free_ipc(ses); @@ -1980,11 +2030,13 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", ses->status); - mutex_lock(&ses->session_mutex); - if (ses->need_reconnect) { + spin_lock(&ses->chan_lock); + if (cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "Session needs reconnect\n"); - rc = cifs_negotiate_protocol(xid, ses); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses, server); if (rc) { mutex_unlock(&ses->session_mutex); /* problem -- put our ses reference */ @@ -1993,7 +2045,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) return ERR_PTR(rc); } - rc = cifs_setup_session(xid, ses, + rc = cifs_setup_session(xid, ses, server, ctx->local_nls); if (rc) { mutex_unlock(&ses->session_mutex); @@ -2002,8 +2054,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) free_xid(xid); return ERR_PTR(rc); } + mutex_unlock(&ses->session_mutex); + + spin_lock(&ses->chan_lock); } - mutex_unlock(&ses->session_mutex); + spin_unlock(&ses->chan_lock); /* existing SMB ses has a server reference already */ cifs_put_tcp_session(server, 0); @@ -2053,28 +2108,33 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses->sectype = ctx->sectype; ses->sign = ctx->sign; - mutex_lock(&ses->session_mutex); /* add server as first channel */ spin_lock(&ses->chan_lock); ses->chans[0].server = server; ses->chan_count = 1; ses->chan_max = ctx->multichannel ? ctx->max_channels:1; + ses->chans_need_reconnect = 1; spin_unlock(&ses->chan_lock); - rc = cifs_negotiate_protocol(xid, ses); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses, server); if (!rc) - rc = cifs_setup_session(xid, ses, ctx->local_nls); + rc = cifs_setup_session(xid, ses, server, ctx->local_nls); + mutex_unlock(&ses->session_mutex); /* each channel uses a different signing key */ memcpy(ses->chans[0].signkey, ses->smb3signingkey, sizeof(ses->smb3signingkey)); - mutex_unlock(&ses->session_mutex); if (rc) goto get_ses_fail; - /* success, put it on the list and add it as first channel */ + /* + * success, put it on the list and add it as first channel + * note: the session becomes active soon after this. So you'll + * need to lock before changing something in the session. 
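+ * (tcpStatus and the session lists are protected by cifs_tcp_ses_lock,
+ * per-channel state by chan_lock.)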
+ */ spin_lock(&cifs_tcp_ses_lock); list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2154,6 +2214,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) /* tc_count can never go negative */ WARN_ON(tcon->tc_count < 0); + list_del_init(&tcon->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + if (tcon->use_witness) { int rc; @@ -2164,9 +2227,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) } } - list_del_init(&tcon->tcon_list); - spin_unlock(&cifs_tcp_ses_lock); - xid = get_xid(); if (ses->server->ops->tree_disconnect) ses->server->ops->tree_disconnect(xid, tcon); @@ -2283,10 +2343,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) } } - /* - * BB Do we need to wrap session_mutex around this TCon call and Unix - * SetFS as we do on SessSetup and reconnect? - */ xid = get_xid(); rc = ses->server->ops->tree_connect(xid, ses, ctx->UNC, tcon, ctx->local_nls); @@ -3022,12 +3078,15 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * for just this mount. */ reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx); + spin_lock(&cifs_tcp_ses_lock); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EACCES; goto out; } + spin_unlock(&cifs_tcp_ses_lock); } else tcon->unix_ext = 0; /* server does not support them */ @@ -3043,12 +3102,6 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) cifs_dbg(VFS, "read only mount of RW share\n"); /* no need to log a RW mount of a typical RW share */ } - /* - * The cookie is initialized from volume info returned above. - * Inside cifs_fscache_get_super_cookie it checks - * that we do not get super cookie twice. - */ - cifs_fscache_get_super_cookie(tcon); } /* @@ -3063,6 +3116,13 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); + /* + * The cookie is initialized from volume info returned above. + * Inside cifs_fscache_get_super_cookie it checks + * that we do not get super cookie twice. 
+ */ + cifs_fscache_get_super_cookie(tcon); + out: mnt_ctx->server = server; mnt_ctx->ses = ses; @@ -3423,6 +3483,7 @@ static int connect_dfs_root(struct mount_ctx *mnt_ctx, struct dfs_cache_tgt_list */ mount_put_conns(mnt_ctx); mount_get_dfs_conns(mnt_ctx); + set_root_ses(mnt_ctx); full_path = build_unc_path_to_root(ctx, cifs_sb, true); if (IS_ERR(full_path)) @@ -3700,7 +3761,9 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { bool is_unicode; + spin_lock(&cifs_tcp_ses_lock); tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); @@ -3790,26 +3853,32 @@ cifs_umount(struct cifs_sb_info *cifs_sb) } int -cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) +cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; /* only send once per connect */ - if (!server->ops->need_neg(server)) + spin_lock(&cifs_tcp_ses_lock); + if (!server->ops->need_neg(server) || + server->tcpStatus != CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return 0; + } + server->tcpStatus = CifsInNegotiate; + spin_unlock(&cifs_tcp_ses_lock); - rc = server->ops->negotiate(xid, ses); + rc = server->ops->negotiate(xid, ses, server); if (rc == 0) { - spin_lock(&GlobalMid_Lock); - if (server->tcpStatus == CifsNeedNegotiate) - server->tcpStatus = CifsGood; + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInNegotiate) + server->tcpStatus = CifsNeedSessSetup; else rc = -EHOSTDOWN; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -3817,12 +3886,26 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *server = cifs_ses_server(ses); + bool is_binding = false; - if (!ses->binding) { + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedSessSetup) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + ses->status = CifsInSessSetup; + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) ses->capabilities &= (~server->vals->cap_unix); @@ -3840,7 +3923,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, server->sec_mode, server->capabilities, server->timeAdj); if (server->ops->sess_setup) - rc = server->ops->sess_setup(xid, ses, nls_info); + rc = server->ops->sess_setup(xid, ses, server, nls_info); if (rc) cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); @@ -4111,18 +4194,6 @@ cifs_prune_tlinks(struct work_struct *work) } #ifdef CONFIG_CIFS_DFS_UPCALL -static void mark_tcon_tcp_ses_for_reconnect(struct cifs_tcon *tcon) -{ - int i; - - for (i = 0; i < tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (tcon->ses->chans[i].server->tcpStatus != CifsExiting) - tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } -} - /* Update dfs referral path of superblock */ static int update_server_fullpath(struct TCP_Server_Info *server, struct 
cifs_sb_info *cifs_sb, const char *target) @@ -4200,6 +4271,17 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct dfs_cache_tgt_iterator *tit; bool target_match; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); tit = dfs_cache_get_tgt_iterator(tl); @@ -4299,7 +4381,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - mark_tcon_tcp_ses_for_reconnect(tcon); + cifs_ses_mark_for_reconnect(tcon->ses); } dfs_cache_free_tgts(tl); @@ -4358,6 +4440,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru { const struct smb_version_operations *ops = tcon->ses->server->ops; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + return ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); } #endif diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 5c1259d2eeac..e9b0fa2a9614 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,12 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - for (i = 0; i < tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (tcon->ses->chans[i].server->tcpStatus != CifsExiting) - tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(tcon->ses); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 6a179ae753c1..e3ed25dc6f3f 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -435,6 +435,42 @@ out: } /* + * Remove duplicate path delimiters. Windows is supposed to do that + * but there are some bugs that prevent rename from working if there are + * multiple delimiters. + * + * Returns a sanitized duplicate of @path. The caller is responsible for + * cleaning up the original. + */ +#define IS_DELIM(c) ((c) == '/' || (c) == '\\') +static char *sanitize_path(char *path) +{ + char *cursor1 = path, *cursor2 = path; + + /* skip all prepended delimiters */ + while (IS_DELIM(*cursor1)) + cursor1++; + + /* copy the first letter */ + *cursor2 = *cursor1; + + /* copy the remainder... */ + while (*(cursor1++)) { + /* ... skipping all duplicated delimiters */ + if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2)) + continue; + *(++cursor2) = *cursor1; + } + + /* if the last character is a delimiter, skip it */ + if (IS_DELIM(*(cursor2 - 1))) + cursor2--; + + *(cursor2) = '\0'; + return kstrdup(path, GFP_KERNEL); +} + +/* * Parse a devname into substrings and populate the ctx->UNC and ctx->prepath * fields with the result. Returns 0 on success and an error otherwise * (e.g. 
ENOMEM or EINVAL) @@ -493,7 +529,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (!*pos) return 0; - ctx->prepath = kstrdup(pos, GFP_KERNEL); + ctx->prepath = sanitize_path(pos); if (!ctx->prepath) return -ENOMEM; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 7e409a38a2d7..003c5f1f4dfb 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -16,14 +16,7 @@ * Key layout of CIFS server cache index object */ struct cifs_server_key { - struct { - uint16_t family; /* address family */ - __be16 port; /* IP port */ - } hdr; - union { - struct in_addr ipv4_addr; - struct in6_addr ipv6_addr; - }; + __u64 conn_id; } __packed; /* @@ -31,42 +24,23 @@ struct cifs_server_key { */ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { - const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; - const struct sockaddr_in *addr = (struct sockaddr_in *) sa; - const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; struct cifs_server_key key; - uint16_t key_len = sizeof(key.hdr); - - memset(&key, 0, sizeof(key)); /* - * Should not be a problem as sin_family/sin6_family overlays - * sa_family field + * Check if cookie was already initialized so don't reinitialize it. + * In the future, as we integrate with newer fscache features, + * we may want to instead add a check if cookie has changed */ - key.hdr.family = sa->sa_family; - switch (sa->sa_family) { - case AF_INET: - key.hdr.port = addr->sin_port; - key.ipv4_addr = addr->sin_addr; - key_len += sizeof(key.ipv4_addr); - break; - - case AF_INET6: - key.hdr.port = addr6->sin6_port; - key.ipv6_addr = addr6->sin6_addr; - key_len += sizeof(key.ipv6_addr); - break; - - default: - cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); - server->fscache = NULL; + if (server->fscache) return; - } + + memset(&key, 0, sizeof(key)); + key.conn_id = server->conn_id; server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, - &key, key_len, + &key, sizeof(key), NULL, 0, server, 0, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", @@ -92,7 +66,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) * In the future, as we integrate with newer fscache features, * we may want to instead add a check if cookie has changed */ - if (tcon->fscache == NULL) + if (tcon->fscache) return; sharename = extract_sharename(tcon->treeName); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 82848412ad85..279622e4eb1c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1356,11 +1356,6 @@ iget_no_retry: goto out; } -#ifdef CONFIG_CIFS_FSCACHE - /* populate tcon->resource_id */ - tcon->resource_id = CIFS_I(inode)->uniqueid; -#endif - if (rc && tcon->pipe) { cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); @@ -1375,7 +1370,6 @@ iget_no_retry: iget_failed(inode); inode = ERR_PTR(rc); } - out: kfree(path); free_xid(xid); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index fa9fbd6a819c..43b16b6d108c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,10 +896,10 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (mid->server->tcpStatus != CifsExiting) mid->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); } } diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 
fe707f45da89..6d242af536cb 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -121,7 +121,9 @@ typedef struct _AUTHENTICATE_MESSAGE { int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 2c10b186ed6e..d12490e12be5 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -65,6 +65,53 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) return false; } +unsigned int +cifs_ses_get_chan_index(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int i; + + for (i = 0; i < ses->chan_count; i++) { + if (ses->chans[i].server == server) + return i; + } + + /* If we didn't find the channel, it is likely a bug */ + WARN_ON(1); + return 0; +} + +void +cifs_chan_set_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + set_bit(chan_index, &ses->chans_need_reconnect); + cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n", + chan_index, ses->chans_need_reconnect); +} + +void +cifs_chan_clear_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + clear_bit(chan_index, &ses->chans_need_reconnect); + cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n", + chan_index, ses->chans_need_reconnect); +} + +bool +cifs_chan_needs_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { @@ -95,9 +142,9 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) } if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); ses->chan_max = 1; spin_unlock(&ses->chan_lock); + cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); return 0; } spin_unlock(&ses->chan_lock); @@ -222,6 +269,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* Auth */ ctx.domainauto = ses->domainAuto; ctx.domainname = ses->domainName; + ctx.server_hostname = ses->server->hostname; ctx.username = ses->user_name; ctx.password = ses->password; ctx.sectype = ses->sectype; @@ -260,9 +308,8 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, chan_server = cifs_get_tcp_session(&ctx, ses->server); - mutex_lock(&ses->session_mutex); spin_lock(&ses->chan_lock); - chan = ses->binding_chan = &ses->chans[ses->chan_count]; + chan = &ses->chans[ses->chan_count]; chan->server = chan_server; if (IS_ERR(chan->server)) { rc = PTR_ERR(chan->server); @@ -270,8 +317,15 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + ses->chan_count++; + atomic_set(&ses->chan_seq, 0); + + /* Mark this channel as needing connect/setup */ + cifs_chan_set_need_reconnect(ses, chan->server); + spin_unlock(&ses->chan_lock); + 
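The helpers above replace the old ses->binding flag with a bitmap: bit i of ses->chans_need_reconnect records whether channel i still needs negotiate and session setup, and cifs_ses_add_channel() sets that bit before attempting setup. A minimal userspace sketch of the same pattern follows; the demo_* names are hypothetical, and where this uses plain bit arithmetic the kernel uses set_bit()/clear_bit() under ses->chan_lock:

#include <stdbool.h>

#define DEMO_MAX_CHANS 16

struct demo_ses {
	unsigned long chans_need_reconnect;	/* one bit per channel */
	unsigned int chan_count;		/* channels allocated so far */
};

/* mark channel i as needing negotiate + session setup */
static void demo_chan_set_need_reconnect(struct demo_ses *s, unsigned int i)
{
	s->chans_need_reconnect |= 1UL << i;
}

/* channel i finished session setup */
static void demo_chan_clear_need_reconnect(struct demo_ses *s, unsigned int i)
{
	s->chans_need_reconnect &= ~(1UL << i);
}

/* every allocated channel is down => full session setup, not a binding */
static bool demo_all_chans_need_reconnect(const struct demo_ses *s)
{
	unsigned long mask = (1UL << s->chan_count) - 1;

	return (s->chans_need_reconnect & mask) == mask;
}

The last helper mirrors how CIFS_ALL_CHANS_NEED_RECONNECT() is used throughout this series: a session setup request is treated as a channel binding exactly when at least one other channel is still established.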
mutex_lock(&ses->session_mutex); /* * We need to allocate the server crypto now as we will need * to sign packets before we generate the channel signing key @@ -280,37 +334,29 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, rc = smb311_crypto_shash_allocate(chan->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + mutex_unlock(&ses->session_mutex); goto out; } - ses->binding = true; - rc = cifs_negotiate_protocol(xid, ses); - if (rc) - goto out; - - rc = cifs_setup_session(xid, ses, cifs_sb->local_nls); - if (rc) - goto out; - - /* success, put it on the list - * XXX: sharing ses between 2 tcp servers is not possible, the - * way "internal" linked lists works in linux makes element - * only able to belong to one list - * - * the binding session is already established so the rest of - * the code should be able to look it up, no need to add the - * ses to the new server. - */ + rc = cifs_negotiate_protocol(xid, ses, chan->server); + if (!rc) + rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls); - spin_lock(&ses->chan_lock); - ses->chan_count++; - atomic_set(&ses->chan_seq, 0); - spin_unlock(&ses->chan_lock); + mutex_unlock(&ses->session_mutex); out: - ses->binding = false; - ses->binding_chan = NULL; - mutex_unlock(&ses->session_mutex); + if (rc && chan->server) { + spin_lock(&ses->chan_lock); + /* we rely on all bits beyond chan_count to be clear */ + cifs_chan_clear_need_reconnect(ses, chan->server); + ses->chan_count--; + /* + * chan_count should never reach 0 as at least the primary + * channel is always allocated + */ + WARN_ON(ses->chan_count < 1); + spin_unlock(&ses->chan_lock); + } if (rc && chan->server) cifs_put_tcp_session(chan->server, 0); @@ -318,7 +364,22 @@ out: return rc; } -static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) +/* Mark all session channels for reconnect */ +void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) +{ + int i; + + for (i = 0; i < ses->chan_count; i++) { + spin_lock(&cifs_tcp_ses_lock); + if (ses->chans[i].server->tcpStatus != CifsExiting) + ses->chans[i].server->tcpStatus = CifsNeedReconnect; + spin_unlock(&cifs_tcp_ses_lock); + } +} + +static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, + struct TCP_Server_Info *server, + SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; @@ -331,7 +392,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq); pSMB->req.VcNumber = cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -342,7 +403,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - if (ses->server->sign) + if (server->sign) pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_UNICODE) { @@ -576,8 +637,8 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, { unsigned int tioffset; /* challenge message target info area */ unsigned int tilen; /* challenge message target info area length */ - CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; + __u32 server_flags; if (blob_len < sizeof(CHALLENGE_MESSAGE)) { cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len); @@ -595,12 +656,37 @@ int 
decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return -EINVAL; } + server_flags = le32_to_cpu(pblob->NegotiateFlags); + cifs_dbg(FYI, "%s: negotiate=0x%08x challenge=0x%08x\n", __func__, + ses->ntlmssp->client_flags, server_flags); + + if ((ses->ntlmssp->client_flags & (NTLMSSP_NEGOTIATE_SEAL | NTLMSSP_NEGOTIATE_SIGN)) && + (!(server_flags & NTLMSSP_NEGOTIATE_56) && !(server_flags & NTLMSSP_NEGOTIATE_128))) { + cifs_dbg(VFS, "%s: requested signing/encryption but server did not return either 56-bit or 128-bit session key size\n", + __func__); + return -EINVAL; + } + if (!(server_flags & NTLMSSP_NEGOTIATE_NTLM) && !(server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) { + cifs_dbg(VFS, "%s: server does not seem to support either NTLMv1 or NTLMv2\n", __func__); + return -EINVAL; + } + if (ses->server->sign && !(server_flags & NTLMSSP_NEGOTIATE_SIGN)) { + cifs_dbg(VFS, "%s: forced packet signing but server does not seem to support it\n", + __func__); + return -EOPNOTSUPP; + } + if ((ses->ntlmssp->client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && + !(server_flags & NTLMSSP_NEGOTIATE_KEY_XCH)) + pr_warn_once("%s: authentication has been weakened as server does not support key exchange\n", + __func__); + + ses->ntlmssp->server_flags = server_flags; + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); - /* BB we could decode pblob->NegotiateFlags; some may be useful */ /* In particular we can examine sign flags */ /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ - ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); + tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); if (tioffset > blob_len || tioffset + tilen > blob_len) { @@ -680,10 +766,10 @@ static inline void cifs_security_buffer_from_str(SECURITY_BUFFER *pbuf, int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); NEGOTIATE_MESSAGE *sec_blob; __u32 flags; unsigned char *tmp; @@ -707,13 +793,13 @@ int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | - NTLMSSP_NEGOTIATE_SEAL; - if (server->sign) - flags |= NTLMSSP_NEGOTIATE_SIGN; + NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL | + NTLMSSP_NEGOTIATE_SIGN; if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; tmp = *pbuffer + sizeof(NEGOTIATE_MESSAGE); + ses->ntlmssp->client_flags = flags; sec_blob->NegotiateFlags = cpu_to_le32(flags); /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */ @@ -737,6 +823,7 @@ setup_ntlm_neg_ret: int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc; @@ -765,15 +852,8 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; - flags = NTLMSSP_NEGOTIATE_56 | - NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | - NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | - NTLMSSP_NEGOTIATE_SEAL | 
NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; - if (ses->server->sign) - flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) - flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); @@ -820,9 +900,9 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, *pbuffer, &tmp, nls_cp); - if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || - (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) - && !calc_seckey(ses)) { + if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && + (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) && + !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); @@ -880,6 +960,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) struct sess_data { unsigned int xid; struct cifs_ses *ses; + struct TCP_Server_Info *server; struct nls_table *nls_cp; void (*func)(struct sess_data *); int result; @@ -946,30 +1027,36 @@ static int sess_establish_session(struct sess_data *sess_data) { struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = + mutex_lock(&server->srv_mutex); + if (!server->session_estab) { + if (server->sign) { + server->session_key.response = kmemdup(ses->auth_key.response, ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - mutex_unlock(&ses->server->srv_mutex); + if (!server->session_key.response) { + mutex_unlock(&server->srv_mutex); return -ENOMEM; } - ses->server->session_key.len = + server->session_key.len = ses->auth_key.len; } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; + server->sequence_number = 0x2; + server->session_estab = true; } - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); + + /* Even if one channel is active, session is in good state */ + spin_lock(&cifs_tcp_ses_lock); + server->tcpStatus = CifsGood; ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -1004,6 +1091,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) SESSION_SETUP_ANDX *pSMB; char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; __u16 bytes_remaining; @@ -1015,7 +1103,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); @@ -1113,6 +1201,7 @@ sess_auth_kerberos(struct sess_data *sess_data) SESSION_SETUP_ANDX *pSMB; char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; __u16 
bytes_remaining; struct key *spnego_key = NULL; @@ -1127,9 +1216,9 @@ sess_auth_kerberos(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); - spnego_key = cifs_get_spnego_key(ses); + spnego_key = cifs_get_spnego_key(ses, server); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); spnego_key = NULL; @@ -1253,12 +1342,13 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) { SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; char *bcc_ptr; pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); return -ENOSYS; @@ -1292,6 +1382,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) struct smb_hdr *smb_buf; SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u16 bytes_remaining; char *bcc_ptr; unsigned char *ntlmsspblob = NULL; @@ -1319,7 +1410,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) /* Build security blob before we assemble the request */ rc = build_ntlmssp_negotiate_blob(&ntlmsspblob, - &blob_len, ses, + &blob_len, ses, server, sess_data->nls_cp); if (rc) goto out; @@ -1394,6 +1485,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) struct smb_hdr *smb_buf; SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u16 bytes_remaining; char *bcc_ptr; unsigned char *ntlmsspblob = NULL; @@ -1410,7 +1502,8 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; smb_buf = (struct smb_hdr *)pSMB; rc = build_ntlmssp_auth_blob(&ntlmsspblob, - &blob_len, ses, sess_data->nls_cp); + &blob_len, ses, server, + sess_data->nls_cp); if (rc) goto out_free_ntlmsspblob; sess_data->iov[1].iov_len = blob_len; @@ -1494,11 +1587,13 @@ out: sess_data->result = rc; } -static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +static int select_sec(struct sess_data *sess_data) { int type; + struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - type = cifs_select_sectype(ses->server, ses->sectype); + type = cifs_select_sectype(server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); @@ -1529,7 +1624,8 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) } int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) + struct TCP_Server_Info *server, + const struct nls_table *nls_cp) { int rc = 0; struct sess_data *sess_data; @@ -1543,15 +1639,16 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, if (!sess_data) return -ENOMEM; - rc = select_sec(ses, sess_data); - if (rc) - goto out; - sess_data->xid = xid; sess_data->ses = ses; + sess_data->server = server; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; + rc = select_sec(sess_data); + if (rc) + goto out; + while (sess_data->func) sess_data->func(sess_data); diff --git 
a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3b83839fc2c2..8272c91e15ef 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -7,6 +7,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <uapi/linux/magic.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" @@ -163,7 +164,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) { __u64 mid = 0; __u16 last_mid, cur_mid; - bool collision; + bool collision, reconnect = false; spin_lock(&GlobalMid_Lock); @@ -215,7 +216,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) * an eventual reconnect to clean out the pending_mid_q. */ if (num_mids > 32768) - server->tcpStatus = CifsNeedReconnect; + reconnect = true; if (!collision) { mid = (__u64)cur_mid; @@ -225,6 +226,13 @@ cifs_get_next_mid(struct TCP_Server_Info *server) cur_mid++; } spin_unlock(&GlobalMid_Lock); + + if (reconnect) { + spin_lock(&cifs_tcp_ses_lock); + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&cifs_tcp_ses_lock); + } + return mid; } @@ -414,14 +422,16 @@ cifs_need_neg(struct TCP_Server_Info *server) } static int -cifs_negotiate(const unsigned int xid, struct cifs_ses *ses) +cifs_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc; - rc = CIFSSMBNegotiate(xid, ses); + rc = CIFSSMBNegotiate(xid, ses, server); if (rc == -EAGAIN) { /* retry only once on 1st time connection */ - set_credits(ses->server, 1); - rc = CIFSSMBNegotiate(xid, ses); + set_credits(server, 1); + rc = CIFSSMBNegotiate(xid, ses, server); if (rc == -EAGAIN) rc = -EHOSTDOWN; } @@ -878,7 +888,7 @@ cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon, { int rc = -EOPNOTSUPP; - buf->f_type = CIFS_MAGIC_NUMBER; + buf->f_type = CIFS_SUPER_MAGIC; /* * We could add a second check for a QFS Unix capability bit diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index ca692b2283cd..4125fd113cfb 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -13,8 +13,6 @@ #ifndef _SMB2_GLOB_H #define _SMB2_GLOB_H -#define SMB2_MAGIC_NUMBER 0xFE534D42 - /* ***************************************************************** * Constants go here diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index cdcdef32759e..b25623e3fe3d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -847,16 +847,17 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve * SMB2 header. 
* * @ses: server session structure + * @server: pointer to server info * @iov: array containing the SMB request we will send to the server * @nvec: number of array entries for the iov */ int -smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) +smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct kvec *iov, int nvec) { int i, rc; struct sdesc *d; struct smb2_hdr *hdr; - struct TCP_Server_Info *server = cifs_ses_server(ses); hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c5b1dea54ebc..af5d0830bc8a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -13,6 +13,7 @@ #include <linux/sort.h> #include <crypto/aead.h> #include <linux/fiemap.h> +#include <uapi/linux/magic.h> #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" @@ -121,9 +122,13 @@ smb2_add_credits(struct TCP_Server_Info *server, optype, scredits, add); } + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect - || server->tcpStatus == CifsExiting) + || server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return; + } + spin_unlock(&cifs_tcp_ses_lock); switch (rc) { case -1: @@ -208,11 +213,15 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, return rc; spin_lock(&server->req_lock); } else { + spin_unlock(&server->req_lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->req_lock); + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; } + spin_unlock(&cifs_tcp_ses_lock); + spin_lock(&server->req_lock); scredits = server->credits; /* can deadlock with reopen */ if (scredits <= 8) { @@ -384,14 +393,16 @@ smb2_need_neg(struct TCP_Server_Info *server) } static int -smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) +smb2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc; spin_lock(&GlobalMid_Lock); - cifs_ses_server(ses)->CurrentMid = 0; + server->CurrentMid = 0; spin_unlock(&GlobalMid_Lock); - rc = SMB2_negotiate(xid, ses); + rc = SMB2_negotiate(xid, ses, server); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) rc = -EHOSTDOWN; @@ -2747,7 +2758,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, goto qfs_exit; rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - buf->f_type = SMB2_MAGIC_NUMBER; + buf->f_type = SMB2_SUPER_MAGIC; info = (struct smb2_fs_full_size_info *)( le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), @@ -2789,7 +2800,7 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB311_posix_qfs_info(xid, tcon, fid.persistent_fid, fid.volatile_fid, buf); - buf->f_type = SMB2_MAGIC_NUMBER; + buf->f_type = SMB2_SUPER_MAGIC; SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return rc; } @@ -4808,7 +4819,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { if (!is_offloaded) - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } @@ -4981,10 +4992,12 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->callback(mid); } else { + spin_lock(&cifs_tcp_ses_lock); spin_lock(&GlobalMid_Lock); if (dw->server->tcpStatus == CifsNeedReconnect) { mid->mid_state = MID_RETRY_NEEDED; spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); 
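/*
 * Illustrative annotation, not part of the patch itself: note the lock
 * hierarchy used whenever both locks are held, as in
 * smb2_decrypt_offload() here -- cifs_tcp_ses_lock is taken before
 * GlobalMid_Lock and the two are released in reverse order, and both
 * are dropped before mid->callback() runs, since the callback may do
 * arbitrary work that must not happen under a spinlock.
 */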
mid->callback(mid); } else { mid->mid_state = MID_REQUEST_SUBMITTED; @@ -4992,6 +5005,7 @@ static void smb2_decrypt_offload(struct work_struct *work) list_add_tail(&mid->qhead, &dw->server->pending_mid_q); spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); } } cifs_mid_q_entry_release(mid); @@ -5221,13 +5235,13 @@ smb3_receive_transform(struct TCP_Server_Info *server, sizeof(struct smb2_hdr)) { cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_server_dbg(VFS, "Transform message is broken\n"); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2f5f2c4c6183..8d471df69c59 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -142,7 +142,7 @@ static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) { - int rc; + int rc = 0; struct nls_table *nls_codepage; struct cifs_ses *ses; int retries; @@ -162,6 +162,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) return 0; + spin_lock(&cifs_tcp_ses_lock); if (tcon->tidStatus == CifsExiting) { /* * only tree disconnect, open, and write, @@ -171,11 +172,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if ((smb2_command != SMB2_WRITE) && (smb2_command != SMB2_CREATE) && (smb2_command != SMB2_TREE_DISCONNECT)) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb2_command); return -ENODEV; } } + spin_unlock(&cifs_tcp_ses_lock); if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || (!tcon->ses->server) || !server) return -EIO; @@ -214,8 +217,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } /* are we still trying to reconnect? */ - if (server->tcpStatus != CifsNeedReconnect) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); break; + } + spin_unlock(&cifs_tcp_ses_lock); if (retries && --retries) continue; @@ -232,64 +239,70 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, retries = server->nr_targets; } - if (!tcon->ses->need_reconnect && !tcon->need_reconnect) + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { + spin_unlock(&ses->chan_lock); return 0; + } + cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", + tcon->ses->chans_need_reconnect, + tcon->need_reconnect); + spin_unlock(&ses->chan_lock); nls_codepage = load_nls_default(); /* - * need to prevent multiple threads trying to simultaneously reconnect - * the same SMB session - */ - mutex_lock(&tcon->ses->session_mutex); - - /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. 
*/ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EHOSTDOWN; - mutex_unlock(&tcon->ses->session_mutex); goto out; } + spin_unlock(&cifs_tcp_ses_lock); /* - * If we are reconnecting an extra channel, bind + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session */ - if (CIFS_SERVER_IS_CHAN(server)) { - ses->binding = true; - ses->binding_chan = cifs_ses_find_chan(ses, server); + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + + goto out; } + spin_unlock(&ses->chan_lock); - rc = cifs_negotiate_protocol(0, tcon->ses); - if (!rc && tcon->ses->need_reconnect) { - rc = cifs_setup_session(0, tcon->ses, nls_codepage); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) { + rc = cifs_setup_session(0, ses, server, nls_codepage); if ((rc == -EACCES) && !tcon->retry) { + mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; - ses->binding = false; - ses->binding_chan = NULL; - mutex_unlock(&tcon->ses->session_mutex); goto failed; } } - /* - * End of channel binding - */ - ses->binding = false; - ses->binding_chan = NULL; if (rc || !tcon->need_reconnect) { - mutex_unlock(&tcon->ses->session_mutex); + mutex_unlock(&ses->session_mutex); goto out; } +skip_sess_setup: cifs_mark_open_files_invalid(tcon); if (tcon->use_persistent) tcon->need_reopen_files = true; rc = cifs_tree_connect(0, tcon, nls_codepage); - mutex_unlock(&tcon->ses->session_mutex); + mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { @@ -833,7 +846,9 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) */ int -SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) +SMB2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct smb_rqst rqst; struct smb2_negotiate_req *req; @@ -842,7 +857,6 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec rsp_iov; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server = cifs_ses_server(ses); int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; @@ -1221,6 +1235,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) struct SMB2_sess_data { unsigned int xid; struct cifs_ses *ses; + struct TCP_Server_Info *server; struct nls_table *nls_cp; void (*func)(struct SMB2_sess_data *); int result; @@ -1242,9 +1257,10 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_req *req; - struct TCP_Server_Info *server = cifs_ses_server(ses); unsigned int total_len; + bool is_binding = false; rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, server, (void **) &req, @@ -1252,11 +1268,16 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - if (sess_data->ses->binding) { - req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid); + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (is_binding) { + req->hdr.SessionId = cpu_to_le64(ses->Suid); req->hdr.Flags |= SMB2_FLAGS_SIGNED; req->PreviousSessionId = 0; req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; + cifs_dbg(FYI, "Binding to sess id: 
%llx\n", ses->Suid); } else { /* First session, not a reauthenticate */ req->hdr.SessionId = 0; @@ -1266,6 +1287,8 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) */ req->PreviousSessionId = cpu_to_le64(sess_data->previous_session); req->Flags = 0; /* MBZ */ + cifs_dbg(FYI, "Fresh session. Previous: %llx\n", + sess_data->previous_session); } /* enough to enable echos and oplocks and one max size write */ @@ -1325,7 +1348,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* BB add code to build os and lm fields */ rc = cifs_send_recv(sess_data->xid, sess_data->ses, - cifs_ses_server(sess_data->ses), + sess_data->server, &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); @@ -1340,11 +1363,11 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) { int rc = 0; struct cifs_ses *ses = sess_data->ses; - struct TCP_Server_Info *server = cifs_ses_server(ses); + struct TCP_Server_Info *server = sess_data->server; mutex_lock(&server->srv_mutex); if (server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); + rc = server->ops->generate_signingkey(ses, server); if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); @@ -1359,13 +1382,16 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - /* keep existing ses state if binding */ - if (!ses->binding) { - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); - } + + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); + + /* Even if one channel is active, session is in good state */ + spin_lock(&cifs_tcp_ses_lock); + server->tcpStatus = CifsGood; + ses->status = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); return rc; } @@ -1376,15 +1402,17 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct cifs_spnego_msg *msg; struct key *spnego_key = NULL; struct smb2_sess_setup_rsp *rsp = NULL; + bool is_binding = false; rc = SMB2_sess_alloc_buffer(sess_data); if (rc) goto out; - spnego_key = cifs_get_spnego_key(ses); + spnego_key = cifs_get_spnego_key(ses, server); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); if (rc == -ENOKEY) @@ -1405,8 +1433,12 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; } + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + /* keep session key if binding */ - if (!ses->binding) { + if (!is_binding) { ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { @@ -1427,7 +1459,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; /* keep session id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1459,10 +1491,12 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_rsp *rsp = NULL; unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ u16 blob_length = 0; + bool is_binding = false; /* * If memory allocation is 
successful, caller of this function @@ -1480,7 +1514,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) goto out_err; rc = build_ntlmssp_negotiate_blob(&ntlmssp_blob, - &blob_length, ses, + &blob_length, ses, server, sess_data->nls_cp); if (rc) goto out_err; @@ -1519,8 +1553,12 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + /* keep existing ses id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1545,11 +1583,13 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_req *req; struct smb2_sess_setup_rsp *rsp = NULL; unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ u16 blob_length = 0; + bool is_binding = false; rc = SMB2_sess_alloc_buffer(sess_data); if (rc) @@ -1558,8 +1598,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; req->hdr.SessionId = cpu_to_le64(ses->Suid); - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - sess_data->nls_cp); + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, + ses, server, + sess_data->nls_cp); if (rc) { cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); goto out; @@ -1580,8 +1621,12 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + /* keep existing ses id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1612,11 +1657,13 @@ out: } static int -SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +SMB2_select_sec(struct SMB2_sess_data *sess_data) { int type; + struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype); + type = smb2_select_sectype(server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); @@ -1640,10 +1687,10 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); struct SMB2_sess_data *sess_data; cifs_dbg(FYI, "Session Setup\n"); @@ -1657,15 +1704,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, if (!sess_data) return -ENOMEM; - rc = SMB2_select_sec(ses, sess_data); - if (rc) - goto out; sess_data->xid = xid; sess_data->ses = ses; + sess_data->server = server; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; sess_data->previous_session = ses->Suid; + rc = SMB2_select_sec(sess_data); + if (rc) + goto out; + /* * Initialize the session hash with the server one. 
*/ @@ -1704,8 +1753,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) return -EIO; /* no need to send SMB logoff if uid already closed due to reconnect */ - if (ses->need_reconnect) + spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + spin_unlock(&ses->chan_lock); goto smb2_session_already_dead; + } + spin_unlock(&ses->chan_lock); rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, ses->server, (void **) &req, &total_len); @@ -1867,7 +1920,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->share_flags = le32_to_cpu(rsp->ShareFlags); tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); + spin_lock(&cifs_tcp_ses_lock); tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); @@ -1913,8 +1968,13 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; - if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + spin_lock(&ses->chan_lock); + if ((tcon->need_reconnect) || + (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); close_cached_dir_lease(&tcon->crfid); @@ -3797,13 +3857,16 @@ SMB2_echo(struct TCP_Server_Info *server) .rq_nvec = 1 }; unsigned int total_len; - cifs_dbg(FYI, "In echo request\n"); + cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); /* No need to send echo on newly established connections */ mod_delayed_work(cifsiod_wq, &server->reconnect, 0); return rc; } + spin_unlock(&cifs_tcp_ses_lock); rc = smb2_plain_req_init(SMB2_ECHO, NULL, server, (void **)&req, &total_len); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 096fada16ebd..4a7062fd1c26 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -123,8 +123,11 @@ extern void smb2_set_related(struct smb_rqst *rqst); * SMB2 Worker functions - most of protocol specific implementation details * are contained within these calls. 
*/ -extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses); +extern int SMB2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses); extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, @@ -276,6 +279,7 @@ extern void smb2_copy_fs_info_to_kstatfs( struct kstatfs *kst); extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); extern int smb311_update_preauth_hash(struct cifs_ses *ses, + struct TCP_Server_Info *server, struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2bf047b390a9..b70a49b4edc0 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -100,7 +100,8 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) goto out; found: - if (ses->binding) { + if (cifs_chan_needs_reconnect(ses, server) && + !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { /* * If we are in the process of binding a new channel * to an existing session, use the master connection @@ -390,12 +391,18 @@ struct derivation_triplet { static int generate_smb3signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct derivation_triplet *ptriplet) { int rc; -#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS - struct TCP_Server_Info *server = ses->server; -#endif + bool is_binding = false; + int chan_index = 0; + + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + chan_index = cifs_ses_get_chan_index(ses, server); + /* TODO: introduce ref counting for channels when they can be freed */ + spin_unlock(&ses->chan_lock); /* * All channels use the same encryption/decryption keys but @@ -407,10 +414,10 @@ generate_smb3signingkey(struct cifs_ses *ses, * master connection signing key stored in the session */ - if (ses->binding) { + if (is_binding) { rc = generate_key(ses, ptriplet->signing.label, ptriplet->signing.context, - cifs_ses_binding_channel(ses)->signkey, + ses->chans[chan_index].signkey, SMB3_SIGN_KEY_SIZE); if (rc) return rc; @@ -422,6 +429,7 @@ generate_smb3signingkey(struct cifs_ses *ses, if (rc) return rc; + /* safe to access primary channel, since it will never go away */ memcpy(ses->chans[0].signkey, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); @@ -470,7 +478,8 @@ generate_smb3signingkey(struct cifs_ses *ses, } int -generate_smb30signingkey(struct cifs_ses *ses) +generate_smb30signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct derivation_triplet triplet; @@ -494,11 +503,12 @@ generate_smb30signingkey(struct cifs_ses *ses) d->context.iov_base = "ServerOut"; d->context.iov_len = 10; - return generate_smb3signingkey(ses, &triplet); + return generate_smb3signingkey(ses, server, &triplet); } int -generate_smb311signingkey(struct cifs_ses *ses) +generate_smb311signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct derivation_triplet triplet; @@ -522,7 +532,7 @@ generate_smb311signingkey(struct cifs_ses *ses) d->context.iov_base = ses->preauth_sha_hash; d->context.iov_len = 64; - return generate_smb3signingkey(ses, &triplet); + return generate_smb3signingkey(ses, server, &triplet); } int @@ -624,8 +634,12 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (!is_signed)
return 0; - if (server->tcpStatus == CifsNeedNegotiate) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return 0; + } + spin_unlock(&cifs_tcp_ses_lock); if (!is_binding && !server->session_estab) { strncpy(shdr->Signature, "BSRSPYL", 8); return 0; @@ -741,30 +755,41 @@ static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb2_hdr *shdr, struct mid_q_entry **mid) { - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } if (server->tcpStatus == CifsNeedNegotiate && - shdr->Command != SMB2_NEGOTIATE) + shdr->Command != SMB2_NEGOTIATE) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } if (ses->status == CifsNew) { if ((shdr->Command != SMB2_SESSION_SETUP) && - (shdr->Command != SMB2_NEGOTIATE)) + (shdr->Command != SMB2_NEGOTIATE)) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are setting up session */ } if (ses->status == CifsExiting) { - if (shdr->Command != SMB2_LOGOFF) + if (shdr->Command != SMB2_LOGOFF) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are shutting down the session */ } + spin_unlock(&cifs_tcp_ses_lock); *mid = smb2_mid_entry_alloc(shdr, server); if (*mid == NULL) @@ -837,9 +862,13 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate && - shdr->Command != SMB2_NEGOTIATE) + shdr->Command != SMB2_NEGOTIATE) { + spin_unlock(&cifs_tcp_ses_lock); return ERR_PTR(-EAGAIN); + } + spin_unlock(&cifs_tcp_ses_lock); smb2_seq_num_into_buf(server, shdr); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 61ea3d3f95b4..93f0e8c1ea23 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -430,9 +430,9 @@ unmask: * be taken as the remainder of this one. We need to kill the * socket so the server throws away the partial SMB */ - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); } @@ -578,10 +578,14 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, return -ERESTARTSYS; spin_lock(&server->req_lock); } else { + spin_unlock(&server->req_lock); + + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->req_lock); + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; } + spin_unlock(&cifs_tcp_ses_lock); /* * For normal commands, reserve the last MAX_COMPOUND @@ -596,6 +600,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, * for servers that are slow to hand out credits on * new sessions. 
*/ + spin_lock(&server->req_lock); if (!optype && num_credits == 1 && server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { @@ -723,28 +728,36 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { + spin_lock(&cifs_tcp_ses_lock); if (ses->server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; } if (ses->server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && - (in_buf->Command != SMB_COM_NEGOTIATE)) + (in_buf->Command != SMB_COM_NEGOTIATE)) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are setting up session */ } if (ses->status == CifsExiting) { /* check if SMB session is bad because we are setting it up */ - if (in_buf->Command != SMB_COM_LOGOFF_ANDX) + if (in_buf->Command != SMB_COM_LOGOFF_ANDX) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are shutting down session */ } + spin_unlock(&cifs_tcp_ses_lock); *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) @@ -1044,19 +1057,11 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!ses) return NULL; - spin_lock(&ses->chan_lock); - if (!ses->binding) { - /* round robin */ - if (ses->chan_count > 1) { - index = (uint)atomic_inc_return(&ses->chan_seq); - index %= ses->chan_count; - } - spin_unlock(&ses->chan_lock); - return ses->chans[index].server; - } else { - spin_unlock(&ses->chan_lock); - return cifs_ses_server(ses); - } + /* round robin */ + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + + return ses->chans[index].server; } int @@ -1084,8 +1089,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* * Wait for all the requests to become available. @@ -1188,12 +1197,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ + spin_lock(&cifs_tcp_ses_lock); if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + spin_unlock(&cifs_tcp_ses_lock); + mutex_lock(&server->srv_mutex); - smb311_update_preauth_hash(ses, rqst[0].rq_iov, - rqst[0].rq_nvec); + smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec); mutex_unlock(&server->srv_mutex); + + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); for (i = 0; i < num_rqst; i++) { rc = wait_for_response(server, midQ[i]); @@ -1256,15 +1270,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. 
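/*
 * Sketch of the simplified cifs_pick_channel() above: with the binding
 * special case gone, selection is a lock-free round robin over the
 * channel array. Userspace analogue in C11; the kernel's
 * atomic_inc_return() returns the incremented value, which only shifts
 * the starting point of the rotation. Names are illustrative.
 */
#include <stdatomic.h>

#define NCHANS 3

static _Atomic unsigned int chan_seq;
static int servers[NCHANS] = { 10, 11, 12 };	/* stand-ins for channels */

static int pick_channel(void)
{
	unsigned int index = atomic_fetch_add(&chan_seq, 1) % NCHANS;

	return servers[index];
}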
*/ + spin_lock(&cifs_tcp_ses_lock); if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; + spin_unlock(&cifs_tcp_ses_lock); mutex_lock(&server->srv_mutex); - smb311_update_preauth_hash(ses, &iov, 1); + smb311_update_preauth_hash(ses, server, &iov, 1); mutex_unlock(&server->srv_mutex); + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); out: /* @@ -1353,8 +1371,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1494,8 +1516,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1553,10 +1579,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ + spin_lock(&cifs_tcp_ses_lock); if ((rc == -ERESTARTSYS) && (midQ->mid_state == MID_REQUEST_SUBMITTED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { + spin_unlock(&cifs_tcp_ses_lock); if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the @@ -1595,7 +1623,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* We got the response - restart system call. 
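/*
 * Kernel-style fragment (illustrative, not standalone) of the lock
 * dance used around smb311_update_preauth_hash() above: srv_mutex may
 * sleep, so cifs_tcp_ses_lock (a spinlock) must be dropped first and
 * retaken afterwards, at which point the guarded condition is only a
 * stale snapshot. All names below are placeholders.
 */
spin_lock(&state_lock);
if (need_slow_path) {
	spin_unlock(&state_lock);	/* never sleep under a spinlock */

	mutex_lock(&slow_mutex);	/* may sleep */
	do_slow_work();
	mutex_unlock(&slow_mutex);

	spin_lock(&state_lock);		/* retake; state may have changed */
}
spin_unlock(&state_lock);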
*/ rstart = 1; + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); rc = cifs_sync_mid_result(midQ, server); if (rc != 0) diff --git a/fs/coredump.c b/fs/coredump.c index a6b3c196cdef..7dece20b162b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -347,13 +347,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code, int flags) +static int zap_process(struct task_struct *start, int exit_code) { struct task_struct *t; int nr = 0; /* ignore all signals except SIGKILL, see prepare_signal() */ - start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; + start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -372,13 +372,13 @@ static int zap_process(struct task_struct *start, int exit_code, int flags) static int zap_threads(struct task_struct *tsk, struct core_state *core_state, int exit_code) { + struct signal_struct *signal = tsk->signal; int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); - if (!signal_group_exit(tsk->signal)) { - tsk->signal->core_state = core_state; - tsk->signal->group_exit_task = tsk; - nr = zap_process(tsk, exit_code, 0); + if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) { + signal->core_state = core_state; + nr = zap_process(tsk, exit_code); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); tsk->flags |= PF_DUMPCORE; atomic_set(&core_state->nr_threads, nr); @@ -426,8 +426,6 @@ static void coredump_finish(bool core_dumped) spin_lock_irq(¤t->sighand->siglock); if (core_dumped && !__fatal_signal_pending(current)) current->signal->group_exit_code |= 0x80; - current->signal->group_exit_task = NULL; - current->signal->flags = SIGNAL_GROUP_EXIT; next = current->signal->core_state->dumper.next; current->signal->core_state = NULL; spin_unlock_irq(¤t->sighand->siglock); @@ -709,26 +709,26 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_entry(mapping, index, false); } -static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, struct page *to, unsigned long vaddr) +static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) { + return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); +} + +static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) +{ + pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); void *vto, *kaddr; - pgoff_t pgoff; long rc; int id; - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; } - vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)kaddr, vaddr, to); + vto = kmap_atomic(vmf->cow_page); + copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); kunmap_atomic(vto); dax_read_unlock(id); return 0; @@ -1005,22 +1005,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; -} - static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, pfn_t *pfnp) { - const sector_t sector = dax_iomap_sector(iomap, pos); - pgoff_t pgoff; + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc; long length; - rc = bdev_dax_pgoff(iomap->bdev, 
sector, size, &pgoff); - if (rc) - return rc; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), NULL, pfnp); @@ -1126,42 +1117,87 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) +static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, + unsigned int offset, size_t size) { - sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); - pgoff_t pgoff; - long rc, id; void *kaddr; - bool page_aligned = false; - unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); + long ret; - if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - (size == PAGE_SIZE)) - page_aligned = true; + ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + if (ret > 0) { + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, kaddr + offset, size); + } + return ret; +} - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; +static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +{ + const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + u64 length = iomap_length(iter); + s64 written = 0; + + /* already zeroed? we're done. */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + do { + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); + long rc; + int id; + + id = dax_read_lock(); + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + else + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size); + dax_read_unlock(id); - id = dax_read_lock(); + if (rc < 0) + return rc; + pos += size; + length -= size; + written += size; + if (did_zero) + *did_zero = true; + } while (length > 0); - if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); - else - rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); - if (rc < 0) { - dax_read_unlock(id); - return rc; - } + return written; +} - if (!page_aligned) { - memset(kaddr + offset, 0, size); - dax_flush(iomap->dax_dev, kaddr + offset, size); - } - dax_read_unlock(id); - return size; +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_DAX | IOMAP_ZERO, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_zero_iter(&iter, did_zero); + return ret; } +EXPORT_SYMBOL_GPL(dax_zero_range); + +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + const struct iomap_ops *ops) +{ + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); + + /* Block boundary? 
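/*
 * Compilable userspace model of the two calculations above (assuming
 * 4 KiB pages): dax_iomap_pgoff() maps a file position to a page
 * offset on the DAX device, and dax_zero_iter() walks the range in
 * page-sized steps, taking the fast whole-page path only when a chunk
 * is exactly one aligned page. The constants are examples only.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* addr: device byte address that backs file offset `offset` */
static uint64_t iomap_pgoff(uint64_t addr, uint64_t offset, uint64_t pos)
{
	return (addr + (pos & PAGE_MASK) - offset) >> PAGE_SHIFT;
}

int main(void)
{
	uint64_t pos = 0x3100, length = 0x2000;	/* 8 KiB, unaligned start */

	while (length) {
		uint64_t off = pos & (PAGE_SIZE - 1);	/* offset_in_page() */
		uint64_t size = PAGE_SIZE - off < length ?
				PAGE_SIZE - off : length;
		int whole = (off == 0 && size == PAGE_SIZE);

		printf("pgoff %llu off 0x%llx size 0x%llx -> %s\n",
		       (unsigned long long)iomap_pgoff(0x100000, 0x3000, pos),
		       (unsigned long long)off, (unsigned long long)size,
		       whole ? "dax_zero_page_range" : "memset + flush");
		pos += size;
		length -= size;
	}
	return 0;
}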
Nothing to do */ + if (!off) + return 0; + return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) @@ -1169,7 +1205,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const struct iomap *iomap = &iomi->iomap; loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; ssize_t ret = 0; @@ -1203,9 +1238,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); - const sector_t sector = dax_iomap_sector(iomap, pos); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; - pgoff_t pgoff; void *kaddr; if (fatal_signal_pending(current)) { @@ -1213,10 +1247,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); - if (ret) - break; - map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); if (map_len < 0) { @@ -1230,11 +1260,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - /* - * The userspace address for the memory copy has already been - * validated via access_ok() in either vfs_read() or - * vfs_write(), depending on which operation we are doing. - */ if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); @@ -1274,6 +1299,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(iter), + .flags = IOMAP_DAX, }; loff_t done = 0; int ret; @@ -1332,19 +1358,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, const struct iomap_iter *iter) { - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos); - unsigned long vaddr = vmf->address; vm_fault_t ret; int error = 0; switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: - clear_user_highpage(vmf->cow_page, vaddr); + clear_user_highpage(vmf->cow_page, vmf->address); break; case IOMAP_MAPPED: - error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev, - sector, vmf->cow_page, vaddr); + error = copy_cow_page_dax(vmf, iter); break; default: WARN_ON_ONCE(1); @@ -1430,7 +1453,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, .inode = mapping->host, .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, .len = PAGE_SIZE, - .flags = IOMAP_FAULT, + .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = 0; void *entry; @@ -1539,7 +1562,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct iomap_iter iter = { .inode = mapping->host, .len = PMD_SIZE, - .flags = IOMAP_FAULT, + .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 7d162b0efbf0..950c63fa4d0b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -147,7 +147,7 @@ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { - if ((inode->i_mode & 07777) == 0444 && + if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && diff --git a/fs/dlm/ast.c 
b/fs/dlm/ast.c index 283c7b94edda..bfac462dd3e8 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -9,6 +9,8 @@ ******************************************************************************* ******************************************************************************/ +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lock.h" #include "user.h" @@ -254,10 +256,12 @@ void dlm_callback_work(struct work_struct *work) continue; } else if (callbacks[i].flags & DLM_CB_BAST) { bastfn(lkb->lkb_astparam, callbacks[i].mode); + trace_dlm_bast(ls, lkb, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; castfn(lkb->lkb_astparam); + trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } @@ -295,7 +299,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - int count = 0; + int count = 0, sum = 0; + bool empty; clear_bit(LSFL_CB_DELAY, &ls->ls_flags); @@ -311,14 +316,17 @@ more: if (count == MAX_CB_QUEUE) break; } + empty = list_empty(&ls->ls_cb_delay); mutex_unlock(&ls->ls_cb_mutex); - if (count) - log_rinfo(ls, "dlm_callback_resume %d", count); - if (count == MAX_CB_QUEUE) { + sum += count; + if (!empty) { count = 0; cond_resched(); goto more; } + + if (sum) + log_rinfo(ls, "%s %d", __func__, sum); } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 47e9d57e4cae..8fb04ebbafb5 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -635,6 +635,35 @@ static int table_open2(struct inode *inode, struct file *file) return 0; } +static ssize_t table_write2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + int n, len, lkb_nodeid, lkb_status, error; + char name[DLM_RESNAME_MAXLEN + 1] = {}; + struct dlm_ls *ls = seq->private; + unsigned int lkb_flags; + char buf[256] = {}; + uint32_t lkb_id; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d", + &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status); + if (n != 5) + return -EINVAL; + + len = strnlen(name, DLM_RESNAME_MAXLEN); + error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags, + lkb_nodeid, lkb_status); + if (error) + return error; + + return count; +} + static int table_open3(struct inode *inode, struct file *file) { struct seq_file *seq; @@ -675,6 +704,7 @@ static const struct file_operations format2_fops = { .owner = THIS_MODULE, .open = table_open2, .read = seq_read, + .write = table_write2, .llseek = seq_lseek, .release = seq_release }; @@ -724,10 +754,35 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf, return rv; } +static ssize_t waiters_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + int mstype, to_nodeid; + char buf[128] = {}; + uint32_t lkb_id; + int n, error; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid); + if (n != 3) + return -EINVAL; + + error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid); + if (error) + return error; + + return count; +} + static const struct file_operations waiters_fops = { .owner = THIS_MODULE, .open = simple_open, .read = waiters_read, + .write = waiters_write, .llseek = default_llseek, }; @@ 
-768,6 +823,42 @@ static int dlm_version_show(struct seq_file *file, void *offset) } DEFINE_SHOW_ATTRIBUTE(dlm_version); +static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + void *buf; + int ret; + + if (count > PAGE_SIZE || count < sizeof(struct dlm_header)) + return -EINVAL; + + buf = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, count)) { + ret = -EFAULT; + goto out; + } + + ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count); + if (ret) + goto out; + + kfree(buf); + return count; + +out: + kfree(buf); + return ret; +} + +static const struct file_operations dlm_rawmsg_fops = { + .open = simple_open, + .write = dlm_rawmsg_write, + .llseek = no_llseek, +}; + void *dlm_create_debug_comms_file(int nodeid, void *data) { struct dentry *d_node; @@ -782,6 +873,7 @@ void *dlm_create_debug_comms_file(int nodeid, void *data) debugfs_create_file("send_queue_count", 0444, d_node, data, &dlm_send_queue_cnt_fops); debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops); return d_node; } @@ -809,7 +901,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &format2_fops); @@ -840,7 +932,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &waiters_fops); diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 45ebbe602bbf..b6692f81ec83 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -84,8 +84,7 @@ int dlm_recover_directory(struct dlm_ls *ls) for (;;) { int left; - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto out_free; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5f57538b5d45..74a9590a4dd5 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -41,12 +41,6 @@ #include <linux/dlm.h> #include "config.h" -/* Size of the temp buffer midcomms allocates on the stack. - We try to make this large enough so most messages fit. - FIXME: should sctp make this unnecessary? 
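/*
 * The dlm debugfs write handlers above share one input pattern; a
 * condensed kernel-style sketch (hypothetical handler, builds only in
 * a kernel tree): bound the copy by the local buffer, copy_from_user(),
 * parse with sscanf(), and return `count` on success so userspace sees
 * the whole write consumed.
 */
static ssize_t demo_write(struct file *file, const char __user *user_buf,
			  size_t count, loff_t *ppos)
{
	char buf[128] = {};
	uint32_t id;
	int val;

	if (copy_from_user(buf, user_buf,
			   min_t(size_t, sizeof(buf) - 1, count)))
		return -EFAULT;

	if (sscanf(buf, "%x %d", &id, &val) != 2)
		return -EINVAL;

	/* act on id/val here */
	return count;
}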
*/ - -#define DLM_INBUF_LEN 148 - struct dlm_ls; struct dlm_lkb; struct dlm_rsb; @@ -554,8 +548,9 @@ struct dlm_ls { uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; - int ls_count; /* refcount of processes in + atomic_t ls_count; /* refcount of processes in the dlm using this ls */ + wait_queue_head_t ls_count_wait; int ls_create_count; /* create/release refcount */ unsigned long ls_flags; /* LSFL_ */ unsigned long ls_scan_time; @@ -581,6 +576,7 @@ struct dlm_ls { struct list_head ls_new_rsb; /* new rsb structs */ spinlock_t ls_remove_spin; + wait_queue_head_t ls_remove_wait; char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; int ls_remove_len; @@ -632,6 +628,8 @@ struct dlm_ls { struct rw_semaphore ls_in_recovery; /* block local requests */ struct rw_semaphore ls_recv_active; /* block dlm_recv */ struct list_head ls_requestqueue;/* queue remote requests */ + atomic_t ls_requestqueue_cnt; + wait_queue_head_t ls_requestqueue_wait; struct mutex ls_requestqueue_mutex; struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c502c065d007..bdb51d209ba2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -53,6 +53,8 @@ R: do_xxxx() L: receive_xxxx_reply() <- R: send_xxxx_reply() */ +#include <trace/events/dlm.h> + #include <linux/types.h> #include <linux/rbtree.h> #include <linux/slab.h> @@ -1178,7 +1180,8 @@ static void detach_lkb(struct dlm_lkb *lkb) } } -static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, + int start, int end) { struct dlm_lkb *lkb; int rv; @@ -1199,7 +1202,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT); if (rv >= 0) lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); @@ -1215,6 +1218,11 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) return 0; } +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + return _create_lkb(ls, lkb_ret, 1, 0); +} + static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; @@ -1618,21 +1626,24 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) } /* If there's an rsb for the same resource being removed, ensure - that the remove message is sent before the new lookup message. - It should be rare to need a delay here, but if not, then it may - be worthwhile to add a proper wait mechanism rather than a delay. */ + * that the remove message is sent before the new lookup message. 
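/*
 * The _create_lkb() refactor above leans on idr_alloc()'s window
 * semantics: idr_alloc(idr, ptr, start, end, gfp) returns the lowest
 * free ID in [start, end), and end == 0 means "no upper bound". The
 * two call sites shown therefore reduce to:
 *
 *	idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
 *		-- normal path: any free ID >= 1 (the old behaviour)
 *	idr_alloc(&ls->ls_lkbidr, lkb, lkb_id, lkb_id + 1, GFP_NOWAIT);
 *		-- debug path: succeeds only if exactly lkb_id is free
 */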
+ */ + +#define DLM_WAIT_PENDING_COND(ls, r) \ + (ls->ls_remove_len && \ + !rsb_cmp(r, ls->ls_remove_name, \ + ls->ls_remove_len)) static void wait_pending_remove(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; restart: spin_lock(&ls->ls_remove_spin); - if (ls->ls_remove_len && - !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { + if (DLM_WAIT_PENDING_COND(ls, r)) { log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); + r->res_dir_nodeid, r->res_name); spin_unlock(&ls->ls_remove_spin); - msleep(1); + wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); goto restart; } spin_unlock(&ls->ls_remove_spin); @@ -1784,6 +1795,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); send_remove(r); @@ -3437,6 +3449,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_lock_start(ls, lkb, mode, flags); + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); if (error) @@ -3450,6 +3464,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, mode, flags, error); + if (convert || error) __put_lkb(ls, lkb); if (error == -EAGAIN || error == -EDEADLK) @@ -3481,6 +3497,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + error = set_unlock_args(flags, astarg, &args); if (error) goto out_put; @@ -3495,6 +3513,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -3973,6 +3993,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) int from = ms->m_header.h_nodeid; int error = 0; + /* currently mixing of user/kernel locks is not supported */ + if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) { + log_error(lkb->lkb_resource->res_ls, + "got user dlm message for a kernel lock"); + error = -EINVAL; + goto out; + } + switch (ms->m_type) { case DLM_MSG_CONVERT: case DLM_MSG_UNLOCK: @@ -4001,6 +4029,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) error = -EINVAL; } +out: if (error) log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", @@ -4050,6 +4079,7 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); rv = _create_message(ls, sizeof(struct dlm_message) + len, dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); @@ -6301,3 +6331,64 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, return error; } +/* debug functionality */ +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + unsigned int lkb_flags, int lkb_nodeid, int lkb_status) +{ + struct dlm_lksb *lksb; + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + /* we currently can't set a valid user lock */ + if (lkb_flags & DLM_IFL_USER) + return -EOPNOTSUPP; + + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) + return -ENOMEM; + + error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1); + if (error) { + kfree(lksb); + return error; + } + + lkb->lkb_flags = lkb_flags; +
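/*
 * Userspace analogue (hypothetical names) of the msleep() ->
 * wait_event() conversion in wait_pending_remove() above: the waiter
 * blocks until the predicate clears instead of polling, and the path
 * that publishes ls_remove_name signals the change, mirroring the new
 * wake_up(&ls->ls_remove_wait) calls.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t remove_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t remove_wait = PTHREAD_COND_INITIALIZER;
static bool remove_pending;

static void demo_wait_pending_remove(void)
{
	pthread_mutex_lock(&remove_lock);
	while (remove_pending)		/* predicate rechecked on wakeup */
		pthread_cond_wait(&remove_wait, &remove_lock);
	pthread_mutex_unlock(&remove_lock);
}

static void demo_finish_remove(void)
{
	pthread_mutex_lock(&remove_lock);
	remove_pending = false;
	pthread_cond_broadcast(&remove_wait);	/* kernel side: wake_up() */
	pthread_mutex_unlock(&remove_lock);
}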
lkb->lkb_nodeid = lkb_nodeid; + lkb->lkb_lksb = lksb; + /* user specific pointer, just don't have it NULL for kernel locks */ + if (~lkb_flags & DLM_IFL_USER) + lkb->lkb_astparam = (void *)0xDEADBEEF; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) { + kfree(lksb); + __put_lkb(ls, lkb); + return error; + } + + lock_rsb(r); + attach_lkb(r, lkb); + add_lkb(r, lkb, lkb_status); + unlock_rsb(r); + put_rsb(r); + + return 0; +} + +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, lkb_id, &lkb); + if (error) + return error; + + error = add_to_waiters(lkb, mstype, to_nodeid); + dlm_put_lkb(lkb); + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 456c6ec3ef6f..252a5898f908 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -58,6 +58,10 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, int nodeid, int pid); int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + unsigned int lkb_flags, int lkb_nodeid, int lkb_status); +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid); static inline int is_master(struct dlm_rsb *r) { diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 10eddfa6c3d7..0d3833a124a3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -216,8 +216,7 @@ static int do_uevent(struct dlm_ls *ls, int in) return ls->ls_uevent_result; } -static int dlm_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int dlm_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); @@ -314,7 +313,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id) list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_global_id == id) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -331,7 +330,7 @@ struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_local_handle == lockspace) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -348,7 +347,7 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_device.minor == minor) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -360,24 +359,24 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) void dlm_put_lockspace(struct dlm_ls *ls) { - spin_lock(&lslist_lock); - ls->ls_count--; - spin_unlock(&lslist_lock); + if (atomic_dec_and_test(&ls->ls_count)) + wake_up(&ls->ls_count_wait); } static void remove_lockspace(struct dlm_ls *ls) { - for (;;) { - spin_lock(&lslist_lock); - if (ls->ls_count == 0) { - WARN_ON(ls->ls_create_count != 0); - list_del(&ls->ls_list); - spin_unlock(&lslist_lock); - return; - } +retry: + wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0); + + spin_lock(&lslist_lock); + if (atomic_read(&ls->ls_count) != 0) { spin_unlock(&lslist_lock); - ssleep(1); + goto retry; } + + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); } static int threads_start(void) @@ -481,7 +480,8 @@ static int new_lockspace(const char *name, const char *cluster, memcpy(ls->ls_name, name, namelen); ls->ls_namelen =
namelen; ls->ls_lvblen = lvblen; - ls->ls_count = 0; + atomic_set(&ls->ls_count, 0); + init_waitqueue_head(&ls->ls_count_wait); ls->ls_flags = 0; ls->ls_scan_time = jiffies; @@ -511,6 +511,7 @@ static int new_lockspace(const char *name, const char *cluster, } spin_lock_init(&ls->ls_remove_spin); + init_waitqueue_head(&ls->ls_remove_wait); for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, @@ -564,6 +565,8 @@ static int new_lockspace(const char *name, const char *cluster, init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); INIT_LIST_HEAD(&ls->ls_requestqueue); + atomic_set(&ls->ls_requestqueue_cnt, 0); + init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); @@ -868,7 +871,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * until this returns. * * Force has 4 possible values: - * 0 - don't destroy locksapce if it has any LKBs + * 0 - don't destroy lockspace if it has any LKBs * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs * 2 - destroy lockspace regardless of LKBs * 3 - destroy lockspace as part of a forced shutdown diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8f715c620e1f..e284d696c1fd 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -53,9 +53,12 @@ #include <net/sctp/sctp.h> #include <net/ipv6.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" +#include "memory.h" #include "config.h" #define NEEDED_RMEM (4*1024*1024) @@ -84,7 +87,6 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; - struct mutex wq_alloc; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -189,6 +191,24 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void writequeue_entry_ctor(void *data) +{ + struct writequeue_entry *entry = data; + + INIT_LIST_HEAD(&entry->msgs); +} + +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) +{ + return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), + 0, 0, writequeue_entry_ctor); +} + +struct kmem_cache *dlm_lowcomms_msg_cache_create(void) +{ + return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL); +} + /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { @@ -199,7 +219,10 @@ static struct writequeue_entry *con_next_wq(struct connection *con) e = list_first_entry(&con->writequeue, struct writequeue_entry, list); - if (e->len == 0) + /* if len is zero nothing is to send, if there are users filling + * buffers we wait until the users are done so we can send more. + */ + if (e->users || e->len == 0) return NULL; return e; @@ -265,8 +288,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - mutex_init(&con->wq_alloc); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. 
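/*
 * Userspace analogue of the ls_count rework in the lockspace.c hunks
 * above: lookups take a reference with an atomic increment, the final
 * put wakes a waiter, and teardown blocks for a zero count instead of
 * ssleep(1) polling. The kernel code still rechecks the count under
 * lslist_lock afterwards, since a lookup may race in before the
 * lockspace is unlinked. Names here are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>

static _Atomic int ls_count;
static pthread_mutex_t cnt_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cnt_wait = PTHREAD_COND_INITIALIZER;

static void ls_get(void)
{
	atomic_fetch_add(&ls_count, 1);
}

static void ls_put(void)
{
	if (atomic_fetch_sub(&ls_count, 1) == 1) {	/* last reference */
		pthread_mutex_lock(&cnt_lock);
		pthread_cond_broadcast(&cnt_wait);	/* kernel: wake_up() */
		pthread_mutex_unlock(&cnt_lock);
	}
}

static void ls_wait_unused(void)
{
	pthread_mutex_lock(&cnt_lock);
	while (atomic_load(&ls_count) != 0)
		pthread_cond_wait(&cnt_wait, &cnt_lock);
	pthread_mutex_unlock(&cnt_lock);
}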
Instead of locking hot path __find_con() @@ -486,11 +507,9 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); - read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_listen_data_ready(struct sock *sk) @@ -505,15 +524,14 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (!con) - goto out; + return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("successful connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); - goto out; + return; } clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -524,8 +542,6 @@ static void lowcomms_write_space(struct sock *sk) } queue_work(send_workqueue, &con->swork); -out: - read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) @@ -592,42 +608,41 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { struct connection *con; - struct sockaddr_storage saddr; void (*orig_report)(struct sock *) = NULL; + struct inet_sock *inet; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con == NULL) goto out; orig_report = listen_sock.sk_error_report; - if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { - printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d, port %d, " - "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, dlm_config.ci_tcp_port, - sk->sk_err, sk->sk_err_soft); - } else if (saddr.ss_family == AF_INET) { - struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + inet = inet_sk(sk); + switch (sk->sk_family) { + case AF_INET: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %pI4, port %d, " + "sending to node %d at %pI4, dport %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, &sin4->sin_addr.s_addr, - dlm_config.ci_tcp_port, sk->sk_err, + con->nodeid, &inet->inet_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); - } else { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr; - + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %u.%u.%u.%u, " - "port %d, sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, sin6->sin6_addr.s6_addr32[0], - sin6->sin6_addr.s6_addr32[1], - sin6->sin6_addr.s6_addr32[2], - sin6->sin6_addr.s6_addr32[3], - dlm_config.ci_tcp_port, sk->sk_err, + "sending to node %d at %pI6c, " + "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sk->sk_v6_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); + break; +#endif + default: + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "invalid socket family %d set, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + sk->sk_family, sk->sk_err, sk->sk_err_soft); + goto out; } /* below sendcon only handling */ @@ -646,7 +661,6 @@ static void lowcomms_error_report(struct sock *sk) queue_work(send_workqueue, &con->swork); out: - read_unlock_bh(&sk->sk_callback_lock); if (orig_report) orig_report(sk); } @@ -666,20 +680,20 @@ static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = 
listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } static void add_listen_sock(struct socket *sock, struct listen_connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); save_listen_callbacks(sock); con->sock = sock; @@ -687,7 +701,7 @@ static void add_listen_sock(struct socket *sock, struct listen_connection *con) sk->sk_allocation = GFP_NOFS; /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_listen_data_ready; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Make a socket active */ @@ -695,7 +709,7 @@ static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); con->sock = sock; sk->sk_user_data = con; @@ -705,7 +719,7 @@ static void add_sock(struct socket *sock, struct connection *con) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -733,7 +747,7 @@ static void dlm_page_release(struct kref *kref) ref); __free_page(e->page); - kfree(e); + dlm_free_writequeue(e); } static void dlm_msg_release(struct kref *kref) @@ -741,7 +755,7 @@ static void dlm_msg_release(struct kref *kref) struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); kref_put(&msg->entry->ref, dlm_page_release); - kfree(msg); + dlm_free_msg(msg); } static void free_entry(struct writequeue_entry *e) @@ -925,6 +939,7 @@ static int receive_from_sock(struct connection *con) msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); if (ret == -EAGAIN) break; else if (ret <= 0) @@ -1013,10 +1028,28 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { - unsigned char *b=(unsigned char *)&peeraddr; - log_print("connect from non cluster node"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); + switch (peeraddr.ss_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; + + log_print("connect from non cluster IPv4 node %pI4", + &sin->sin_addr); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; + + log_print("connect from non cluster IPv6 node %pI6c", + &sin6->sin6_addr); + break; + } +#endif + default: + log_print("invalid family from non cluster node"); + break; + } + sock_release(newsock); return -1; } @@ -1177,33 +1210,33 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -static struct writequeue_entry *new_writequeue_entry(struct connection *con, - gfp_t allocation) +static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; - entry = kzalloc(sizeof(*entry), allocation); + entry = dlm_allocate_writequeue(); if (!entry) return NULL; - entry->page = alloc_page(allocation | __GFP_ZERO); + entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { - kfree(entry); + dlm_free_writequeue(entry); return NULL; } + entry->offset = 0; + entry->len = 0; + entry->end = 0; + 
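/*
 * The caches introduced above use a slab constructor. In the kernel, a
 * kmem_cache constructor runs when the allocator populates a slab page
 * with objects -- not on every kmem_cache_alloc() -- so it may only set
 * up invariants that freed objects preserve (here the empty msgs list
 * head), while per-use fields (offset/len/end/dirty) are reset by hand
 * after each allocation, exactly as new_writequeue_entry() does above.
 * Illustrative kernel-style fragment:
 */
static void entry_ctor(void *obj)
{
	struct writequeue_entry *e = obj;

	INIT_LIST_HEAD(&e->msgs);	/* invariant across alloc/free cycles */
}

cache = kmem_cache_create("demo_entry", sizeof(struct writequeue_entry),
			  0, 0, entry_ctor);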
entry->dirty = false; entry->con = con; entry->users = 1; kref_init(&entry->ref); - INIT_LIST_HEAD(&entry->msgs); - return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + char **ppc, void (*cb)(void *data), + void *data) { struct writequeue_entry *e; @@ -1215,74 +1248,54 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, *ppc = page_address(e->page) + e->end; if (cb) - cb(mh); + cb(data); e->end += len; e->users++; - spin_unlock(&con->writequeue_lock); - - return e; + goto out; } } - spin_unlock(&con->writequeue_lock); - e = new_writequeue_entry(con, allocation); + e = new_writequeue_entry(con); if (!e) - return NULL; + goto out; kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; atomic_inc(&con->writequeue_cnt); - - spin_lock(&con->writequeue_lock); if (cb) - cb(mh); + cb(data); list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); +out: + spin_unlock(&con->writequeue_lock); return e; }; static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + void (*cb)(void *data), + void *data) { struct writequeue_entry *e; struct dlm_msg *msg; - bool sleepable; - msg = kzalloc(sizeof(*msg), allocation); + msg = dlm_allocate_msg(allocation); if (!msg) return NULL; - /* this mutex is being used as a wait to avoid multiple "fast" - * new writequeue page list entry allocs in new_wq_entry in - * normal operation which is sleepable context. Without it - * we could end in multiple writequeue entries with one - * dlm message because multiple callers were waiting at - * the writequeue_lock in new_wq_entry(). 
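/*
 * Design note on the removed wq_alloc mutex: new_wq_entry() now makes
 * its reuse-or-allocate decision entirely under writequeue_lock (a
 * spinlock), so the race the mutex papered over is gone -- but nothing
 * may sleep there, which is why the writequeue cache and alloc_page()
 * above use GFP_ATOMIC. Kernel-style fragment with a hypothetical
 * helper:
 */
spin_lock(&con->writequeue_lock);
e = try_reuse_tail_entry(con, len);	/* hypothetical: append to last entry */
if (e)
	goto out;
e = dlm_allocate_writequeue();		/* GFP_ATOMIC: no sleeping here */
if (e)
	list_add_tail(&e->list, &con->writequeue);
out:
spin_unlock(&con->writequeue_lock);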
- */ - sleepable = gfpflags_normal_context(allocation); - if (sleepable) - mutex_lock(&con->wq_alloc); - kref_init(&msg->ref); - e = new_wq_entry(con, len, allocation, ppc, cb, mh); + e = new_wq_entry(con, len, ppc, cb, data); if (!e) { - if (sleepable) - mutex_unlock(&con->wq_alloc); - - kfree(msg); + dlm_free_msg(msg); return NULL; } - if (sleepable) - mutex_unlock(&con->wq_alloc); - + msg->retransmit = false; + msg->orig_msg = NULL; msg->ppc = *ppc; msg->len = len; msg->entry = e; @@ -1291,8 +1304,8 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, } struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + char **ppc, void (*cb)(void *data), + void *data) { struct connection *con; struct dlm_msg *msg; @@ -1313,7 +1326,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, return NULL; } - msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data); if (!msg) { srcu_read_unlock(&connections_srcu, idx); return NULL; @@ -1403,7 +1416,6 @@ static void send_to_sock(struct connection *con) if (!e) break; - e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); @@ -1411,6 +1423,7 @@ static void send_to_sock(struct connection *con) ret = kernel_sendpage(con->sock, e->page, offset, len, msg_flags); + trace_dlm_send(con->nodeid, ret); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && @@ -1680,9 +1693,9 @@ static void _stop_conn(struct connection *con, bool and_other) set_bit(CF_READ_PENDING, &con->flags); set_bit(CF_WRITE_PENDING, &con->flags); if (con->sock && con->sock->sk) { - write_lock_bh(&con->sock->sk->sk_callback_lock); + lock_sock(con->sock->sk); con->sock->sk->sk_user_data = NULL; - write_unlock_bh(&con->sock->sk->sk_callback_lock); + release_sock(con->sock->sk); } if (con->othercon && and_other) _stop_conn(con->othercon, false); @@ -1775,7 +1788,7 @@ static int dlm_listen_for_all(void) result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); + log_print("Can't create comms socket: %d", result); goto out; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 4ccae07cf005..29369feea991 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -38,8 +38,8 @@ void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh); + char **ppc, void (*cb)(void *data), + void *data); void dlm_lowcomms_commit_msg(struct dlm_msg *msg); void dlm_lowcomms_put_msg(struct dlm_msg *msg); int dlm_lowcomms_resend_msg(struct dlm_msg *msg); @@ -47,6 +47,8 @@ int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_receive_done(int nodeid); +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void); +struct kmem_cache *dlm_lowcomms_msg_cache_create(void); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/main.c b/fs/dlm/main.c index afc66a1346d3..1c5be4b70ac1 100644 --- 
a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -19,6 +19,9 @@ #include "config.h" #include "lowcomms.h" +#define CREATE_TRACE_POINTS +#include <trace/events/dlm.h> + static int __init init_dlm(void) { int error; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 731d489aa323..61f906e705db 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -442,8 +442,7 @@ static int ping_members(struct dlm_ls *ls) int error = 0; list_for_each_entry(memb, &ls->ls_nodes, list) { - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; break; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 5918f4d39586..ce35c3c19aeb 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -10,32 +10,61 @@ ******************************************************************************/ #include "dlm_internal.h" +#include "midcomms.h" +#include "lowcomms.h" #include "config.h" #include "memory.h" +static struct kmem_cache *writequeue_cache; +static struct kmem_cache *mhandle_cache; +static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { + writequeue_cache = dlm_lowcomms_writequeue_cache_create(); + if (!writequeue_cache) + goto out; + + mhandle_cache = dlm_midcomms_cache_create(); + if (!mhandle_cache) + goto mhandle; + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) - return -ENOMEM; + goto lkb; + + msg_cache = dlm_lowcomms_msg_cache_create(); + if (!msg_cache) + goto msg; rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); - if (!rsb_cache) { - kmem_cache_destroy(lkb_cache); - return -ENOMEM; - } + if (!rsb_cache) + goto rsb; return 0; + +rsb: + kmem_cache_destroy(msg_cache); +msg: + kmem_cache_destroy(lkb_cache); +lkb: + kmem_cache_destroy(mhandle_cache); +mhandle: + kmem_cache_destroy(writequeue_cache); +out: + return -ENOMEM; } void dlm_memory_exit(void) { + kmem_cache_destroy(writequeue_cache); + kmem_cache_destroy(mhandle_cache); + kmem_cache_destroy(msg_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); } @@ -89,3 +118,32 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } +struct dlm_mhandle *dlm_allocate_mhandle(void) +{ + return kmem_cache_alloc(mhandle_cache, GFP_NOFS); +} + +void dlm_free_mhandle(struct dlm_mhandle *mhandle) +{ + kmem_cache_free(mhandle_cache, mhandle); +} + +struct writequeue_entry *dlm_allocate_writequeue(void) +{ + return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC); +} + +void dlm_free_writequeue(struct writequeue_entry *writequeue) +{ + kmem_cache_free(writequeue_cache, writequeue); +} + +struct dlm_msg *dlm_allocate_msg(gfp_t allocation) +{ + return kmem_cache_alloc(msg_cache, allocation); +} + +void dlm_free_msg(struct dlm_msg *msg) +{ + kmem_cache_free(msg_cache, msg); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 4f218ea4b187..7bd3f1a391ca 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -20,6 +20,12 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); +struct dlm_mhandle *dlm_allocate_mhandle(void); +void dlm_free_mhandle(struct dlm_mhandle *mhandle); +struct writequeue_entry *dlm_allocate_writequeue(void); +void dlm_free_writequeue(struct writequeue_entry *writequeue); +struct dlm_msg *dlm_allocate_msg(gfp_t allocation); +void dlm_free_msg(struct dlm_msg *msg); 
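/*
 * dlm_memory_init() above uses the kernel's standard unwind ladder:
 * each failure jumps to a label that tears down everything created so
 * far, in reverse order, so there is exactly one cleanup path per
 * step. Minimal portable sketch with hypothetical create_a/b/c:
 */
#include <stdlib.h>

static void *create_a(void) { return malloc(1); }
static void *create_b(void) { return malloc(1); }
static void *create_c(void) { return malloc(1); }

static void *a, *b, *c;

static int demo_init(void)
{
	a = create_a();
	if (!a)
		goto out;
	b = create_b();
	if (!b)
		goto free_a;
	c = create_c();
	if (!c)
		goto free_b;
	return 0;

free_b:
	free(b);
free_a:
	free(a);
out:
	return -1;	/* the kernel code returns -ENOMEM here */
}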
#endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 7ae39ec8d9b0..3635e42b0669 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -137,6 +137,7 @@ #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" +#include "memory.h" #include "lock.h" #include "util.h" #include "midcomms.h" @@ -220,6 +221,12 @@ DEFINE_STATIC_SRCU(nodes_srcu); */ static DEFINE_MUTEX(close_lock); +struct kmem_cache *dlm_midcomms_cache_create(void) +{ + return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle), + 0, 0, NULL); +} + static inline const char *dlm_state_str(int state) { switch (state) { @@ -279,7 +286,7 @@ static void dlm_mhandle_release(struct rcu_head *rcu) struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); dlm_lowcomms_put_msg(mh->msg); - kfree(mh); + dlm_free_mhandle(mh); } static void dlm_mhandle_delete(struct midcomms_node *node, @@ -909,11 +916,11 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (le32_to_cpu(hd->h_version)) { - case DLM_VERSION_3_1: + switch (hd->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_VERSION_3_2: + case cpu_to_le32(DLM_VERSION_3_2): dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: @@ -969,7 +976,7 @@ void dlm_midcomms_receive_done(int nodeid) spin_unlock(&node->state_lock); /* do nothing FIN has it's own ack send */ break; - }; + } srcu_read_unlock(&nodes_srcu, idx); } @@ -1020,8 +1027,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, header_out(&opts->o_header); } -static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +static void midcomms_new_msg_cb(void *data) { + struct dlm_mhandle *mh = data; + atomic_inc(&mh->node->send_queue_cnt); spin_lock(&mh->node->send_queue_lock); @@ -1071,10 +1080,12 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, /* this is a bug, however we going on and hope it will be resolved */ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = kzalloc(sizeof(*mh), GFP_NOFS); + mh = dlm_allocate_mhandle(); if (!mh) goto err; + mh->committed = false; + mh->ack_rcv = NULL; mh->idx = idx; mh->node = node; @@ -1083,7 +1094,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, NULL, NULL); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } @@ -1092,13 +1103,13 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, ppc); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } break; default: - kfree(mh); + dlm_free_mhandle(mh); WARN_ON(1); goto err; } @@ -1134,7 +1145,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_lowcomms_commit_msg(mh->msg); dlm_lowcomms_put_msg(mh->msg); /* mh is not part of rcu list in this case */ - kfree(mh); + dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: dlm_midcomms_commit_msg_3_2(mh); @@ -1231,7 +1242,7 @@ void dlm_midcomms_add_member(int nodeid) } node->users++; - pr_debug("users inc count %d\n", node->users); + pr_debug("node %d users inc count %d\n", nodeid, node->users); spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); @@ -1254,7 +1265,7 @@ void dlm_midcomms_remove_member(int nodeid) spin_lock(&node->state_lock); node->users--; - pr_debug("users dec count %d\n", node->users); + pr_debug("node %d users dec 
count %d\n", nodeid, node->users); /* hitting users count to zero means the * other side is running dlm_midcomms_stop() @@ -1425,3 +1436,51 @@ int dlm_midcomms_close(int nodeid) return ret; } + +/* debug functionality to send raw dlm msg from user space */ +struct dlm_rawmsg_data { + struct midcomms_node *node; + void *buf; +}; + +static void midcomms_new_rawmsg_cb(void *data) +{ + struct dlm_rawmsg_data *rd = data; + struct dlm_header *h = rd->buf; + + switch (h->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): + break; + default: + switch (h->h_cmd) { + case DLM_OPTS: + if (!h->u.h_seq) + h->u.h_seq = rd->node->seq_send++; + break; + default: + break; + } + break; + } +} + +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen) +{ + struct dlm_rawmsg_data rd; + struct dlm_msg *msg; + char *msgbuf; + + rd.node = node; + rd.buf = buf; + + msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS, + &msgbuf, midcomms_new_rawmsg_cb, &rd); + if (!msg) + return -ENOMEM; + + memcpy(msgbuf, buf, buflen); + dlm_lowcomms_commit_msg(msg); + return 0; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 579abc6929be..82bcd9661922 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -28,6 +28,9 @@ const char *dlm_midcomms_state(struct midcomms_node *node); unsigned long dlm_midcomms_flags(struct midcomms_node *node); int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); uint32_t dlm_midcomms_version(struct midcomms_node *node); +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen); +struct kmem_cache *dlm_midcomms_cache_create(void); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 6cba86470278..5821b777a1a7 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -601,7 +601,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) spin_lock(&ls->ls_recover_lock); status = ls->ls_recover_status; - stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + stop = dlm_recovery_stopped(ls); seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 97d052cea5a9..a55dfce705dd 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -124,8 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto fail; } diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index e89e0ff8bfa3..ccb5307c21e9 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,6 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->nodeid = nodeid; memcpy(&e->request, ms, ms->m_header.h_length); + atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); list_add_tail(&e->list, &ls->ls_requestqueue); mutex_unlock(&ls->ls_requestqueue_mutex); @@ -89,6 +90,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); if (dlm_locking_stopped(ls)) { @@ -115,14 +118,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) void dlm_wait_requestqueue(struct dlm_ls *ls) { - for (;;) { - mutex_lock(&ls->ls_requestqueue_mutex); - if (list_empty(&ls->ls_requestqueue)) - break; - mutex_unlock(&ls->ls_requestqueue_mutex); - schedule(); - } - mutex_unlock(&ls->ls_requestqueue_mutex); + 
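/*
 * Both version switches above (dlm_process_incoming_buffer() and
 * midcomms_new_rawmsg_cb()) compare the on-wire __le32 field against
 * cpu_to_le32() of a constant instead of byte-swapping the field: the
 * conversion folds into the case label at build time and the value
 * never leaves wire format, which also keeps sparse's endianness
 * checking happy. Kernel-style fragment with hypothetical handlers:
 */
switch (hd->h_version) {		/* __le32, stays in wire format */
case cpu_to_le32(DLM_VERSION_3_1):	/* constant-folded at build time */
	handle_v3_1();			/* hypothetical */
	break;
case cpu_to_le32(DLM_VERSION_3_2):
	handle_v3_2();			/* hypothetical */
	break;
}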
wait_event(ls->ls_requestqueue_wait, + atomic_read(&ls->ls_requestqueue_cnt) == 0); } static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) @@ -130,7 +127,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) uint32_t type = ms->m_type; /* the ls is being cleaned up and freed by release_lockspace */ - if (!ls->ls_count) + if (!atomic_read(&ls->ls_count)) return 1; if (dlm_is_removed(ls, nodeid)) @@ -161,6 +158,8 @@ void dlm_purge_requestqueue(struct dlm_ls *ls) if (purge_request(ls, ms, e->nodeid)) { list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); } } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d66bbd2df191..2dd23a82e0de 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -537,7 +537,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (mnt_user_ns(path.mnt) != &init_user_ns) { + if (is_idmapped_mnt(path.mnt)) { rc = -EINVAL; printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); goto out_free; diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 756fe2d65272..8a3317e38e5a 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 579406504919..19e6c56a9f47 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -12,7 +12,7 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_out; + unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; /* indicate the algorithm will be used for decompression */ @@ -87,6 +87,8 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, return page->mapping == MNGD_MAPPING(sbi); } +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize); int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0e35ef3f9f3d..fa7ddb7ad980 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -9,37 +9,71 @@ #include <linux/dax.h> #include <trace/events/erofs.h> -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) +void erofs_unmap_metabuf(struct erofs_buf *buf) +{ + if (buf->kmap_type == EROFS_KMAP) + kunmap(buf->page); + else if (buf->kmap_type == EROFS_KMAP_ATOMIC) + kunmap_atomic(buf->base); + buf->base = NULL; + buf->kmap_type = EROFS_NO_KMAP; +} + +void erofs_put_metabuf(struct erofs_buf *buf) +{ + if (!buf->page) + return; + erofs_unmap_metabuf(buf); + put_page(buf->page); + buf->page = NULL; +} + +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) { struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; - struct page *page; - - page = read_cache_page_gfp(mapping, blkaddr, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - /* should already be PageUptodate */ - if (!IS_ERR(page)) - lock_page(page); - return page; + erofs_off_t offset = blknr_to_addr(blkaddr); + 
pgoff_t index = offset >> PAGE_SHIFT; + struct page *page = buf->page; + + if (!page || page->index != index) { + erofs_put_metabuf(buf); + page = read_cache_page_gfp(mapping, index, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(page)) + return page; + /* should already be PageUptodate, no need to lock page */ + buf->page = page; + } + if (buf->kmap_type == EROFS_NO_KMAP) { + if (type == EROFS_KMAP) + buf->base = kmap(page); + else if (type == EROFS_KMAP_ATOMIC) + buf->base = kmap_atomic(page); + buf->kmap_type = type; + } else if (buf->kmap_type != type) { + DBG_BUGON(1); + return ERR_PTR(-EFAULT); + } + if (type == EROFS_NO_KMAP) + return NULL; + return buf->base + (offset & ~PAGE_MASK); } static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map, int flags) { - int err = 0; erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - trace_erofs_map_blocks_flatmode_enter(inode, map, flags); - - nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); lastblk = nblocks - tailendpacking; /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; @@ -51,30 +85,23 @@ static int erofs_map_blocks_flatmode(struct inode *inode, vi->xattr_isize + erofs_blkoff(map->m_la); map->m_plen = inode->i_size - offset; - /* inline data should be located in one meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + /* inline data should be located in the same meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", vi->nid); DBG_BUGON(1); - err = -EFSCORRUPTED; - goto err_out; + return -EFSCORRUPTED; } - map->m_flags |= EROFS_MAP_META; } else { erofs_err(inode->i_sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); - err = -EIO; - goto err_out; + return -EIO; } - - map->m_llen = map->m_plen; -err_out: - trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); - return err; + return 0; } static int erofs_map_blocks(struct inode *inode, @@ -83,12 +110,14 @@ static int erofs_map_blocks(struct inode *inode, struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; u64 chunknr; unsigned int unit; erofs_off_t pos; + void *kaddr; int err = 0; + trace_erofs_map_blocks_enter(inode, map, flags); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -97,8 +126,10 @@ static int erofs_map_blocks(struct inode *inode, goto out; } - if (vi->datalayout != EROFS_INODE_CHUNK_BASED) - return erofs_map_blocks_flatmode(inode, map, flags); + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { + err = erofs_map_blocks_flatmode(inode, map, flags); + goto out; + } if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) unit = sizeof(*idx); /* chunk index */ @@ -109,17 +140,18 @@ static int erofs_map_blocks(struct inode *inode, pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos)); - if (IS_ERR(page)) - return PTR_ERR(page); - + kaddr = erofs_read_metabuf(&buf, 
sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out; + } map->m_la = chunknr << vi->chunkbits; map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = page_address(page) + erofs_blkoff(pos); + __le32 *blkaddr = kaddr + erofs_blkoff(pos); if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; @@ -130,7 +162,7 @@ static int erofs_map_blocks(struct inode *inode, goto out_unlock; } /* parse chunk indexes */ - idx = page_address(page) + erofs_blkoff(pos); + idx = kaddr + erofs_blkoff(pos); switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -143,10 +175,11 @@ static int erofs_map_blocks(struct inode *inode, break; } out_unlock: - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); out: - map->m_llen = map->m_plen; + if (!err) + map->m_llen = map->m_plen; + trace_erofs_map_blocks_exit(inode, map, flags, 0); return err; } @@ -159,6 +192,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) /* primary device by default */ map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; + map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -169,6 +203,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; up_read(&devs->rwsem); } else if (devs->extra_devices) { down_read(&devs->rwsem); @@ -185,6 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_pa -= startoff; map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; break; } } @@ -215,9 +251,13 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret) return ret; - iomap->bdev = mdev.m_bdev; - iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; + if (flags & IOMAP_DAX) { + iomap->dax_dev = mdev.m_daxdev; + iomap->offset += mdev.m_dax_part_off; + } else { + iomap->bdev = mdev.m_bdev; + } iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; @@ -231,16 +271,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } if (map.m_flags & EROFS_MAP_META) { - struct page *ipage; + void *ptr; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(mdev.m_pa)); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - iomap->inline_data = page_address(ipage) + - erofs_blkoff(mdev.m_pa); - iomap->private = ipage; + ptr = erofs_read_metabuf(&buf, inode->i_sb, + erofs_blknr(mdev.m_pa), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa); + iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; @@ -251,12 +291,17 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - struct page *ipage = iomap->private; + void *ptr = iomap->private; + + if (ptr) { + struct erofs_buf buf = { + .page = kmap_to_page(ptr), + .base = ptr, + .kmap_type = EROFS_KMAP, + }; - if (ipage) { DBG_BUGON(iomap->type != IOMAP_INLINE); - unlock_page(ipage); - put_page(ipage); + 
erofs_put_metabuf(&buf); } else { DBG_BUGON(iomap->type == IOMAP_INLINE); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index bf37fc76b182..3efa686c7644 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,6 +16,14 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif +struct z_erofs_lz4_decompress_ctx { + struct z_erofs_decompress_req *rq; + /* # of encoded, decoded pages */ + unsigned int inpages, outpages; + /* decoded block total length (used for in-place decompression) */ + unsigned int oend; +}; + int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -56,11 +64,10 @@ int z_erofs_load_lz4_config(struct super_block *sb, * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, struct page **pagepool) { - const unsigned int nr = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -70,7 +77,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, unsigned int i, j, top; top = 0; - for (i = j = 0; i < nr; ++i, ++j) { + for (i = j = 0; i < ctx->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -112,41 +119,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, void *inpage, unsigned int *inputmargin, int *maptype, - bool support_0padding) + bool may_inplace) { - unsigned int nrpages_in, nrpages_out; - unsigned int ofull, oend, inputsize, total, i, j; + struct z_erofs_decompress_req *rq = ctx->rq; + unsigned int omargin, total, i, j; struct page **in; void *src, *tmp; - inputsize = rq->inputsize; - nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT; - oend = rq->pageofs_out + rq->outputsize; - ofull = PAGE_ALIGN(oend); - nrpages_out = ofull >> PAGE_SHIFT; - if (rq->inplace_io) { - if (rq->partial_decoding || !support_0padding || - ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize)) + omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + if (rq->partial_decoding || !may_inplace || + omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < nrpages_in; ++i) { + for (i = 0; i < ctx->inpages; ++i) { DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < nrpages_out - nrpages_in + i; ++j) + for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) if (rq->out[j] == rq->in[i]) goto docopy; } } - if (nrpages_in <= 1) { + if (ctx->inpages <= 1) { *maptype = 0; return inpage; } kunmap_atomic(inpage); might_sleep(); - src = erofs_vm_map_ram(rq->in, nrpages_in); + src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; @@ -155,7 +157,7 @@ static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = erofs_get_pcpubuf(nrpages_in); + src = erofs_get_pcpubuf(ctx->inpages); if (!src) { 
DBG_BUGON(1); kunmap_atomic(inpage); @@ -182,36 +184,53 @@ docopy: return src; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, +/* + * Get the exact inputsize with zero_padding feature. + * - For LZ4, it should work if zero_padding feature is on (5.3+); + * - For MicroLZMA, it'd be enabled all the time. + */ +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize) +{ + const char *padend; + + padend = memchr_inv(padbuf, 0, padbufsize); + if (!padend) + return -EFSCORRUPTED; + rq->inputsize -= padend - padbuf; + rq->pageofs_in += padend - padbuf; + return 0; +} + +static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, u8 *out) { + struct z_erofs_decompress_req *rq = ctx->rq; + bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *headpage, *src; - bool support_0padding; int ret, maptype; DBG_BUGON(*rq->in == NULL); headpage = kmap_atomic(*rq->in); - inputmargin = 0; - support_0padding = false; - /* decompression inplace is only safe when 0padding is enabled */ - if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { + /* LZ4 decompression inplace is only safe if zero_padding is enabled */ + if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { support_0padding = true; - - while (!headpage[inputmargin & ~PAGE_MASK]) - if (!(++inputmargin & ~PAGE_MASK)) - break; - - if (inputmargin >= rq->inputsize) { + ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + EROFS_BLKSIZ - rq->pageofs_in)); + if (ret) { kunmap_atomic(headpage); - return -EIO; + return ret; } + may_inplace = !((rq->pageofs_in + rq->inputsize) & + (EROFS_BLKSIZ - 1)); } - rq->inputsize -= inputmargin; - src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, - &maptype, support_0padding); + inputmargin = rq->pageofs_in; + src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); @@ -240,9 +259,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, } if (maptype == 0) { - kunmap_atomic(src); + kunmap_atomic(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT); + vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); } else { @@ -255,14 +274,18 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; + ctx.rq = rq; + ctx.oend = rq->pageofs_out + rq->outputsize; + ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; + ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + /* one optimized fast path only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_atomic(*rq->out); dst_maptype = 0; @@ -270,27 +293,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); - if (ret < 0) + ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); + if (ret < 0) { return ret; - if (ret) { + } else if (ret > 0) { dst = page_address(*rq->out); 
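Two related points in the LZ4 rework above. First, z_erofs_fixup_insize() recovers the exact compressed size by skipping leading zero padding with memchr_inv(); for example, a stream stored right-aligned in a 4096-byte block behind two zero bytes ends up with inputsize reduced by 2 and pageofs_in advanced by 2. Second, in-place decompression is only attempted when the slack between the decoded end and the page-aligned buffer end covers the LZ4 in-place margin. A rough standalone sketch of that margin test, reusing the fallback formula defined at the top of this file; the function name is made up:

#include <linux/types.h>
#include <linux/mm.h>	/* PAGE_ALIGN */

/* fallback margin formula from the top of decompressor.c */
#define MY_LZ4_INPLACE_MARGIN(srcsize)	(((srcsize) >> 8) + 32)

/*
 * In-place decompression keeps the compressed stream at the tail of
 * the destination buffer; it is only safe if the decoder can never
 * overwrite compressed bytes it has not consumed yet.
 */
static bool my_lz4_inplace_ok(unsigned int inputsize, unsigned int oend)
{
	unsigned int omargin = PAGE_ALIGN(oend) - oend;

	return omargin >= MY_LZ4_INPLACE_MARGIN(inputsize);
}

Partial decoding still forces the copy path, as the hunk above shows, since a truncated decode invalidates the margin reasoning.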
dst_maptype = 1; - goto dstmap_out; + } else { + dst = erofs_vm_map_ram(rq->out, ctx.outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; } - dst = erofs_vm_map_ram(rq->out, nrpages_out); - if (!dst) - return -ENOMEM; - dst_maptype = 2; - dstmap_out: - ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); - + ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, nrpages_out); + vm_unmap_ram(dst, ctx.outpages); return ret; } @@ -299,7 +320,8 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; + const unsigned int righthalf = min_t(unsigned int, rq->outputsize, + PAGE_SIZE - rq->pageofs_out); unsigned char *src, *dst; if (nrpages_out > 2) { @@ -312,7 +334,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, return 0; } - src = kmap_atomic(*rq->in); + src = kmap_atomic(*rq->in) + rq->pageofs_in; if (rq->out[0]) { dst = kmap_atomic(rq->out[0]); memcpy(dst + rq->pageofs_out, src, righthalf); diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 50045510a1f4..05a3063cf2bc 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -156,7 +156,7 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int nrpages_in = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - unsigned int inputmargin, inlen, outlen, pageofs; + unsigned int inlen, outlen, pageofs; struct z_erofs_lzma *strm; u8 *kin; bool bounced = false; @@ -164,16 +164,13 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, /* 1. get the exact LZMA compressed size */ kin = kmap(*rq->in); - inputmargin = 0; - while (!kin[inputmargin & ~PAGE_MASK]) - if (!(++inputmargin & ~PAGE_MASK)) - break; - - if (inputmargin >= PAGE_SIZE) { + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + EROFS_BLKSIZ - rq->pageofs_in)); + if (err) { kunmap(*rq->in); - return -EFSCORRUPTED; + return err; } - rq->inputsize -= inputmargin; /* 2. get an available lzma context */ again: @@ -193,9 +190,9 @@ again: xz_dec_microlzma_reset(strm->state, inlen, outlen, !rq->partial_decoding); pageofs = rq->pageofs_out; - strm->buf.in = kin + inputmargin; + strm->buf.in = kin + rq->pageofs_in; strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in); inlen -= strm->buf.in_size; strm->buf.out = NULL; strm->buf.out_pos = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 083997a034e5..3ea62c6fb00a 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -17,19 +17,21 @@ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should * be incompatible with this kernel version. 
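The header comment closing just below spells out the incompat-bit contract: any set bit outside EROFS_ALL_FEATURE_INCOMPAT must make the mount fail. A minimal sketch of that gate, assuming the erofs internal.h context (erofs_err() as used elsewhere in this diff); my_check_incompat() is hypothetical:

static int my_check_incompat(struct super_block *sb, u32 feature_incompat)
{
	u32 unsupported = feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT;

	if (unsupported) {
		erofs_err(sb, "unsupported incompat features %#x",
			  unsupported);
		return -EOPNOTSUPP;	/* refuse the mount */
	}
	return 0;
}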
*/ -#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 +#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ + (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ + EROFS_FEATURE_INCOMPAT_ZTAILPACKING) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -209,7 +211,7 @@ struct erofs_xattr_ibody_header { __le32 h_reserved; __u8 h_shared_count; __u8 h_reserved2[7]; - __le32 h_shared_xattrs[0]; /* shared xattr id array */ + __le32 h_shared_xattrs[]; /* shared xattr id array */ }; /* Name indexes */ @@ -226,7 +228,7 @@ struct erofs_xattr_entry { __u8 e_name_index; /* attribute name index */ __le16 e_value_size; /* size of attribute value */ /* followed by e_name and e_value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) @@ -292,13 +294,17 @@ struct z_erofs_lzma_cfgs { * (4B) + 2B + (4B) if compacted 2B is on. * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) + * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 +#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 struct z_erofs_map_header { - __le32 h_reserved1; + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 2345f1de438e..ff62f84f47d3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -13,8 +13,8 @@ * the inode payload page if it's an extended inode) in order to fill * inline data if possible. 
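The [0]-to-[] conversions above (h_shared_xattrs, e_name) switch these on-disk structs to C99 flexible array members, which lets the compiler and fortified memcpy() see real bounds instead of a fake zero-length array. A small sketch of allocating such a struct with the overflow-checked struct_size() helper; the struct is hypothetical:

#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>

struct my_entry {
	u8	name_len;
	char	name[];		/* flexible array member */
};

static struct my_entry *my_entry_alloc(const char *name, u8 len)
{
	/* struct_size() == sizeof(*e) + len * sizeof(e->name[0]),
	 * with integer-overflow checking built in
	 */
	struct my_entry *e = kmalloc(struct_size(e, name, len), GFP_KERNEL);

	if (!e)
		return NULL;
	e->name_len = len;
	memcpy(e->name, name, len);
	return e;
}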
*/ -static struct page *erofs_read_inode(struct inode *inode, - unsigned int *ofs) +static void *erofs_read_inode(struct erofs_buf *buf, + struct inode *inode, unsigned int *ofs) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -22,7 +22,7 @@ static struct page *erofs_read_inode(struct inode *inode, const erofs_off_t inode_loc = iloc(sbi, vi->nid); erofs_blk_t blkaddr, nblks = 0; - struct page *page; + void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; unsigned int ifmt; @@ -34,14 +34,14 @@ static struct page *erofs_read_inode(struct inode *inode, erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", __func__, vi->nid, *ofs, blkaddr); - page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(page)) { + kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(page)); - return page; + vi->nid, PTR_ERR(kaddr)); + return kaddr; } - dic = page_address(page) + *ofs; + dic = kaddr + *ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { @@ -62,12 +62,12 @@ static struct page *erofs_read_inode(struct inode *inode, switch (erofs_inode_version(ifmt)) { case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); - /* check if the inode acrosses page boundary */ - if (*ofs + vi->inode_isize <= PAGE_SIZE) { + /* check if the extended inode acrosses block boundary */ + if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { *ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = PAGE_SIZE - *ofs; + const unsigned int gotten = EROFS_BLKSIZ - *ofs; copied = kmalloc(vi->inode_isize, GFP_NOFS); if (!copied) { @@ -75,18 +75,16 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } memcpy(copied, dic, gotten); - unlock_page(page); - put_page(page); - - page = erofs_get_meta_page(sb, blkaddr + 1); - if (IS_ERR(page)) { - erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld", - vi->nid, PTR_ERR(page)); + kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + EROFS_KMAP); + if (IS_ERR(kaddr)) { + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", + vi->nid, PTR_ERR(kaddr)); kfree(copied); - return page; + return kaddr; } *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, page_address(page), *ofs); + memcpy((u8 *)copied + gotten, kaddr, *ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -200,7 +198,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; else inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; - return page; + return kaddr; bogusimode: erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", @@ -209,12 +207,11 @@ bogusimode: err_out: DBG_BUGON(1); kfree(copied); - unlock_page(page); - put_page(page); + erofs_put_metabuf(buf); return ERR_PTR(err); } -static int erofs_fill_symlink(struct inode *inode, void *data, +static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); @@ -222,7 +219,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE) { + inode->i_size >= EROFS_BLKSIZ) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -232,8 +229,8 @@ static 
int erofs_fill_symlink(struct inode *inode, void *data, return -ENOMEM; m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross page boundary as well */ - if (m_pofs + inode->i_size > PAGE_SIZE) { + /* inline symlink data shouldn't cross block boundary */ + if (m_pofs + inode->i_size > EROFS_BLKSIZ) { kfree(lnk); erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", @@ -241,8 +238,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, DBG_BUGON(1); return -EFSCORRUPTED; } - - memcpy(lnk, data + m_pofs, inode->i_size); + memcpy(lnk, kaddr + m_pofs, inode->i_size); lnk[inode->i_size] = '\0'; inode->i_link = lnk; @@ -253,16 +249,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data, static int erofs_fill_inode(struct inode *inode, int isdir) { struct erofs_inode *vi = EROFS_I(inode); - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; unsigned int ofs; int err = 0; trace_erofs_fill_inode(inode, isdir); /* read inode base data from disk */ - page = erofs_read_inode(inode, &ofs); - if (IS_ERR(page)) - return PTR_ERR(page); + kaddr = erofs_read_inode(&buf, inode, &ofs); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -278,7 +275,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_fop = &erofs_dir_fops; break; case S_IFLNK: - err = erofs_fill_symlink(inode, page_address(page), ofs); + err = erofs_fill_symlink(inode, kaddr, ofs); if (err) goto out_unlock; inode_nohighmem(inode); @@ -302,8 +299,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_mapping->a_ops = &erofs_raw_access_aops; out_unlock: - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3265688af7f9..b8272fb95fd6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -51,17 +51,24 @@ struct erofs_device_info { char *path; struct block_device *bdev; struct dax_device *dax_dev; + u64 dax_part_off; u32 blocks; u32 mapped_blkaddr; }; +enum { + EROFS_SYNC_DECOMPRESS_AUTO, + EROFS_SYNC_DECOMPRESS_FORCE_ON, + EROFS_SYNC_DECOMPRESS_FORCE_OFF +}; + struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; - /* strategy of sync decompression (false - auto, true - force on) */ - bool readahead_sync_decompress; + /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ + unsigned int sync_decompress; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -109,6 +116,7 @@ struct erofs_sb_info { #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; + u64 dax_part_off; u64 total_blocks; u32 primarydevice_blocks; @@ -134,6 +142,10 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; + + /* sysfs support */ + struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ + struct completion s_kobj_unregister; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -241,6 +253,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) #error erofs cannot be used in this platform #endif +enum erofs_kmap_type { + EROFS_NO_KMAP, /* don't map the buffer */ + EROFS_KMAP, /* use kmap() to map the buffer */ + EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ +}; + +struct erofs_buf { + struct page *page; + void *base; 
+ enum erofs_kmap_type kmap_type; +}; +#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) + #define ROOT_NID(sb) ((sb)->root_nid) #define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) @@ -258,10 +283,13 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ return sbi->feature_##compat & EROFS_FEATURE_##feature; \ } -EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) +EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) +EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -296,6 +324,9 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; + unsigned long z_tailextent_headlcn; + unsigned int z_idataoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -390,14 +421,14 @@ enum { #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) struct erofs_map_blocks { + struct erofs_buf buf; + erofs_off_t m_pa, m_la; u64 m_plen, m_llen; unsigned short m_deviceid; char m_algorithmformat; unsigned int m_flags; - - struct page *mpage; }; /* Flags used by erofs_map_blocks_flatmode() */ @@ -409,6 +440,8 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 +/* Used to map tail extent for tailpacking inline pcluster */ +#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -436,6 +469,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_dev { struct block_device *m_bdev; struct dax_device *m_daxdev; + u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; @@ -443,7 +477,10 @@ struct erofs_map_dev { /* data.c */ extern const struct file_operations erofs_file_fops; -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +void erofs_unmap_metabuf(struct erofs_buf *buf); +void erofs_put_metabuf(struct erofs_buf *buf); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -498,6 +535,12 @@ int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +/* sysfs.c */ +int erofs_register_sysfs(struct super_block *sb); +void erofs_unregister_sysfs(struct super_block *sb); +int __init erofs_init_sysfs(void); +void erofs_exit_sysfs(void); + /* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); static inline void erofs_pagepool_add(struct page **pagepool, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6a969b1e0ee6..915eefe0d7e2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
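The struct erofs_buf interface declared above replaces erofs_get_meta_page(): a caller owns one buffer, reads metadata blocks through it (the cached page is reused when the requested block lands on the same page, otherwise it is dropped and replaced), and releases it exactly once. A rough usage sketch against the prototypes above; my_parse_meta() and the parsing step are placeholders:

static int my_parse_meta(struct super_block *sb, erofs_blk_t blkaddr)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *ptr;

	/* EROFS_KMAP may sleep; use EROFS_KMAP_ATOMIC in atomic context */
	ptr = erofs_read_metabuf(&buf, sb, blkaddr, EROFS_KMAP);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* ... parse the metadata block at ptr ... */

	erofs_put_metabuf(&buf);	/* kunmap + put_page in one call */
	return 0;
}

This is what makes the conversions in data.c, inode.c, super.c and xattr.c in this diff mostly mechanical: each unlock_page()/put_page() pair collapses into a single erofs_put_metabuf().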
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include <linux/module.h> #include <linux/buffer_head.h> @@ -124,80 +125,50 @@ static bool check_layout_compatibility(struct super_block *sb, #ifdef CONFIG_EROFS_FS_ZIP /* read variable-sized metadata, offset will be aligned by 4-byte */ -static void *erofs_read_metadata(struct super_block *sb, struct page **pagep, +static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) { - struct page *page = *pagep; u8 *buffer, *ptr; int len, i, cnt; - erofs_blk_t blk; *offset = round_up(*offset, 4); - blk = erofs_blknr(*offset); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP); + if (IS_ERR(ptr)) + return ptr; - if (!page || page->index != blk) { - if (page) { - unlock_page(page); - put_page(page); - } - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) - goto err_nullpage; - } - - ptr = kmap(page); len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); - if (!buffer) { - buffer = ERR_PTR(-ENOMEM); - goto out; - } + if (!buffer) + return ERR_PTR(-ENOMEM); *offset += sizeof(__le16); *lengthp = len; for (i = 0; i < len; i += cnt) { cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); - blk = erofs_blknr(*offset); - - if (!page || page->index != blk) { - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) { - kfree(buffer); - goto err_nullpage; - } - ptr = kmap(page); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), + EROFS_KMAP); + if (IS_ERR(ptr)) { + kfree(buffer); + return ptr; } memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); *offset += cnt; } -out: - kunmap(page); - *pagep = page; return buffer; -err_nullpage: - *pagep = NULL; - return page; } static int erofs_load_compr_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { - struct erofs_sb_info *sbi; - struct page *page; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int algs, alg; erofs_off_t offset; - int size, ret; + int size, ret = 0; - sbi = EROFS_SB(sb); sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); - if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); @@ -205,20 +176,17 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } offset = EROFS_SUPER_OFFSET + sbi->sb_size; - page = NULL; alg = 0; - ret = 0; - for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { void *data; if (!(algs & 1)) continue; - data = erofs_read_metadata(sb, &page, &offset, &size); + data = erofs_read_metadata(sb, &buf, &offset, &size); if (IS_ERR(data)) { ret = PTR_ERR(data); - goto err; + break; } switch (alg) { @@ -234,13 +202,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } kfree(data); if (ret) - goto err; - } -err: - if (page) { - unlock_page(page); - put_page(page); + break; } + erofs_put_metabuf(&buf); return ret; } #else @@ -261,7 +225,7 @@ static int erofs_init_devices(struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int ondisk_extradevs; erofs_off_t pos; - struct page *page = NULL; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; struct erofs_deviceslot *dis; void *ptr; @@ -285,22 +249,13 @@ static int erofs_init_devices(struct 
super_block *sb, pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); idr_for_each_entry(&sbi->devs->tree, dif, id) { - erofs_blk_t blk = erofs_blknr(pos); struct block_device *bdev; - if (!page || page->index != blk) { - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } - - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) { - up_read(&sbi->devs->rwsem); - return PTR_ERR(page); - } - ptr = kmap(page); + ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), + EROFS_KMAP); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + break; } dis = ptr + erofs_blkoff(pos); @@ -309,22 +264,17 @@ static int erofs_init_devices(struct super_block *sb, sb->s_type); if (IS_ERR(bdev)) { err = PTR_ERR(bdev); - goto err_out; + break; } dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); dif->blocks = le32_to_cpu(dis->blocks); dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); sbi->total_blocks += dif->blocks; pos += EROFS_DEVT_SLOT_SIZE; } -err_out: up_read(&sbi->devs->rwsem); - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } + erofs_put_metabuf(&buf); return err; } @@ -411,6 +361,9 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_init_devices(sb, dsb); + + if (erofs_sb_has_ztailpacking(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); out: kunmap(page); put_page(page); @@ -423,7 +376,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) #ifdef CONFIG_EROFS_FS_ZIP ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; ctx->opt.max_sync_decompress_pages = 3; - ctx->opt.readahead_sync_decompress = false; + ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(&ctx->opt, XATTR_USER); @@ -644,7 +597,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off); sbi->devs = ctx->devs; ctx->devs = NULL; @@ -652,10 +605,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - if (test_opt(&sbi->opt, DAX_ALWAYS) && - !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { - errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(&sbi->opt, DAX_ALWAYS); + if (test_opt(&sbi->opt, DAX_ALWAYS)) { + BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); + + if (!sbi->dax_dev) { + errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); + clear_opt(&sbi->opt, DAX_ALWAYS); + } } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -695,6 +651,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + err = erofs_register_sysfs(sb); + if (err) + return err; + erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); return 0; } @@ -808,6 +768,7 @@ static void erofs_put_super(struct super_block *sb) DBG_BUGON(!sbi); + erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); @@ -852,6 +813,10 @@ static int __init erofs_module_init(void) if (err) goto zip_err; + err = erofs_init_sysfs(); + if (err) + goto sysfs_err; + err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; @@ -859,6 +824,8 @@ static int __init erofs_module_init(void) return 0; fs_err: + erofs_exit_sysfs(); +sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: z_erofs_lzma_exit(); @@ -877,6 +844,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); z_erofs_lzma_exit(); erofs_exit_shrinker(); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c new file mode 100644 index 000000000000..dac252bc9228 --- /dev/null +++ b/fs/erofs/sysfs.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. + * https://www.oppo.com/ + */ +#include <linux/sysfs.h> +#include <linux/kobject.h> + +#include "internal.h" + +enum { + attr_feature, + attr_pointer_ui, + attr_pointer_bool, +}; + +enum { + struct_erofs_sb_info, + struct_erofs_mount_opts, +}; + +struct erofs_attr { + struct attribute attr; + short attr_id; + int struct_type, offset; +}; + +#define EROFS_ATTR(_name, _mode, _id) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} +#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) +#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) + +#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .struct_type = struct_##_struct, \ + .offset = offsetof(struct _struct, _name),\ +} + +#define EROFS_ATTR_RW(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) + +#define EROFS_RO_ATTR(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) + +#define EROFS_ATTR_RW_UI(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_ui, _struct) + +#define EROFS_ATTR_RW_BOOL(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_bool, _struct) + +#define ATTR_LIST(name) (&erofs_attr_##name.attr) + +#ifdef CONFIG_EROFS_FS_ZIP +EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +#endif + +static struct attribute *erofs_attrs[] = { +#ifdef CONFIG_EROFS_FS_ZIP + ATTR_LIST(sync_decompress), +#endif + NULL, +}; +ATTRIBUTE_GROUPS(erofs); + +/* Features this copy of erofs supports */ +EROFS_ATTR_FEATURE(zero_padding); +EROFS_ATTR_FEATURE(compr_cfgs); +EROFS_ATTR_FEATURE(big_pcluster); +EROFS_ATTR_FEATURE(chunked_file); +EROFS_ATTR_FEATURE(device_table); +EROFS_ATTR_FEATURE(compr_head2); +EROFS_ATTR_FEATURE(sb_chksum); +EROFS_ATTR_FEATURE(ztailpacking); + +static struct attribute *erofs_feat_attrs[] = { + ATTR_LIST(zero_padding), + ATTR_LIST(compr_cfgs), + 
ATTR_LIST(big_pcluster), + ATTR_LIST(chunked_file), + ATTR_LIST(device_table), + ATTR_LIST(compr_head2), + ATTR_LIST(sb_chksum), + ATTR_LIST(ztailpacking), + NULL, +}; +ATTRIBUTE_GROUPS(erofs_feat); + +static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, + int struct_type, int offset) +{ + if (struct_type == struct_erofs_sb_info) + return (unsigned char *)sbi + offset; + if (struct_type == struct_erofs_mount_opts) + return (unsigned char *)&sbi->opt + offset; + return NULL; +} + +static ssize_t erofs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + + switch (a->attr_id) { + case attr_feature: + return sysfs_emit(buf, "supported\n"); + case attr_pointer_ui: + if (!ptr) + return 0; + return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); + case attr_pointer_bool: + if (!ptr) + return 0; + return sysfs_emit(buf, "%d\n", *(bool *)ptr); + } + return 0; +} + +static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != (unsigned int)t) + return -ERANGE; +#ifdef CONFIG_EROFS_FS_ZIP + if (!strcmp(a->attr.name, "sync_decompress") && + (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) + return -EINVAL; +#endif + *(unsigned int *)ptr = t; + return len; + case attr_pointer_bool: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != 0 && t != 1) + return -EINVAL; + *(bool *)ptr = !!t; + return len; + } + return 0; +} + +static void erofs_sb_release(struct kobject *kobj) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops erofs_attr_ops = { + .show = erofs_attr_show, + .store = erofs_attr_store, +}; + +static struct kobj_type erofs_sb_ktype = { + .default_groups = erofs_groups, + .sysfs_ops = &erofs_attr_ops, + .release = erofs_sb_release, +}; + +static struct kobj_type erofs_ktype = { + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kset erofs_root = { + .kobj = {.ktype = &erofs_ktype}, +}; + +static struct kobj_type erofs_feat_ktype = { + .default_groups = erofs_feat_groups, + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kobject erofs_feat = { + .kset = &erofs_root, +}; + +int erofs_register_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int err; + + sbi->s_kobj.kset = &erofs_root; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + goto put_sb_kobj; + return 0; + +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void erofs_unregister_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} + +int __init erofs_init_sysfs(void) +{ + int ret; + + 
kobject_set_name(&erofs_root.kobj, "erofs"); + erofs_root.kobj.parent = fs_kobj; + ret = kset_register(&erofs_root); + if (ret) + goto root_err; + + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (ret) + goto feat_err; + return ret; + +feat_err: + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +root_err: + return ret; +} + +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 84da2c280012..ec9a1d780dc1 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -150,7 +150,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); /* last refcount should be connected with its managed pslot. */ erofs_workgroup_unfreeze(grp, 0); @@ -165,15 +165,19 @@ static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned int freed = 0; unsigned long index; + xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, grp) { /* try to shrink each valid workgroup */ if (!erofs_try_to_release_workgroup(sbi, grp)) continue; + xa_unlock(&sbi->managed_pslots); ++freed; if (!--nr_shrink) - break; + return freed; + xa_lock(&sbi->managed_pslots); } + xa_unlock(&sbi->managed_pslots); return freed; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 01c581e93c5f..8106bcb5a38d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,39 +2,20 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> #include "xattr.h" struct xattr_iter { struct super_block *sb; - struct page *page; + struct erofs_buf buf; void *kaddr; erofs_blk_t blkaddr; unsigned int ofs; }; -static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) -{ - /* the only user of kunmap() is 'init_inode_xattrs' */ - if (!atomic) - kunmap(it->page); - else - kunmap_atomic(it->kaddr); - - unlock_page(it->page); - put_page(it->page); -} - -static inline void xattr_iter_end_final(struct xattr_iter *it) -{ - if (!it->page) - return; - - xattr_iter_end(it, true); -} - static int init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -43,7 +24,6 @@ static int init_inode_xattrs(struct inode *inode) struct erofs_xattr_ibody_header *ih; struct super_block *sb; struct erofs_sb_info *sbi; - bool atomic_map; int ret = 0; /* the most case is that xattrs of this inode are initialized. 
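On the erofs_shrink_workstation() change above: the walk now runs under xa_lock, pairing with the __xa_erase() in erofs_try_to_release_workgroup(), and the lock is dropped between released groups; xa_for_each() is restartable, so re-taking the lock and continuing from the stored index is safe. A skeletal version of the pattern with a hypothetical my_try_release():

static unsigned long my_shrink(struct xarray *xa, unsigned long nr_shrink)
{
	unsigned long index, freed = 0;
	void *entry;

	xa_lock(xa);
	xa_for_each(xa, index, entry) {
		/* must use __xa_erase() inside: xa_lock is already held */
		if (!my_try_release(xa, entry))
			continue;
		xa_unlock(xa);
		++freed;
		if (!--nr_shrink)
			return freed;
		xa_lock(xa);	/* resume scanning after 'index' */
	}
	xa_unlock(xa);
	return freed;
}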
*/ @@ -91,26 +71,23 @@ static int init_inode_xattrs(struct inode *inode) sb = inode->i_sb; sbi = EROFS_SB(sb); + it.buf = __EROFS_BUF_INITIALIZER; it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); - it.page = erofs_get_meta_page(sb, it.blkaddr); - if (IS_ERR(it.page)) { - ret = PTR_ERR(it.page); + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + ret = PTR_ERR(it.kaddr); goto out_unlock; } - /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = kmap(it.page); - atomic_map = false; - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); - vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); if (!vi->xattr_shared_xattrs) { - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); ret = -ENOMEM; goto out_unlock; } @@ -122,25 +99,22 @@ static int init_inode_xattrs(struct inode *inode) if (it.ofs >= EROFS_BLKSIZ) { /* cannot be unaligned */ DBG_BUGON(it.ofs != EROFS_BLKSIZ); - xattr_iter_end(&it, atomic_map); - it.page = erofs_get_meta_page(sb, ++it.blkaddr); - if (IS_ERR(it.page)) { + it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.page); + ret = PTR_ERR(it.kaddr); goto out_unlock; } - - it.kaddr = kmap_atomic(it.page); - atomic_map = true; it.ofs = 0; } vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); it.ofs += sizeof(__le32); } - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); /* paired with smp_mb() at the beginning of the function. 
*/ smp_mb(); @@ -172,19 +146,11 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) if (it->ofs < EROFS_BLKSIZ) return 0; - xattr_iter_end(it, true); - it->blkaddr += erofs_blknr(it->ofs); - - it->page = erofs_get_meta_page(it->sb, it->blkaddr); - if (IS_ERR(it->page)) { - int err = PTR_ERR(it->page); - - it->page = NULL; - return err; - } - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); return 0; } @@ -207,11 +173,10 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); - if (IS_ERR(it->page)) - return PTR_ERR(it->page); - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; } @@ -272,7 +237,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, entry.e_name_len - processed); /* handle name */ @@ -307,7 +272,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; @@ -386,8 +351,6 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) if (ret != -ENOATTR) break; } - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_size; } @@ -404,26 +367,16 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); if (ret != -ENOATTR) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_size; } @@ -452,10 +405,11 @@ int erofs_getxattr(struct inode *inode, int index, return ret; it.index = index; - it.name.len = strlen(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; + + it.it.buf = __EROFS_BUF_INITIALIZER; it.name.name = name; it.buffer = buffer; @@ -465,6 +419,7 @@ int erofs_getxattr(struct inode *inode, int index, ret = inline_getxattr(inode, &it); if (ret == -ENOATTR) ret = shared_getxattr(inode, &it); + erofs_put_metabuf(&it.it.buf); return ret; } @@ -607,7 +562,6 @@ static int inline_listxattr(struct listxattr_iter *it) if (ret) break; } - xattr_iter_end_final(&it->it); return ret ? 
ret : it->buffer_ofs; } @@ -625,25 +579,16 @@ static int shared_listxattr(struct listxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); if (ret) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_ofs; } @@ -659,6 +604,7 @@ ssize_t erofs_listxattr(struct dentry *dentry, if (ret) return ret; + it.it.buf = __EROFS_BUF_INITIALIZER; it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; @@ -667,9 +613,10 @@ ssize_t erofs_listxattr(struct dentry *dentry, it.it.sb = dentry->d_sb; ret = inline_listxattr(&it); - if (ret < 0 && ret != -ENOATTR) - return ret; - return shared_listxattr(&it); + if (ret >= 0 || ret == -ENOATTR) + ret = shared_listxattr(&it); + erofs_put_metabuf(&it.it.buf); + return ret; } #ifdef CONFIG_EROFS_FS_POSIX_ACL diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 94090c74b3f7..332462c59f11 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -86,4 +86,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #endif #endif - diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 9a249bfc2770..498b7666efe8 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,12 +82,13 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) { + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; - if (pcl->pclusterpages > pcs->maxpages) + if (pclusterpages > pcs->maxpages) continue; kmem_cache_free(pcs->slab, pcl); @@ -298,6 +299,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, container_of(grp, struct z_erofs_pcluster, obj); int i; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. @@ -331,6 +333,7 @@ int erofs_try_to_free_cached_page(struct page *page) if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { unsigned int i; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_pages[i] == page) { WRITE_ONCE(pcl->compressed_pages[i], NULL); @@ -458,6 +461,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { + bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; struct erofs_workgroup *grp; @@ -469,12 +473,12 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : + map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) return PTR_ERR(pcl); atomic_set(&pcl->obj.refcount, 1); - pcl->obj.index = map->m_pa >> PAGE_SHIFT; pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? @@ -494,16 +498,25 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, mutex_init(&cl->lock); DBG_BUGON(!mutex_trylock(&cl->lock)); - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); - if (IS_ERR(grp)) { - err = PTR_ERR(grp); - goto err_out; - } + if (ztailpacking) { + pcl->obj.index = 0; /* which indicates ztailpacking */ + pcl->pageofs_in = erofs_blkoff(map->m_pa); + pcl->tailpacking_size = map->m_plen; + } else { + pcl->obj.index = map->m_pa >> PAGE_SHIFT; - if (grp != &pcl->obj) { - clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); - err = -EEXIST; - goto err_out; + grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + clt->pcl = container_of(grp, + struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; + } } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -532,17 +545,20 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (!PAGE_ALIGNED(map->m_pa)) { - DBG_BUGON(1); - return -EINVAL; + if (map->m_flags & EROFS_MAP_META) { + if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + goto tailpacking; } grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { +tailpacking: ret = z_erofs_register_collection(clt, inode, map); - if (!ret) goto out; if (ret != -EEXIST) @@ -558,9 +574,9 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); - /* since file-backed online pages are traversed in reverse order */ - clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages; + clt->icpage_ptr = clt->pcl->compressed_pages + + z_erofs_pclusterpages(clt->pcl); return 0; } @@ -681,14 +697,31 @@ restart_now: if (err) goto err_out; - /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; + if (z_erofs_is_inline_pcluster(clt->pcl)) { + void *mp; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, + erofs_blknr(map->m_pa), EROFS_NO_KMAP); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + erofs_err(inode->i_sb, + "failed to get inline page, err %d", err); + goto err_out; + } + get_page(fe->map.buf.page); + WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page); + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + } else { + /* preload all compressed pages (can change mode if needed) */ + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, + map->m_la)) + cache_strategy = TRYALLOC; + else + cache_strategy = DONTALLOC; + + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); + } hitted: /* @@ -762,6 +795,21 @@ err_out: goto out; } +static bool 
z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, + unsigned int readahead_pages) +{ + /* auto: enable for readpage, disable for readahead */ + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && + !readahead_pages) + return true; + + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && + (readahead_pages <= sbi->opt.max_sync_decompress_pages)) + return true; + + return false; +} + static void z_erofs_decompressqueue_work(struct work_struct *work); static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, bool sync, int bios) @@ -784,7 +832,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->opt.readahead_sync_decompress = true; + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -827,6 +877,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct z_erofs_pagevec_ctor ctor; unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; @@ -908,15 +959,20 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = false; compressed_pages = pcl->compressed_pages; - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { unsigned int pagenr; page = compressed_pages[i]; - /* all compressed pages ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { if (erofs_page_is_managed(sbi, page)) { if (!PageUptodate(page)) @@ -961,11 +1017,16 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, partial = true; } - inputsize = pcl->pclusterpages * PAGE_SIZE; + if (z_erofs_is_inline_pcluster(pcl)) + inputsize = pcl->tailpacking_size; + else + inputsize = pclusterpages * PAGE_SIZE; + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { .sb = sb, .in = compressed_pages, .out = pages, + .pageofs_in = pcl->pageofs_in, .pageofs_out = cl->pageofs, .inputsize = inputsize, .outputsize = outputsize, @@ -975,17 +1036,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, }, pagepool); out: - /* must handle all compressed pages before ending pages */ - for (i = 0; i < pcl->pclusterpages; ++i) { - page = compressed_pages[i]; - - if (erofs_page_is_managed(sbi, page)) - continue; + /* must handle all compressed pages before actual file pages */ + if (z_erofs_is_inline_pcluster(pcl)) { + page = compressed_pages[0]; + WRITE_ONCE(compressed_pages[0], NULL); + put_page(page); + } else { + for (i = 0; i < pclusterpages; ++i) { + page = compressed_pages[i]; - /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); + if (erofs_page_is_managed(sbi, page)) + continue; - WRITE_ONCE(compressed_pages[i], NULL); + /* recycle all individual short-lived pages */ + (void)z_erofs_put_shortlivedpage(pagepool, page); + WRITE_ONCE(compressed_pages[i], NULL); + } } for (i = 0; i < nr_pages; ++i) { 
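For reference, the decompression setup above picks the compressed input in two ways: an inline (ztailpacking) pcluster carries tailpacking_size bytes inside a single metadata page at pageofs_in, while a regular pcluster spans pclusterpages whole pages. A minimal standalone sketch of that selection, using simplified types rather than the kernel's real struct z_erofs_pcluster:

	struct pcl_sketch {
		unsigned int index;		/* 0 marks an inline pcluster */
		unsigned short pageofs_in;	/* byte offset of inline data */
		unsigned short pclusterpages;	/* page count (regular case) */
		unsigned short tailpacking_size; /* byte count (inline case) */
	};

	/* Mirrors the inputsize computation in z_erofs_decompress_pcluster(). */
	static unsigned int pcl_input_bytes(const struct pcl_sketch *pcl)
	{
		if (!pcl->index)	/* inline: a partial page of data */
			return pcl->tailpacking_size;
		return pcl->pclusterpages * 4096;	/* 4096 stands in for PAGE_SIZE */
	}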
@@ -1271,6 +1337,14 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); + /* close the main owned chain at first */ + owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (z_erofs_is_inline_pcluster(pcl)) { + move_to_bypass_jobqueue(pcl, qtail, owned_head); + continue; + } + /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { .m_pa = blknr_to_addr(pcl->obj.index), @@ -1280,10 +1354,6 @@ static void z_erofs_submit_queue(struct super_block *sb, cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); - do { struct page *page; @@ -1435,6 +1505,7 @@ skip: static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *pagepool = NULL; int err; @@ -1450,14 +1521,13 @@ static int z_erofs_readpage(struct file *file, struct page *page) (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); - if (f.map.mpage) - put_page(f.map.mpage); - + erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); return err; } @@ -1501,10 +1571,8 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); z_erofs_runqueue(inode->i_sb, &f, &pagepool, - sbi->opt.readahead_sync_decompress && - nr_pages <= sbi->opt.max_sync_decompress_pages); - if (f.map.mpage) - put_page(f.map.mpage); + z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 4a69515dea75..e043216b545f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -62,8 +62,16 @@ struct z_erofs_pcluster { /* A: lower limit of decompressed length and if full length or not */ unsigned int length; - /* I: physical cluster size in pages */ - unsigned short pclusterpages; + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; /* I: compression algorithm format */ unsigned char algorithmformat; @@ -94,6 +102,18 @@ struct z_erofs_decompressqueue { } u; }; +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 660489a7fb64..18d7fd1a5064 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,12 +7,17 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> +static int z_erofs_do_map_blocks(struct inode *inode, + struct 
erofs_map_blocks *map, + int flags); + int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && + !erofs_sb_has_ztailpacking(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -30,7 +35,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) struct super_block *const sb = inode->i_sb; int err, headnr; erofs_off_t pos; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; void *kaddr; struct z_erofs_map_header *h; @@ -51,18 +56,18 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - page = erofs_get_meta_page(sb, erofs_blknr(pos)); - if (IS_ERR(page)) { - err = PTR_ERR(page); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), + EROFS_KMAP_ATOMIC); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); goto out_unlock; } - kaddr = kmap_atomic(page); - h = kaddr + erofs_blkoff(pos); vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; @@ -94,13 +99,33 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) err = -EFSCORRUPTED; goto unmap_done; } +unmap_done: + erofs_put_metabuf(&buf); + if (err) + goto out_unlock; + + if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + + if (!map.m_plen || + erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map.m_plen); + err = -EFSCORRUPTED; + } + if (err < 0) + goto out_unlock; + } /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -unmap_done: - kunmap_atomic(kaddr); - unlock_page(page); - put_page(page); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -117,37 +142,18 @@ struct z_erofs_maprecorder { u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; + erofs_off_t nextpackoff; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, erofs_blk_t eblk) { struct super_block *const sb = m->inode->i_sb; - struct erofs_map_blocks *const map = m->map; - struct page *mpage = map->mpage; - - if (mpage) { - if (mpage->index == eblk) { - if (!m->kaddr) - m->kaddr = kmap_atomic(mpage); - return 0; - } - - if (m->kaddr) { - kunmap_atomic(m->kaddr); - m->kaddr = NULL; - } - put_page(mpage); - } - mpage = erofs_get_meta_page(sb, eblk); - if (IS_ERR(mpage)) { - map->mpage = NULL; - return PTR_ERR(mpage); - } - m->kaddr = kmap_atomic(mpage); - unlock_page(mpage); - map->mpage = mpage; + m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, + EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return 0; } @@ -169,6 +175,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, if (err) return err; + m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; di = m->kaddr + erofs_blkoff(pos); @@ -243,12 +250,12 @@ static int get_compacted_la_distance(unsigned int lclusterbits, static int 
unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, - unsigned int eofs, bool lookahead) + erofs_off_t pos, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk; + unsigned int vcnt, base, lo, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; @@ -260,8 +267,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + /* it doesn't equal to round_up(..) */ + m->nextpackoff = round_down(pos, vcnt << amortizedshift) + + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + eofs = erofs_blkoff(pos); base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -399,8 +410,7 @@ out: err = z_erofs_reload_indexes(m, erofs_blknr(pos)); if (err) return err; - return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos), - lookahead); + return unpack_compacted_index(m, amortizedshift, pos, lookahead); } static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, @@ -583,11 +593,12 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) +static int z_erofs_do_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, + int flags) { struct erofs_inode *const vi = EROFS_I(inode); + bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -597,22 +608,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, unsigned long initial_lcn; unsigned long long ofs, end; - trace_z_erofs_map_blocks_iter_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { - map->m_llen = map->m_la + 1 - inode->i_size; - map->m_la = inode->i_size; - map->m_flags = 0; - goto out; - } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - lclusterbits = vi->z_logical_clusterbits; - ofs = map->m_la; + ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? 
inode->i_size - 1 : map->m_la; initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); @@ -620,6 +617,9 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; + if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) + vi->z_idataoff = m.nextpackoff; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -659,11 +659,19 @@ int z_erofs_map_blocks_iter(struct inode *inode, } map->m_llen = end - map->m_la; - map->m_pa = blknr_to_addr(m.pblk); - err = z_erofs_get_extent_compressedlen(&m, initial_lcn); - if (err) - goto out; + if (flags & EROFS_GET_BLOCKS_FINDTAIL) + vi->z_tailextent_headlcn = m.lcn; + if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_META; + map->m_pa = vi->z_idataoff; + map->m_plen = vi->z_idata_size; + } else { + map->m_pa = blknr_to_addr(m.pblk); + err = z_erofs_get_extent_compressedlen(&m, initial_lcn); + if (err) + goto out; + } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; @@ -681,14 +689,38 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_flags |= EROFS_MAP_FULL_MAPPED; } unmap_out: - if (m.kaddr) - kunmap_atomic(m.kaddr); + erofs_unmap_metabuf(&m.map->buf); out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); + return err; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + int err = 0; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (map->m_la >= inode->i_size) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = z_erofs_fill_inode_lazy(inode); + if (err) + goto out; + + err = z_erofs_do_map_blocks(inode, map, flags); +out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ @@ -704,8 +736,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, struct erofs_map_blocks map = { .m_la = offset }; ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); - if (map.mpage) - put_page(map.mpage); + erofs_put_metabuf(&map.buf); if (ret < 0) return ret; diff --git a/fs/exec.c b/fs/exec.c index 537d92c41105..82db656ca709 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ static int de_thread(struct task_struct *tsk) * Kill all other threads in the thread group. */ spin_lock_irq(lock); - if (signal_group_exit(sig)) { + if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. 
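The de_thread() hunk above open-codes what signal_group_exit() used to bundle: the thread group is considered busy when a group exit has begun (SIGNAL_GROUP_EXIT) or another thread already owns an exec (group_exec_task non-NULL). A self-contained sketch of that predicate, with a simplified structure and an illustrative flag value, not the kernel's definitions:

	#include <stdbool.h>

	#define SIGNAL_GROUP_EXIT 0x00000004	/* value is illustrative */

	struct sig_sketch {
		unsigned int flags;
		void *group_exec_task;	/* non-NULL while an exec is in flight */
	};

	/* The open-coded test that replaced signal_group_exit() above. */
	static bool group_action_in_progress(const struct sig_sketch *sig)
	{
		return (sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task;
	}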
@@ -1054,7 +1054,7 @@ static int de_thread(struct task_struct *tsk) return -EAGAIN; } - sig->group_exit_task = tsk; + sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; @@ -1082,7 +1082,7 @@ static int de_thread(struct task_struct *tsk) write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that - * exit_notify() can't miss ->group_exit_task + * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) @@ -1149,7 +1149,7 @@ static int de_thread(struct task_struct *tsk) release_task(leader); } - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: @@ -1162,7 +1162,7 @@ no_thread_group: killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; @@ -1307,6 +1307,8 @@ int begin_new_exec(struct linux_binprm * bprm) */ force_uaccess_begin(); + if (me->flags & PF_KTHREAD) + free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index cc5cffc4a769..03f142307174 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -105,7 +105,7 @@ int exfat_load_bitmap(struct super_block *sb) struct exfat_dentry *ep; struct buffer_head *bh; - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index cb1c0d8c1714..a27b55ec060a 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -64,7 +64,6 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent { int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; unsigned int type, clu_offset, max_dentries; - sector_t sector; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; struct exfat_dentry *ep; @@ -115,7 +114,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, &sector); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; @@ -156,7 +155,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dir_entry->namebuf.lfnbuf_len); brelse(bh); - ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i + 1, &bh); if (!ep) return -EIO; dir_entry->size = @@ -445,7 +444,6 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct timespec64 ts = current_time(inode); - sector_t sector; struct exfat_dentry *ep; struct buffer_head *bh; @@ -453,7 +451,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, /* * We cannot use exfat_get_dentry_set here because file ep is not * initialized yet.

*/ - ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ep) return -EIO; @@ -477,7 +475,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, exfat_update_bh(bh, IS_DIRSYNC(inode)); brelse(bh); - ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh); if (!ep) return -EIO; @@ -496,12 +494,11 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, struct super_block *sb = inode->i_sb; int ret = 0; int i, num_entries; - sector_t sector; u16 chksum; struct exfat_dentry *ep, *fep; struct buffer_head *fbh, *bh; - fep = exfat_get_dentry(sb, p_dir, entry, &fbh, &sector); + fep = exfat_get_dentry(sb, p_dir, entry, &fbh); if (!fep) return -EIO; @@ -509,7 +506,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); for (i = 1; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) { ret = -EIO; goto release_fbh; @@ -531,13 +528,12 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, { struct super_block *sb = inode->i_sb; int i; - sector_t sector; unsigned short *uniname = p_uniname->name; struct exfat_dentry *ep; struct buffer_head *bh; int sync = IS_DIRSYNC(inode); - ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ep) return -EIO; @@ -545,7 +541,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, exfat_update_bh(bh, sync); brelse(bh); - ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh); if (!ep) return -EIO; @@ -555,7 +551,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, brelse(bh); for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) return -EIO; @@ -574,12 +570,11 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, { struct super_block *sb = inode->i_sb; int i; - sector_t sector; struct exfat_dentry *ep; struct buffer_head *bh; for (i = order; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) return -EIO; @@ -656,8 +651,8 @@ static int exfat_walk_fat_chain(struct super_block *sb, return 0; } -int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, - int entry, sector_t *sector, int *offset) +static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset) { int ret; unsigned int off, clu = 0; @@ -717,8 +712,7 @@ static int exfat_dir_readahead(struct super_block *sb, sector_t sec) } struct exfat_dentry *exfat_get_dentry(struct super_block *sb, - struct exfat_chain *p_dir, int entry, struct buffer_head **bh, - sector_t *sector) + struct exfat_chain *p_dir, int entry, struct buffer_head **bh) { unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); int off; @@ -740,8 +734,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb, if (!*bh) return NULL; - if (sector) - *sector = sec; return (struct exfat_dentry *)((*bh)->b_data + off); } @@ -892,7 +884,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->bh[es->num_bh++] = bh; } - /*
validiate cached dentries */ + /* validate cached dentries */ for (i = 1; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) @@ -960,7 +952,7 @@ rewind: if (rewind && dentry == end_eidx) goto not_found; - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; @@ -1145,7 +1137,7 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, struct buffer_head *bh; for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) { - ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL); + ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ext_ep) return -EIO; @@ -1175,7 +1167,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 1d6da61157c9..619e5b4bed10 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -10,7 +10,6 @@ #include <linux/ratelimit.h> #include <linux/nls.h> -#define EXFAT_SUPER_MAGIC 0x2011BAB0UL #define EXFAT_ROOT_INO 1 #define EXFAT_CLUSTERS_UNTRACKED (~0u) @@ -459,11 +458,8 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, int num_entries, unsigned int type, struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); -int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, - int entry, sector_t *sector, int *offset); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, - struct exfat_chain *p_dir, int entry, struct buffer_head **bh, - sector_t *sector); + struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index e949e563443c..a3464e56a7e1 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -84,9 +84,7 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc, static inline bool is_valid_cluster(struct exfat_sb_info *sbi, unsigned int clus) { - if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus) - return false; - return true; + return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters; } int exfat_ent_get(struct super_block *sb, unsigned int loc, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 6af0191b648f..d890fd34bb2d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -110,8 +110,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_set_volume_dirty(sb); num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); - num_clusters_phys = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi); + num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); @@ -228,12 +227,13 @@ void exfat_truncate(struct inode *inode, loff_t size) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int blocksize = i_blocksize(inode); loff_t aligned_size; int err; mutex_lock(&sbi->s_lock); - if 
(EXFAT_I(inode)->start_clu == 0) { + if (ei->start_clu == 0) { /* * Empty start_clu != ~0 (not allocated) */ @@ -251,8 +251,8 @@ void exfat_truncate(struct inode *inode, loff_t size) else mark_inode_dirty(inode); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; write_size: aligned_size = i_size_read(inode); if (aligned_size & (blocksize - 1)) { @@ -260,11 +260,11 @@ write_size: aligned_size++; } - if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode)) - EXFAT_I(inode)->i_size_ondisk = aligned_size; + if (ei->i_size_ondisk > i_size_read(inode)) + ei->i_size_ondisk = aligned_size; - if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode)) - EXFAT_I(inode)->i_size_aligned = aligned_size; + if (ei->i_size_aligned > i_size_read(inode)) + ei->i_size_aligned = aligned_size; mutex_unlock(&sbi->s_lock); } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1c7aa1ea4724..df805bd05508 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -31,7 +31,7 @@ static int __exfat_write_inode(struct inode *inode, int sync) return 0; /* - * If the indode is already unlinked, there is no need for updating it. + * If the inode is already unlinked, there is no need for updating it. */ if (ei->dir.dir == DIR_DELETED) return 0; @@ -114,10 +114,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters = 0; - if (EXFAT_I(inode)->i_size_ondisk > 0) + if (ei->i_size_ondisk > 0) num_clusters = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, - sbi); + EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); if (clu_offset >= num_clusters) num_to_be_allocated = clu_offset - num_clusters + 1; @@ -416,10 +415,10 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); - if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) { + if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, "invalid size(size(%llu) > aligned(%llu)\n", - i_size_read(inode), EXFAT_I(inode)->i_size_aligned); + i_size_read(inode), ei->i_size_aligned); return -EIO; } @@ -603,8 +602,8 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index d34e6193258d..d5bd8e6d9741 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/buffer_head.h> +#include <linux/blk_types.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -180,7 +181,7 @@ int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync) set_buffer_uptodate(bhs[i]); mark_buffer_dirty(bhs[i]); if (sync) - write_dirty_buffer(bhs[i], 0); + write_dirty_buffer(bhs[i], REQ_SYNC); } for (i = 0; i < nr_bhs && sync; i++) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 24b41103d1cc..af4eb39cc0c3 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -229,7 +229,7 @@ static int exfat_search_empty_slot(struct 
super_block *sb, i = dentry & (dentries_per_clu - 1); for (; i < dentries_per_clu; i++, dentry++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); @@ -306,7 +306,6 @@ static int exfat_find_empty_entry(struct inode *inode, { int dentry; unsigned int ret, last_clu; - sector_t sector; loff_t size = 0; struct exfat_chain clu; struct exfat_dentry *ep = NULL; @@ -379,7 +378,7 @@ static int exfat_find_empty_entry(struct inode *inode, struct buffer_head *bh; ep = exfat_get_dentry(sb, - &(ei->dir), ei->entry + 1, &bh, &sector); + &(ei->dir), ei->entry + 1, &bh); if (!ep) return -EIO; @@ -395,9 +394,9 @@ static int exfat_find_empty_entry(struct inode *inode, /* directory inode should be updated in here */ i_size_write(inode, size); - EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size; - EXFAT_I(inode)->i_size_aligned += sbi->cluster_size; - EXFAT_I(inode)->flags = p_dir->flags; + ei->i_size_ondisk += sbi->cluster_size; + ei->i_size_aligned += sbi->cluster_size; + ei->flags = p_dir->flags; inode->i_blocks += 1 << sbi->sect_per_clus_bits; } @@ -779,7 +778,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct exfat_inode_info *ei = EXFAT_I(inode); struct buffer_head *bh; - sector_t sector; int num_entries, entry, err = 0; mutex_lock(&EXFAT_SB(sb)->s_lock); @@ -791,7 +789,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) goto unlock; } - ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, &cdir, entry, &bh); if (!ep) { err = -EIO; goto unlock; @@ -895,7 +893,7 @@ static int exfat_check_dir_empty(struct super_block *sb, while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); @@ -932,7 +930,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); struct buffer_head *bh; - sector_t sector; int num_entries, entry, err; mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); @@ -957,7 +954,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } - ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, &cdir, entry, &bh); if (!ep) { err = -EIO; goto unlock; @@ -1005,13 +1002,12 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, struct exfat_inode_info *ei) { int ret, num_old_entries, num_new_entries; - sector_t sector_old, sector_new; struct exfat_dentry *epold, *epnew; struct super_block *sb = inode->i_sb; struct buffer_head *new_bh, *old_bh; int sync = IS_DIRSYNC(inode); - epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, &sector_old); + epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh); if (!epold) return -EIO; @@ -1032,8 +1028,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (newentry < 0) return newentry; /* -EIO or -ENOSPC */ - epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh); if (!epnew) return -EIO; @@ -1046,12 +1041,10 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, brelse(old_bh); brelse(new_bh); - epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh, - &sector_old); + epold = exfat_get_dentry(sb, p_dir, oldentry +
1, &old_bh); if (!epold) return -EIO; - epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh); if (!epnew) { brelse(old_bh); return -EIO; @@ -1093,12 +1086,11 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei) { int ret, newentry, num_new_entries, num_old_entries; - sector_t sector_mov, sector_new; struct exfat_dentry *epmov, *epnew; struct super_block *sb = inode->i_sb; struct buffer_head *mov_bh, *new_bh; - epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, &sector_mov); + epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh); if (!epmov) return -EIO; @@ -1116,7 +1108,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, if (newentry < 0) return newentry; /* -EIO or -ENOSPC */ - epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, &sector_new); + epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh); if (!epnew) return -EIO; @@ -1129,12 +1121,10 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, brelse(mov_bh); brelse(new_bh); - epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh, - &sector_mov); + epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh); if (!epmov) return -EIO; - epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh); if (!epnew) { brelse(mov_bh); return -EIO; @@ -1216,7 +1206,7 @@ static int __exfat_rename(struct inode *old_parent_inode, exfat_chain_dup(&olddir, &ei->dir); dentry = ei->entry; - ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL); + ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); if (!ep) { ret = -EIO; goto out; @@ -1237,7 +1227,7 @@ static int __exfat_rename(struct inode *old_parent_inode, p_dir = &(new_ei->dir); new_entry = new_ei->entry; - ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); if (!ep) goto out; @@ -1277,7 +1267,7 @@ static int __exfat_rename(struct inode *old_parent_inode, if (!ret && new_inode) { /* delete entries of new_dir */ - ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); if (!ep) { ret = -EIO; goto del_out; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 314d5407a1be..ef115e673406 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -761,7 +761,7 @@ int exfat_create_upcase_table(struct super_block *sb) while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < sbi->dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 5539ffc20d16..8c9fb7dcec16 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -17,6 +17,7 @@ #include <linux/iversion.h> #include <linux/nls.h> #include <linux/buffer_head.h> +#include <linux/magic.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -364,11 +365,11 @@ static int exfat_read_root(struct inode *inode) inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) - & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; - EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; - EXFAT_I(inode)->i_size_aligned = i_size_read(inode); - EXFAT_I(inode)->i_size_ondisk = i_size_read(inode); + inode->i_blocks =
round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; + ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; + ei->i_size_aligned = i_size_read(inode); + ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3be9dd6412b7..d4f306aa5ace 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,6 +118,7 @@ struct ext2_sb_info { spinlock_t s_lock; struct mb_cache *s_ea_block_cache; struct dax_device *s_daxdev; + u64 s_dax_part_off; }; static inline spinlock_t * diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 333fa62661d5..602578b72d8c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> +#include <linux/dax.h> #include "ext2.h" #include "acl.h" #include "xattr.h" @@ -816,9 +817,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; - iomap->dax_dev = sbi->s_daxdev; + if (flags & IOMAP_DAX) + iomap->dax_dev = sbi->s_daxdev; + else + iomap->bdev = inode->i_sb->s_bdev; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -827,6 +830,8 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } else { iomap->type = IOMAP_MAPPED; iomap->addr = (u64)bno << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += sbi->s_dax_part_off; iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } @@ -1297,9 +1302,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) { - error = iomap_zero_range(inode, newsize, - PAGE_ALIGN(newsize) - newsize, NULL, - &ext2_iomap_ops); + error = dax_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d8d580b609ba..94f1fbd7d3ac 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -802,7 +802,6 @@ static unsigned long descriptor_loc(struct super_block *sb, static int ext2_fill_super(struct super_block *sb, void *data, int silent) { - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; @@ -822,17 +821,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto failed; + return -ENOMEM; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - goto failed; + return -ENOMEM; } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = dax_dev; + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -946,11 +945,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { - if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, - bdev_nr_sectors(sb->s_bdev))) { + if (!sbi->s_daxdev) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(sbi->s_mount_opt, DAX); + } else if (blocksize != PAGE_SIZE) { + ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + clear_opt(sbi->s_mount_opt, DAX); } } @@ -1199,11 +1200,10 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: + fs_put_dax(sbi->s_daxdev); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); -failed: - fs_put_dax(dax_dev); return ret; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0613dfcbfd4a..5a35768d6149 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -246,7 +246,6 @@ retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); - ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); @@ -264,7 +263,6 @@ retry: } out_stop: ext4_journal_stop(handle); - ext4_fc_stop_update(inode); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 74b172a4adda..a6bb86f52b9a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -303,7 +303,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) goto done; brelse(bh); bh = NULL; - offset = 0; } done: err = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 404dd50856e5..71a3cdceaa03 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1298,6 +1298,8 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 +#define EXT4_LABEL_MAX 16 + /* * Structure of the super block */ @@ -1347,7 +1349,7 @@ struct ext4_super_block { /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[16]; /* volume name */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* @@ -1661,7 +1663,7 @@ struct ext4_sb_info { struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ - atomic_t s_last_trim_minblks; + unsigned long s_last_trim_minblks; /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; @@ -1697,6 +1699,7 @@ struct ext4_sb_info { */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; + u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif @@ -1725,9 +1728,9 @@ struct ext4_sb_info { */ struct work_struct s_error_work; - /* Ext4 fast commit stuff */ + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; - atomic_t s_fc_ineligible_updates; + /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. 
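The staging-queue comment above describes a common two-list scheme: once a commit starts, the main list is frozen and later updates accumulate on a staging list that is spliced back afterwards. A generic sketch of the pattern, illustrative only and not ext4's actual queue handling:

	#include <stdbool.h>
	#include <stddef.h>

	struct upd_node { struct upd_node *next; };

	struct upd_queues {
		struct upd_node *main;		/* frozen during a commit */
		struct upd_node *staging;	/* new updates land here */
		bool committing;
	};

	/* Writers pick a list depending on whether a commit is running. */
	static void upd_add(struct upd_queues *q, struct upd_node *n)
	{
		struct upd_node **head = q->committing ? &q->staging : &q->main;
		n->next = *head;
		*head = n;
	}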
@@ -1747,7 +1750,6 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; - u64 s_fc_avg_commit_time; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2399,8 +2401,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); + BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); #if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); @@ -2926,8 +2927,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); -void ext4_fc_start_ineligible(struct super_block *sb, int reason); -void ext4_fc_stop_ineligible(struct super_block *sb); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); @@ -2935,6 +2934,7 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); +void ext4_fc_destroy_dentry_cache(void); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; @@ -3096,6 +3096,9 @@ extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); +extern unsigned int ext4_list_backups(struct super_block *sb, + unsigned int *three, unsigned int *five, + unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, @@ -3110,6 +3113,8 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); +extern __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6def7339056d..3477a16d08ae 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -162,6 +162,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, { if (!ext4_handle_valid(handle)) return 0; + if (is_handle_aborted(handle)) + return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ecf819bf189..74c91da585d7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -27,8 +27,8 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> -#include <linux/backing-dev.h> #include <linux/iomap.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -1496,8 +1496,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, - EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } @@ -2025,7 +2024,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@ -2054,7 +2052,6 @@ prepend: + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@ -4407,8 +4404,7 @@ retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@ -4416,8 +4412,7 @@ retry: retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; @@ -4647,8 +4642,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, - (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) @@ -4697,8 +4690,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - ext4_fc_start_update(inode); - if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(inode, offset, len); goto exit; @@ -4762,7 +4753,6 @@ out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: - ext4_fc_stop_update(inode); return ret; } @@ -5344,7 +5334,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5383,7 +5373,6 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@ -5485,7 +5474,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@ -5560,7 +5549,6 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0f32b445582a..5ae8026a0c56 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -65,21 +65,11 @@ * * Fast Commit Ineligibility * ------------------------- - * Not all operations are supported by fast commits today (e.g extended - * attributes). 
Fast commit ineligibility is marked by calling one of the - * two following functions: - * - * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall - * back to full commit. This is useful in case of transient errors. * - * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all - * the fast commits happening between ext4_fc_start_ineligible() and - * ext4_fc_stop_ineligible() and one fast commit after the call to - * ext4_fc_stop_ineligible() to fall back to full commits. It is important to - * make one more fast commit to fall back to full commit after stop call so - * that it guaranteed that the fast commit ineligible operation contained - * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is - * followed by at least 1 full commit. + * Not all operations are supported by fast commits today (e.g extended + * attributes). Fast commit ineligibility is marked by calling + * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back + * to full commit. * * Atomicity of commits * -------------------- @@ -166,15 +156,13 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Make fast commit atomic updates more fine grained. Today, a fast commit - * eligible update must be protected within ext4_fc_start_update() and - * ext4_fc_stop_update(). These routines are called at much higher - * routines. This can be made more fine grained by combining with - * ext4_journal_start(). + * 1) Fast commit's commit path locks the entire file system during fast + * commit. This has significant performance penalty. Instead of that, we + * should use ext4_fc_start/stop_update functions to start inode level + * updates from ext4_journal_start/stop. Once we do that we can drop file + * system locking during commit path. * - * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - * - * 3) Handle more ineligible cases. + * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> @@ -329,44 +317,6 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) } /* - * Start a fast commit ineligible update. Any commits that happen while - * such an operation is in progress fall back to full commits. - */ -void ext4_fc_start_ineligible(struct super_block *sb, int reason) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - WARN_ON(reason >= EXT4_FC_REASON_MAX); - sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; - atomic_inc(&sbi->s_fc_ineligible_updates); -} - -/* - * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here - * to ensure that after stopping the ineligible update, at least one full - * commit takes place. - */ -void ext4_fc_stop_ineligible(struct super_block *sb) -{ - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); -} - -static inline int ext4_fc_is_ineligible(struct super_block *sb) -{ - return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); -} - -/* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. 
If we have already been called after a full @@ -391,7 +341,7 @@ static int ext4_fc_track_template( (sbi->s_mount_state & EXT4_FC_REPLAY)) return -EOPNOTSUPP; - if (ext4_fc_is_ineligible(inode->i_sb)) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return -EINVAL; tid = handle->h_transaction->t_tid; @@ -796,7 +746,6 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); - dst += dlen; return true; } @@ -1123,6 +1072,32 @@ out: return ret; } +static void ext4_fc_update_stats(struct super_block *sb, int status, + u64 commit_time, int nblks) +{ + struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; + + jbd_debug(1, "Fast commit ended with status = %d", status); + if (status == EXT4_FC_STATUS_OK) { + stats->fc_num_commits++; + stats->fc_numblks += nblks; + if (likely(stats->s_fc_avg_commit_time)) + stats->s_fc_avg_commit_time = + (commit_time + + stats->s_fc_avg_commit_time * 3) / 4; + else + stats->s_fc_avg_commit_time = commit_time; + } else if (status == EXT4_FC_STATUS_FAILED || + status == EXT4_FC_STATUS_INELIGIBLE) { + if (status == EXT4_FC_STATUS_FAILED) + stats->fc_failed_commits++; + stats->fc_ineligible_commits++; + } else { + stats->fc_skipped_commits++; + } + trace_ext4_fc_commit_stop(sb, nblks, status); +} + /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit @@ -1135,18 +1110,15 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); - int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; + int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; trace_ext4_fc_commit_start(sb); start_time = ktime_get(); - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (ext4_fc_is_ineligible(sb))) { - reason = EXT4_FC_REASON_INELIGIBLE; - goto out; - } + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return jbd2_complete_transaction(journal, commit_tid); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1155,67 +1127,52 @@ restart_fc: if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; - reason = EXT4_FC_REASON_ALREADY_COMMITTED; - goto out; + ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); + return 0; } else if (ret) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_START_FAILED; - goto out; + /* + * Commit couldn't start. Just update stats and perform a + * full commit. + */ + ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); + return jbd2_complete_transaction(journal, commit_tid); + } + + /* + * After establishing journal barrier via jbd2_fc_begin_commit(), check + * if we are fast commit ineligible. 
+ */ + if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { + status = EXT4_FC_STATUS_INELIGIBLE; + goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } atomic_inc(&sbi->s_fc_subtid); - jbd2_fc_end_commit(journal); -out: - /* Has any ineligible update happened since we started? */ - if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_INELIGIBLE; - } - - spin_lock(&sbi->s_fc_lock); - if (reason != EXT4_FC_REASON_OK && - reason != EXT4_FC_REASON_ALREADY_COMMITTED) { - sbi->s_fc_stats.fc_ineligible_commits++; - } else { - sbi->s_fc_stats.fc_num_commits++; - sbi->s_fc_stats.fc_numblks += nblks; - } - spin_unlock(&sbi->s_fc_lock); - nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0; - trace_ext4_fc_commit_stop(sb, nblks, reason); - commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ret = jbd2_fc_end_commit(journal); /* - * weight the commit time higher than the average time so we don't - * react too strongly to vast changes in the commit time + * weight the commit time higher than the average time so we + * don't react too strongly to vast changes in the commit time */ - if (likely(sbi->s_fc_avg_commit_time)) - sbi->s_fc_avg_commit_time = (commit_time + - sbi->s_fc_avg_commit_time * 3) / 4; - else - sbi->s_fc_avg_commit_time = commit_time; - jbd_debug(1, - "Fast commit ended with blks = %d, reason = %d, subtid - %d", - nblks, reason, subtid); - if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal); - if (reason == EXT4_FC_REASON_FC_START_FAILED || - reason == EXT4_FC_REASON_INELIGIBLE) - return jbd2_complete_transaction(journal, commit_tid); - return 0; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ext4_fc_update_stats(sb, status, commit_time, nblks); + return ret; + +fallback: + ret = jbd2_fc_end_commit_fallback(journal); + ext4_fc_update_stats(sb, status, 0, 0); + return ret; } /* @@ -1812,11 +1769,14 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } } - ret = ext4_punch_hole(inode, - le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits, - le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits); - if (ret) - jbd_debug(1, "ext4_punch_hole returned %d", ret); + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_remove_space(inode, lrange.fc_lblk, + lrange.fc_lblk + lrange.fc_len - 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (ret) { + iput(inode); + return 0; + } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); @@ -2173,7 +2133,7 @@ int ext4_fc_info_show(struct seq_file *seq, void *v) "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, - div_u64(sbi->s_fc_avg_commit_time, 1000)); + div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < 
EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], @@ -2192,3 +2152,8 @@ int __init ext4_fc_init_dentry_cache(void) return 0; } + +void ext4_fc_destroy_dentry_cache(void) +{ + kmem_cache_destroy(ext4_fc_dentry_cachep); +} diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 937c381b4c85..083ad1cb705a 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -71,21 +71,19 @@ struct ext4_fc_tail { }; /* - * Fast commit reason codes + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: */ enum { - /* - * Commit status codes: - */ - EXT4_FC_REASON_OK = 0, - EXT4_FC_REASON_INELIGIBLE, - EXT4_FC_REASON_ALREADY_COMMITTED, - EXT4_FC_REASON_FC_START_FAILED, - EXT4_FC_REASON_FC_FAILED, - - /* - * Fast commit ineligiblity reasons: - */ EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, @@ -117,7 +115,10 @@ struct ext4_fc_stats { unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; unsigned long fc_num_commits; unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; unsigned long fc_numblks; + u64 s_fc_avg_commit_time; }; #define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4c5f41052351..8cc11715518a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -259,7 +259,6 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - ext4_fc_start_update(inode); inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) @@ -271,7 +270,6 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, out: inode_unlock(inode); - ext4_fc_stop_update(inode); if (likely(ret > 0)) { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); @@ -552,9 +550,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - ext4_fc_start_update(inode); ret = ext4_orphan_add(handle, inode); - ext4_fc_stop_update(inode); if (ret) { ext4_journal_stop(handle); goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 39a1ab129fdc..635bcf68a67e 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,7 +7,7 @@ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/iversion.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -1929,8 +1929,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) retry: err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bfd3545f1e5d..5f79d265d06a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,7 @@ #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> +#include <linux/dax.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -741,10 +742,11 @@ out_sem: if (ret) return ret; } - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); } - + if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || + map->m_flags & EXT4_MAP_MAPPED)) + ext4_fc_track_range(handle, inode, map->m_lblk, + map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; @@ -1844,30 +1846,16 @@ int 
ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int bget_one(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, unsigned int len) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; int ret = 0, err = 0; int inline_data = ext4_has_inline_data(inode); struct buffer_head *inode_bh = NULL; + loff_t size; ClearPageChecked(page); @@ -1877,14 +1865,6 @@ static int __ext4_journalled_writepage(struct page *page, inode_bh = ext4_journalled_write_inline_data(inode, len, page); if (inode_bh == NULL) goto out; - } else { - page_bufs = page_buffers(page); - if (!page_bufs) { - BUG(); - goto out; - } - ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, bget_one); } /* * We need to release the page lock before we start the @@ -1905,7 +1885,8 @@ static int __ext4_journalled_writepage(struct page *page, lock_page(page); put_page(page); - if (page->mapping != mapping) { + size = i_size_read(inode); + if (page->mapping != mapping || page_offset(page) > size) { /* The page got truncated from under us */ ext4_journal_stop(handle); ret = 0; @@ -1915,6 +1896,13 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { + struct buffer_head *page_bufs = page_buffers(page); + + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); @@ -1935,9 +1923,6 @@ static int __ext4_journalled_writepage(struct page *page, out: unlock_page(page); out_no_pagelock: - if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, - NULL, bput_one); brelse(inode_bh); return ret; } @@ -2257,7 +2242,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; - io_end_size = 0; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) @@ -2282,7 +2266,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; - io_end_size = 0; *map_bh = false; out: *m_lblk = lblk; @@ -3271,7 +3254,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, - loff_t length) + loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; @@ -3288,8 +3271,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + if (flags & IOMAP_DAX) + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + else + iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; @@ -3309,9 +3294,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += 
EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3348,8 +3337,8 @@ retry: * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ - WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); - if (IS_DAX(inode)) + WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); + if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback @@ -3420,7 +3409,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; out: - ext4_set_iomap(inode, iomap, &map, offset, length); + ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } @@ -3540,7 +3529,7 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, delalloc = ext4_iomap_is_delalloc(inode, &map); set_iomap: - ext4_set_iomap(inode, iomap, &map, offset, length); + ext4_set_iomap(inode, iomap, &map, offset, length, flags); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; @@ -3780,8 +3769,8 @@ static int ext4_block_zero_page_range(handle_t *handle, length = max; if (IS_DAX(inode)) { - return iomap_zero_range(inode, from, length, NULL, - &ext4_iomap_ops); + return dax_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4523,7 +4512,7 @@ has_buffer: static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { - ext4_fsblk_t err_blk; + ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, @@ -4538,7 +4527,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode, int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { - ext4_fsblk_t err_blk; + ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, @@ -5320,7 +5309,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; } - ext4_fc_start_update(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; @@ -5344,7 +5333,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) { ext4_journal_stop(handle); - ext4_fc_stop_update(inode); return error; } /* Update corresponding info in inode so that everything is in @@ -5356,7 +5344,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { - ext4_fc_stop_update(inode); return error; } } @@ -5370,12 +5357,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { - ext4_fc_stop_update(inode); return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { - ext4_fc_stop_update(inode); return -EINVAL; } @@ -5427,8 +5412,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, - (oldsize > 0 ? 
oldsize - 1 : 0) >> - inode->i_sb->s_blocksize_bits); + EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, @@ -5499,7 +5483,6 @@ err_out: ext4_std_error(inode->i_sb, error); if (!error) error = rc; - ext4_fc_stop_update(inode); return error; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 606dee9e08a3..bbbedf27b71c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,6 +27,248 @@ #include "fsmap.h" #include <trace/events/ext4.h> +typedef void ext4_update_sb_callback(struct ext4_super_block *es, + const void *arg); + +/* + * Superblock modification callback function for changing file system + * label + */ +static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +{ + /* Sanity check, this should never happen */ + BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); + + memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); +} + +static +int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, + ext4_update_sb_callback func, + const void *arg) +{ + int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = sbi->s_sbh; + struct ext4_super_block *es = sbi->s_es; + + trace_ext4_update_sb(sb, bh->b_blocknr, 1); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + + lock_buffer(bh); + func(es, arg); + ext4_superblock_csum_set(sb); + unlock_buffer(bh); + + if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_err; + err = sync_dirty_buffer(bh); +out_err: + ext4_std_error(sb, err); + return err; +} + +/* + * Update one backup superblock in the group 'grp' using the callback + * function 'func' and argument 'arg'. If the handle is NULL the + * modification is not journalled. + * + * Returns: 0 when no modification was done (no superblock in the group) + * 1 when the modification was successful + * <0 on error + */ +static int ext4_update_backup_sb(struct super_block *sb, + handle_t *handle, ext4_group_t grp, + ext4_update_sb_callback func, const void *arg) +{ + int err = 0; + ext4_fsblk_t sb_block; + struct buffer_head *bh; + unsigned long offset = 0; + struct ext4_super_block *es; + + if (!ext4_bg_has_super(sb, grp)) + return 0; + + /* + * For the group 0 there is always 1k padding, so we have + * either adjust offset, or sb_block depending on blocksize + */ + if (grp == 0) { + sb_block = 1 * EXT4_MIN_BLOCK_SIZE; + offset = do_div(sb_block, sb->s_blocksize); + } else { + sb_block = ext4_group_first_block_no(sb, grp); + offset = 0; + } + + trace_ext4_update_sb(sb, sb_block, handle ? 
1 : 0); + + bh = ext4_sb_bread(sb, sb_block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_bh; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + lock_buffer(bh); + if (ext4_has_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " + "superblock %llu\n", sb_block); + unlock_buffer(bh); + err = -EFSBADCRC; + goto out_bh; + } + func(es, arg); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if (err) + goto out_bh; + + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_bh; + } else { + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + } + err = sync_dirty_buffer(bh); + +out_bh: + brelse(bh); + ext4_std_error(sb, err); + return (err) ? err : 1; +} + +/* + * Update primary and backup superblocks using the provided function + * func and argument arg. + * + * Only the primary superblock and at most two backup superblock + * modifications are journalled; the rest is modified without journal. + * This is safe because e2fsck will re-write them if there is a problem, + * and we're very unlikely to ever need more than two backups. + */ +static +int ext4_update_superblocks_fn(struct super_block *sb, + ext4_update_sb_callback func, + const void *arg) +{ + handle_t *handle; + ext4_group_t ngroups; + unsigned int three = 1; + unsigned int five = 5; + unsigned int seven = 7; + int err = 0, ret, i; + ext4_group_t grp, primary_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * We can't update superblocks while the online resize is running + */ + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, "Can't modify superblock while" + "performing online resize"); + return -EBUSY; + } + + /* + * We're only going to update primary superblock and two + * backup superblocks in this transaction. + */ + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + /* Update primary superblock */ + err = ext4_update_primary_sb(sb, handle, func, arg); + if (err) { + ext4_msg(sb, KERN_ERR, "Failed to update primary " + "superblock"); + goto out_journal; + } + + primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); + ngroups = ext4_get_groups_count(sb); + + /* + * Update backup superblocks. We have to start from group 0 + * because it might not be where the primary superblock is + * if the fs is mounted with -o sb=<backup_sb_block> + */ + i = 0; + grp = 0; + while (grp < ngroups) { + /* Skip primary superblock */ + if (grp == primary_grp) + goto next_grp; + + ret = ext4_update_backup_sb(sb, handle, grp, func, arg); + if (ret < 0) { + /* Ignore bad checksum; try to update next sb */ + if (ret == -EFSBADCRC) + goto next_grp; + err = ret; + goto out_journal; + } + + i += ret; + if (handle && i > 1) { + /* + * We're only journalling primary superblock and + * two backup superblocks; the rest is not + * journalled. 
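To make the group-0 arithmetic in ext4_update_backup_sb() above concrete (a worked example, not part of the patch): do_div() divides its 64-bit first argument in place and returns the remainder, so the traditional 1 KiB boot-block padding resolves differently per block size:

	u64 sb_block = 1024;			/* 1 * EXT4_MIN_BLOCK_SIZE */
	u32 offset = do_div(sb_block, 4096);	/* 4k fs: block 0, offset 1024 */
	/* with 1k blocks: do_div(1024, 1024) -> block 1, offset 0 */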
+ */ + err = ext4_journal_stop(handle); + if (err) + goto out; + handle = NULL; + } +next_grp: + grp = ext4_list_backups(sb, &three, &five, &seven); + } + +out_journal: + if (handle) { + ret = ext4_journal_stop(handle); + if (ret && !err) + err = ret; + } +out: + clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); + smp_mb__after_atomic(); + return err ? err : 0; +} + /** * Swap memory between @a and @b for @len bytes. * @@ -169,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -252,7 +494,6 @@ revert: err_out1: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); ext4_double_up_write_data_sem(inode, inode_bl); err_out: @@ -743,7 +984,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, u32 flags = fa->flags; int err = -EOPNOTSUPP; - ext4_fc_start_update(inode); if (flags & ~EXT4_FL_USER_VISIBLE) goto out; @@ -764,7 +1004,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, goto out; err = ext4_ioctl_setproject(inode, fa->fsx_projid); out: - ext4_fc_stop_update(inode); return err; } @@ -850,6 +1089,64 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) return err; } +static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) +{ + size_t len; + int ret = 0; + char new_label[EXT4_LABEL_MAX + 1]; + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Copy the maximum length allowed for ext4 label with one more to + * find the required terminating null byte in order to test the + * label length. The on disk label doesn't need to be null terminated. + */ + if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) + return -EFAULT; + + len = strnlen(new_label, EXT4_LABEL_MAX + 1); + if (len > EXT4_LABEL_MAX) + return -EINVAL; + + /* + * Clear the buffer after the new label + */ + memset(new_label + len, 0, EXT4_LABEL_MAX - len); + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); + + mnt_drop_write_file(filp); + return ret; +} + +static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) +{ + char label[EXT4_LABEL_MAX + 1]; + + /* + * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because + * FSLABEL_MAX must include terminating null byte, while s_volume_name + * does not have to. 
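FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL are the generic label ioctls declared in <linux/fs.h>, so once this lands the ext4 label becomes reachable from plain user space. A minimal sketch (error handling trimmed; any open file or directory on the filesystem serves as the target):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FS_IOC_GETFSLABEL, FSLABEL_MAX */

	int main(int argc, char **argv)
	{
		char label[FSLABEL_MAX] = "";
		int fd = open(argv[1], O_RDONLY);

		if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) != 0) {
			perror("FS_IOC_GETFSLABEL");
			return 1;
		}
		printf("label: %s\n", label);
		return 0;
	}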
+ */ + BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); + + memset(label, 0, sizeof(label)); + lock_buffer(sbi->s_sbh); + strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + unlock_buffer(sbi->s_sbh); + + if (copy_to_user(user_label, label, sizeof(label))) + return -EFAULT; + return 0; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1117,8 +1414,6 @@ resizefs_out: sizeof(range))) return -EFAULT; - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; @@ -1266,6 +1561,13 @@ resizefs_out: case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); + case FS_IOC_GETFSLABEL: + return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); + + case FS_IOC_SETFSLABEL: + return ext4_ioctl_setlabel(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1273,13 +1575,7 @@ resizefs_out: long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - long ret; - - ext4_fc_start_update(file_inode(filp)); - ret = __ext4_ioctl(filp, cmd, arg); - ext4_fc_stop_update(file_inode(filp)); - - return ret; + return __ext4_ioctl(filp, cmd, arg); } #ifdef CONFIG_COMPAT @@ -1347,6 +1643,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: case EXT4_IOC_CHECKPOINT: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 215b7068f548..cf2fd9fc7d98 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, - ext4_group_t group, int needed) + ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; @@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct list_head list; struct ext4_buddy e4b; int err; - int busy = 0; - int free, free_total = 0; + int free = 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) @@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - if (needed == 0) - needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; - INIT_LIST_HEAD(&list); -repeat: - free = 0; ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); - busy = 1; + *busy = 1; continue; } if (pa->pa_deleted) { @@ -4898,22 +4892,13 @@ repeat: call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - free_total += free; - - /* if we still need more blocks and some PAs were used, try again */ - if (free_total < needed && busy) { - ext4_unlock_group(sb, group); - cond_resched(); - busy = 0; - goto repeat; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", - free_total, group, grp->bb_free); - return free_total; + free, group, grp->bb_free); + return free; } /* @@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; - int freed = 0; + int freed = 0, busy = 0; + int retry = 
0; trace_ext4_mb_discard_preallocations(sb, needed); + + if (needed == 0) + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; + repeat: for (i = 0; i < ngroups && needed > 0; i++) { - ret = ext4_mb_discard_group_preallocations(sb, i, needed); + ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; + cond_resched(); + } + + if (needed > 0 && busy && ++retry < 3) { + busy = 0; + goto repeat; } return freed; @@ -6373,7 +6369,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); if (ret >= 0) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); @@ -6404,6 +6400,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -6422,6 +6419,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; + /* No point in trying to trim less than the discard granularity */ + if (range->minlen < q->limits.discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + q->limits.discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) @@ -6474,7 +6478,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } if (!ret) - atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7e0b4f81c6c0..ff8916e1d38e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -437,12 +437,12 @@ int ext4_ext_migrate(struct inode *inode) percpu_down_write(&sbi->s_writepages_rwsem); /* - * Worst case we can touch the allocation bitmaps, a bgd - * block, and a block to link in the orphan list. We do need - * need to worry about credits for modifying the quota inode. + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need to worry about + * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, - 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -459,6 +459,13 @@ int ext4_ext_migrate(struct inode *inode) ext4_journal_stop(handle); goto out_unlock; } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). This + * is so that the metadata blocks will have the correct checksum after + * the migration.
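Background for the seed copy that follows (a sketch of the derivation done in __ext4_new_inode(), shown for context; names abbreviated): the per-inode checksum seed folds the inode number and generation into the per-filesystem seed, so extent-tree blocks checksummed under tmp_inode's own seed would fail verification once its blocks are swapped into 'inode':

	/* roughly: seed = crc32c(crc32c(fs_seed, &ino, 4), &gen, 4) */
	u32 seed = crc32c(sbi->s_csum_seed, &inum, sizeof(inum));
	seed = crc32c(seed, &gen, sizeof(gen));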
+ */ + ei = EXT4_I(inode); + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later @@ -467,7 +474,6 @@ int ext4_ext_migrate(struct inode *inode) clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); - ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* @@ -492,17 +498,10 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { - /* - * It is impossible to update on-disk structures without - * a handle, so just rollback in-core changes and live other - * work to orphan_list_cleanup() - */ - ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out_tmp_inode; } - ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 64a579734f93..95aa212f0863 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -632,7 +632,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { if (next_blk == EXT_MAX_BLOCKS) { - o_start = o_end; ret = -ENODATA; goto out; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cb261714991..1d370364230e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { - gfp_flags = GFP_NOFS; + gfp_t new_gfp_flags = GFP_NOFS; if (io->io_bio) ext4_io_submit(io); else - gfp_flags |= __GFP_NOFAIL; - congestion_wait(BLK_RW_ASYNC, HZ/50); + new_gfp_flags |= __GFP_NOFAIL; + memalloc_retry_wait(gfp_flags); + gfp_flags = new_gfp_flags; goto retry_encrypt; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b63cb88ccdae..ee8f02f406cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -717,12 +717,23 @@ out: * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... 
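As a cross-check of the sequence quoted above, the sparse_super backup placement is easy to enumerate in plain C (a user-space sketch, not the kernel helper):

	/* groups holding a backup superblock under sparse_super:
	 * group 1 plus every power of 3, 5 and 7 */
	static int is_backup_group(unsigned long g)
	{
		if (g == 1)
			return 1;
		for (unsigned long b = 3; b <= 7; b += 2)	/* 3, 5, 7 */
			for (unsigned long long p = b; p <= g; p *= b)
				if (p == g)
					return 1;
		return 0;
	}
	/* yields 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ... */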
*/ -static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, - unsigned *five, unsigned *seven) +unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, + unsigned int *five, unsigned int *seven) { - unsigned *min = three; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned int *min = three; int mult = 3; - unsigned ret; + unsigned int ret; + + if (ext4_has_feature_sparse_super2(sb)) { + do { + if (*min > 2) + return UINT_MAX; + ret = le32_to_cpu(es->s_backup_bgs[*min - 1]); + *min += 1; + } while (!ret); + return ret; + } if (!ext4_has_feature_sparse_super(sb)) { ret = *min; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e33b5eca694..db9fe4843529 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -47,6 +47,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -73,12 +75,9 @@ static int ext4_mark_recovery_complete(struct super_block *sb, static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); @@ -86,6 +85,16 @@ static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); +static int ext4_validate_options(struct fs_context *fc); +static int ext4_check_opt_consistency(struct fs_context *fc, + struct super_block *sb); +static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); +static int ext4_get_tree(struct fs_context *fc); +static int ext4_reconfigure(struct fs_context *fc); +static void ext4_fc_free(struct fs_context *fc); +static int ext4_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering @@ -113,13 +122,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * transaction start -> page lock(s) -> i_data_sem (rw) */ +static const struct fs_context_operations ext4_context_ops = { + .parse_param = ext4_parse_param, + .get_tree = ext4_get_tree, + .reconfigure = ext4_reconfigure, + .free = ext4_fc_free, +}; + + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "ext2", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); @@ -130,11 +148,12 @@ MODULE_ALIAS("ext2"); static struct file_system_type ext3_fs_type = { - .owner = THIS_MODULE, - 
.name = "ext3", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "ext3", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); @@ -260,8 +279,8 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -static __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); @@ -912,14 +931,20 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; - atomic_inc(&EXT4_SB(sb)->s_msg_count); - if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) - return; + if (sb) { + atomic_inc(&EXT4_SB(sb)->s_msg_count); + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), + "EXT4-fs")) + return; + } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + if (sb) + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + else + printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } @@ -1647,7 +1672,6 @@ static const struct super_operations ext4_sops = { .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, - .remount_fs = ext4_remount, .show_options = ext4_show_options, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, @@ -1665,7 +1689,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, @@ -1674,152 +1698,169 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, - Opt_nowarn_on_error, Opt_mblk_io_submit, - Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, + Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, + Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - 
{Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_debug, "debug"}, - {Opt_removed, "oldalloc"}, - {Opt_removed, "orlov"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_noload, "norecovery"}, - {Opt_noload, "noload"}, - {Opt_removed, "nobh"}, - {Opt_removed, "bh"}, - {Opt_commit, "commit=%u"}, - {Opt_min_batch_time, "min_batch_time=%u"}, - {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_dev, "journal_dev=%u"}, - {Opt_journal_path, "journal_path=%s"}, - {Opt_journal_checksum, "journal_checksum"}, - {Opt_nojournal_checksum, "nojournal_checksum"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_err_abort, "data_err=abort"}, - {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_grpquota, "grpquota"}, - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_prjquota, "prjquota"}, - {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_i_version, "i_version"}, - {Opt_dax, "dax"}, - {Opt_dax_always, "dax=always"}, - {Opt_dax_inode, "dax=inode"}, - {Opt_dax_never, "dax=never"}, - {Opt_stripe, "stripe=%u"}, - {Opt_delalloc, "delalloc"}, - {Opt_warn_on_error, "warn_on_error"}, - {Opt_nowarn_on_error, "nowarn_on_error"}, - {Opt_lazytime, "lazytime"}, - {Opt_nolazytime, "nolazytime"}, - {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, - {Opt_nodelalloc, "nodelalloc"}, - {Opt_removed, "mblk_io_submit"}, - {Opt_removed, "nomblk_io_submit"}, - {Opt_block_validity, "block_validity"}, - {Opt_noblock_validity, "noblock_validity"}, - {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, - {Opt_journal_ioprio, "journal_ioprio=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, - {Opt_dioread_nolock, "dioread_nolock"}, - {Opt_dioread_lock, "nodioread_nolock"}, - {Opt_dioread_lock, "dioread_lock"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_init_itable, "init_itable=%u"}, - {Opt_init_itable, "init_itable"}, - {Opt_noinit_itable, "noinit_itable"}, -#ifdef CONFIG_EXT4_DEBUG - {Opt_fc_debug_force, "fc_debug_force"}, - {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, -#endif - {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, - {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption"}, - {Opt_inlinecrypt, "inlinecrypt"}, - {Opt_nombcache, "nombcache"}, - {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ - {Opt_removed, "prefetch_block_bitmaps"}, - {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, - {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, - {Opt_removed, "check=none"}, /* mount option from ext2/3 */ - {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ - {Opt_removed, "reservation"}, /* mount option from ext2/3 */ - {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ - {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ - {Opt_err, NULL}, +static const struct constant_table 
ext4_param_errors[] = { + {"continue", EXT4_MOUNT_ERRORS_CONT}, + {"panic", EXT4_MOUNT_ERRORS_PANIC}, + {"remount-ro", EXT4_MOUNT_ERRORS_RO}, + {} }; -static ext4_fsblk_t get_sb_block(void **data) -{ - ext4_fsblk_t sb_block; - char *options = (char *) *data; +static const struct constant_table ext4_param_data[] = { + {"journal", EXT4_MOUNT_JOURNAL_DATA}, + {"ordered", EXT4_MOUNT_ORDERED_DATA}, + {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, + {} +}; - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ +static const struct constant_table ext4_param_data_err[] = { + {"abort", Opt_data_err_abort}, + {"ignore", Opt_data_err_ignore}, + {} +}; - options += 3; - /* TODO: use simple_strtoll with >32bit ext4 */ - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; +static const struct constant_table ext4_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; - return sb_block; -} +static const struct constant_table ext4_param_dax[] = { + {"always", Opt_dax_always}, + {"inode", Opt_dax_inode}, + {"never", Opt_dax_never}, + {} +}; + +/* String parameter that allows empty argument */ +#define fsparam_string_empty(NAME, OPT) \ + __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) + +/* + * Mount option specification + * We don't use fsparam_flag_no because of the way we set the + * options and the way we show them in _ext4_show_options(). To + * keep the changes to a minimum, let's keep the negative options + * separate for now. + */ +static const struct fs_parameter_spec ext4_param_specs[] = { + fsparam_flag ("bsddf", Opt_bsd_df), + fsparam_flag ("minixdf", Opt_minix_df), + fsparam_flag ("grpid", Opt_grpid), + fsparam_flag ("bsdgroups", Opt_grpid), + fsparam_flag ("nogrpid", Opt_nogrpid), + fsparam_flag ("sysvgroups", Opt_nogrpid), + fsparam_u32 ("resgid", Opt_resgid), + fsparam_u32 ("resuid", Opt_resuid), + fsparam_u32 ("sb", Opt_sb), + fsparam_enum ("errors", Opt_errors, ext4_param_errors), + fsparam_flag ("nouid32", Opt_nouid32), + fsparam_flag ("debug", Opt_debug), + fsparam_flag ("oldalloc", Opt_removed), + fsparam_flag ("orlov", Opt_removed), + fsparam_flag ("user_xattr", Opt_user_xattr), + fsparam_flag ("nouser_xattr", Opt_nouser_xattr), + fsparam_flag ("acl", Opt_acl), + fsparam_flag ("noacl", Opt_noacl), + fsparam_flag ("norecovery", Opt_noload), + fsparam_flag ("noload", Opt_noload), + fsparam_flag ("bh", Opt_removed), + fsparam_flag ("nobh", Opt_removed), + fsparam_u32 ("commit", Opt_commit), + fsparam_u32 ("min_batch_time", Opt_min_batch_time), + fsparam_u32 ("max_batch_time", Opt_max_batch_time), + fsparam_u32 ("journal_dev", Opt_journal_dev), + fsparam_bdev ("journal_path", Opt_journal_path), + fsparam_flag ("journal_checksum", Opt_journal_checksum), + fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), + fsparam_flag ("journal_async_commit",Opt_journal_async_commit), + fsparam_flag ("abort", Opt_abort), + fsparam_enum ("data", Opt_data, ext4_param_data), + fsparam_enum ("data_err", Opt_data_err, + ext4_param_data_err), + fsparam_string_empty + ("usrjquota", Opt_usrjquota), + fsparam_string_empty + ("grpjquota", Opt_grpjquota), + fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("noquota", 
Opt_noquota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("prjquota", Opt_prjquota), + fsparam_flag ("barrier", Opt_barrier), + fsparam_u32 ("barrier", Opt_barrier), + fsparam_flag ("nobarrier", Opt_nobarrier), + fsparam_flag ("i_version", Opt_i_version), + fsparam_flag ("dax", Opt_dax), + fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), + fsparam_u32 ("stripe", Opt_stripe), + fsparam_flag ("delalloc", Opt_delalloc), + fsparam_flag ("nodelalloc", Opt_nodelalloc), + fsparam_flag ("warn_on_error", Opt_warn_on_error), + fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), + fsparam_u32 ("debug_want_extra_isize", + Opt_debug_want_extra_isize), + fsparam_flag ("mblk_io_submit", Opt_removed), + fsparam_flag ("nomblk_io_submit", Opt_removed), + fsparam_flag ("block_validity", Opt_block_validity), + fsparam_flag ("noblock_validity", Opt_noblock_validity), + fsparam_u32 ("inode_readahead_blks", + Opt_inode_readahead_blks), + fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), + fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), + fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), + fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), + fsparam_flag ("dioread_nolock", Opt_dioread_nolock), + fsparam_flag ("nodioread_nolock", Opt_dioread_lock), + fsparam_flag ("dioread_lock", Opt_dioread_lock), + fsparam_flag ("discard", Opt_discard), + fsparam_flag ("nodiscard", Opt_nodiscard), + fsparam_u32 ("init_itable", Opt_init_itable), + fsparam_flag ("init_itable", Opt_init_itable), + fsparam_flag ("noinit_itable", Opt_noinit_itable), +#ifdef CONFIG_EXT4_DEBUG + fsparam_flag ("fc_debug_force", Opt_fc_debug_force), + fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), +#endif + fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), + fsparam_flag ("test_dummy_encryption", + Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", + Opt_test_dummy_encryption), + fsparam_flag ("inlinecrypt", Opt_inlinecrypt), + fsparam_flag ("nombcache", Opt_nombcache), + fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ + fsparam_flag ("prefetch_block_bitmaps", + Opt_removed), + fsparam_flag ("no_prefetch_block_bitmaps", + Opt_no_prefetch_block_bitmaps), + fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), + fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ + fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ + {} +}; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define DEFAULT_MB_OPTIMIZE_SCAN (-1) @@ -1828,90 +1869,22 @@ static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; -#ifdef CONFIG_QUOTA -static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname, *old_qname = get_qf_name(sb, sbi, qtype); - int ret = -1; - - if (sb_any_quota_loaded(sb) && !old_qname) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); - return -1; - } - if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_INFO, "Journaled quota options " - "ignored when QUOTA feature is enabled"); - return 1; - } - qname = match_strdup(args); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not 
enough memory for storing quotafile name"); - return -1; - } - if (old_qname) { - if (strcmp(old_qname, qname) == 0) - ret = 1; - else - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", - QTYPE2NAME(qtype)); - goto errout; - } - if (strchr(qname, '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on filesystem root"); - goto errout; - } - rcu_assign_pointer(sbi->s_qf_names[qtype], qname); - set_opt(sb, QUOTA); - return 1; -errout: - kfree(qname); - return ret; -} - -static int clear_qf_name(struct super_block *sb, int qtype) -{ - - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *old_qname = get_qf_name(sb, sbi, qtype); - - if (sb_any_quota_loaded(sb) && old_qname) { - ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" - " when quota turned on"); - return -1; - } - rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); - synchronize_rcu(); - kfree(old_qname); - return 1; -} -#endif - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 -#define MOPT_CLEAR_ERR 0x0010 -#define MOPT_GTE0 0x0020 #ifdef CONFIG_QUOTA #define MOPT_Q 0 -#define MOPT_QFMT 0x0040 +#define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif -#define MOPT_DATAJ 0x0080 -#define MOPT_NO_EXT2 0x0100 -#define MOPT_NO_EXT3 0x0200 +#define MOPT_NO_EXT2 0x0020 +#define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) -#define MOPT_STRING 0x0400 -#define MOPT_SKIP 0x0800 -#define MOPT_2 0x1000 +#define MOPT_SKIP 0x0080 +#define MOPT_2 0x0100 static const struct mount_opts { int token; @@ -1944,40 +1917,17 @@ static const struct mount_opts { EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, - {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2}, - {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2}, + {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, - {Opt_commit, 0, MOPT_GTE0}, - {Opt_max_batch_time, 0, MOPT_GTE0}, - {Opt_min_batch_time, 0, MOPT_GTE0}, - {Opt_inode_readahead_blks, 0, MOPT_GTE0}, - {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, - {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_stripe, 0, MOPT_GTE0}, - {Opt_resuid, 0, MOPT_GTE0}, - {Opt_resgid, 0, MOPT_GTE0}, - {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, - {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, - {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, - {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, - {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, - {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, - MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_dax_type, 0, MOPT_EXT4_ONLY}, + {Opt_journal_dev, 0, MOPT_NO_EXT2}, + {Opt_journal_path, 0, 
MOPT_NO_EXT2}, + {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, + {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1989,7 +1939,6 @@ static const struct mount_opts { #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, - {Opt_debug_want_extra_isize, 0, MOPT_GTE0}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, @@ -2000,23 +1949,15 @@ static const struct mount_opts { {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, - {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, - {Opt_offusrjquota, 0, MOPT_Q}, - {Opt_offgrpjquota, 0, MOPT_Q}, - {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, - {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, - {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, - {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_STRING}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, - {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, - {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif {Opt_err, 0, 0} }; @@ -2025,474 +1966,970 @@ static const struct mount_opts { static const struct ext4_sb_encodings { __u16 magic; char *name; - char *version; + unsigned int version; } ext4_sb_encoding_map[] = { - {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"}, + {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; -static int ext4_sb_read_encoding(const struct ext4_super_block *es, - const struct ext4_sb_encodings **encoding, - __u16 *flags) +static const struct ext4_sb_encodings * +ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) - break; - - if (i >= ARRAY_SIZE(ext4_sb_encoding_map)) - return -EINVAL; + return &ext4_sb_encoding_map[i]; - *encoding = &ext4_sb_encoding_map[i]; - *flags = le16_to_cpu(es->s_encoding_flags); - - return 0; + return NULL; } #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, - const char *opt, - const substring_t *arg, - bool is_remount) +static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) { #ifdef CONFIG_FS_ENCRYPTION struct ext4_sb_info *sbi = EXT4_SB(sb); int err; - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. 
- */ - if (is_remount && !sbi->s_dummy_enc_policy.policy) { - ext4_msg(sb, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); - return -1; - } - err = fscrypt_set_test_dummy_encryption(sb, arg->from, + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_policy); if (err) { - if (err == -EEXIST) - ext4_msg(sb, KERN_WARNING, - "Can't change test_dummy_encryption on remount"); - else if (err == -EINVAL) - ext4_msg(sb, KERN_WARNING, - "Value of option \"%s\" is unrecognized", opt); - else - ext4_msg(sb, KERN_WARNING, - "Error processing option \"%s\" [%d]", - opt, err); - return -1; + ext4_msg(sb, KERN_WARNING, + "Error while setting test dummy encryption [%d]", err); + return err; } ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); #endif - return 1; + return 0; } -struct ext4_parsed_options { - unsigned long journal_devnum; - unsigned int journal_ioprio; - int mb_optimize_scan; +#define EXT4_SPEC_JQUOTA (1 << 0) +#define EXT4_SPEC_JQFMT (1 << 1) +#define EXT4_SPEC_DATAJ (1 << 2) +#define EXT4_SPEC_SB_BLOCK (1 << 3) +#define EXT4_SPEC_JOURNAL_DEV (1 << 4) +#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) +#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6) +#define EXT4_SPEC_s_want_extra_isize (1 << 7) +#define EXT4_SPEC_s_max_batch_time (1 << 8) +#define EXT4_SPEC_s_min_batch_time (1 << 9) +#define EXT4_SPEC_s_inode_readahead_blks (1 << 10) +#define EXT4_SPEC_s_li_wait_mult (1 << 11) +#define EXT4_SPEC_s_max_dir_size_kb (1 << 12) +#define EXT4_SPEC_s_stripe (1 << 13) +#define EXT4_SPEC_s_resuid (1 << 14) +#define EXT4_SPEC_s_resgid (1 << 15) +#define EXT4_SPEC_s_commit_interval (1 << 16) +#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) +#define EXT4_SPEC_s_sb_block (1 << 18) + +struct ext4_fs_context { + char *s_qf_names[EXT4_MAXQUOTAS]; + char *test_dummy_enc_arg; + int s_jquota_fmt; /* Format of quota to use */ + int mb_optimize_scan; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + unsigned short qname_spec; + unsigned long vals_s_flags; /* Bits to set in s_flags */ + unsigned long mask_s_flags; /* Bits changed in s_flags */ + unsigned long journal_devnum; + unsigned long s_commit_interval; + unsigned long s_stripe; + unsigned int s_inode_readahead_blks; + unsigned int s_want_extra_isize; + unsigned int s_li_wait_mult; + unsigned int s_max_dir_size_kb; + unsigned int journal_ioprio; + unsigned int vals_s_mount_opt; + unsigned int mask_s_mount_opt; + unsigned int vals_s_mount_opt2; + unsigned int mask_s_mount_opt2; + unsigned int vals_s_mount_flags; + unsigned int mask_s_mount_flags; + unsigned int opt_flags; /* MOPT flags */ + unsigned int spec; + u32 s_max_batch_time; + u32 s_min_batch_time; + kuid_t s_resuid; + kgid_t s_resgid; + ext4_fsblk_t s_sb_block; }; -static int handle_mount_opt(struct super_block *sb, char *opt, int token, - substring_t *args, struct ext4_parsed_options *parsed_opts, - int is_remount) +static void ext4_fc_free(struct fs_context *fc) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fs_context *ctx = fc->fs_private; + int i; + + if (!ctx) + return; + + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(ctx->s_qf_names[i]); + + kfree(ctx->test_dummy_enc_arg); + kfree(ctx); +} + +int ext4_init_fs_context(struct fs_context *fc) +{ + struct ext4_fs_context *ctx; + + ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &ext4_context_ops; + + return 0; +} + +#ifdef 
+
+#ifdef CONFIG_QUOTA
+/*
+ * Note the name of the specified quota file.
+ */
+static int note_qf_name(struct fs_context *fc, int qtype,
+			struct fs_parameter *param)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	char *qname;
+
+	if (param->size < 1) {
+		ext4_msg(NULL, KERN_ERR, "Missing quota name");
+		return -EINVAL;
+	}
+	if (strchr(param->string, '/')) {
+		ext4_msg(NULL, KERN_ERR,
+			 "quotafile must be on filesystem root");
+		return -EINVAL;
+	}
+	if (ctx->s_qf_names[qtype]) {
+		if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
+			ext4_msg(NULL, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
+	if (!qname) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Not enough memory for storing quotafile name");
+		return -ENOMEM;
+	}
+	ctx->s_qf_names[qtype] = qname;
+	ctx->qname_spec |= 1 << qtype;
+	ctx->spec |= EXT4_SPEC_JQUOTA;
+	return 0;
+}
+
+/*
+ * Clear the name of the specified quota file.
+ */
+static int unnote_qf_name(struct fs_context *fc, int qtype)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+
+	kfree(ctx->s_qf_names[qtype]);
+	ctx->s_qf_names[qtype] = NULL;
+	ctx->qname_spec |= 1 << qtype;
+	ctx->spec |= EXT4_SPEC_JQUOTA;
+	return 0;
+}
+#endif
+
+#define EXT4_SET_CTX(name)						\
+static inline void ctx_set_##name(struct ext4_fs_context *ctx,		\
+				  unsigned long flag)			\
+{									\
+	ctx->mask_s_##name |= flag;					\
+	ctx->vals_s_##name |= flag;					\
+}									\
+static inline void ctx_clear_##name(struct ext4_fs_context *ctx,	\
+				    unsigned long flag)			\
+{									\
+	ctx->mask_s_##name |= flag;					\
+	ctx->vals_s_##name &= ~flag;					\
+}									\
+static inline unsigned long						\
+ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
+{									\
+	return (ctx->vals_s_##name & flag);				\
+}
+
+EXT4_SET_CTX(flags);
+EXT4_SET_CTX(mount_opt);
+EXT4_SET_CTX(mount_opt2);
+EXT4_SET_CTX(mount_flags);
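
For reference, each EXT4_SET_CTX(name) invocation above expands to three small helpers; hand-expanding EXT4_SET_CTX(mount_opt) gives roughly the following (illustrative expansion, not a separate definition in the patch). The mask_ field remembers which bits an option touched while the vals_ field remembers the state they should end up in, which is what lets a later apply or remount act only on the bits that were actually mentioned:

static inline void ctx_set_mount_opt(struct ext4_fs_context *ctx,
				     unsigned long flag)
{
	ctx->mask_s_mount_opt |= flag;	/* this bit was touched ... */
	ctx->vals_s_mount_opt |= flag;	/* ... and should be set */
}
static inline void ctx_clear_mount_opt(struct ext4_fs_context *ctx,
				       unsigned long flag)
{
	ctx->mask_s_mount_opt |= flag;	/* touched ... */
	ctx->vals_s_mount_opt &= ~flag;	/* ... but should end up clear */
}
static inline unsigned long ctx_test_mount_opt(struct ext4_fs_context *ctx,
					       unsigned long flag)
{
	return (ctx->vals_s_mount_opt & flag);
}
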
+
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
 	const struct mount_opts *m;
+	int is_remount;
 	kuid_t uid;
 	kgid_t gid;
-	int arg = 0;
+	int token;

-#ifdef CONFIG_QUOTA
-	if (token == Opt_usrjquota)
-		return set_qf_name(sb, USRQUOTA, &args[0]);
-	else if (token == Opt_grpjquota)
-		return set_qf_name(sb, GRPQUOTA, &args[0]);
-	else if (token == Opt_offusrjquota)
-		return clear_qf_name(sb, USRQUOTA);
-	else if (token == Opt_offgrpjquota)
-		return clear_qf_name(sb, GRPQUOTA);
-#endif
-	switch (token) {
-	case Opt_noacl:
-	case Opt_nouser_xattr:
-		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
-		break;
-	case Opt_sb:
-		return 1;	/* handled by get_sb_block() */
-	case Opt_removed:
-		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
-		return 1;
-	case Opt_abort:
-		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
-		return 1;
-	case Opt_i_version:
-		sb->s_flags |= SB_I_VERSION;
-		return 1;
-	case Opt_lazytime:
-		sb->s_flags |= SB_LAZYTIME;
-		return 1;
-	case Opt_nolazytime:
-		sb->s_flags &= ~SB_LAZYTIME;
-		return 1;
-	case Opt_inlinecrypt:
-#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-		sb->s_flags |= SB_INLINECRYPT;
-#else
-		ext4_msg(sb, KERN_ERR, "inline encryption not supported");
-#endif
-		return 1;
-	}
+	token = fs_parse(fc, ext4_param_specs, param, &result);
+	if (token < 0)
+		return token;
+	is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;

 	for (m = ext4_mount_opts; m->token != Opt_err; m++)
 		if (token == m->token)
 			break;

-	if (m->token == Opt_err) {
-		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
-			 "or missing value", opt);
-		return -1;
-	}
-
-	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
-		ext4_msg(sb, KERN_ERR,
-			 "Mount option \"%s\" incompatible with ext2", opt);
-		return -1;
-	}
-	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
-		ext4_msg(sb, KERN_ERR,
-			 "Mount option \"%s\" incompatible with ext3", opt);
-		return -1;
-	}
+	ctx->opt_flags |= m->flags;

-	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
-		return -1;
-	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
-		return -1;
 	if (m->flags & MOPT_EXPLICIT) {
 		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
-			set_opt2(sb, EXPLICIT_DELALLOC);
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
 		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
-			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+			ctx_set_mount_opt2(ctx,
+					   EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
 		} else
-			return -1;
-	}
-	if (m->flags & MOPT_CLEAR_ERR)
-		clear_opt(sb, ERRORS_MASK);
-	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
-		ext4_msg(sb, KERN_ERR, "Cannot change quota "
-			 "options when quota turned on");
-		return -1;
+			return -EINVAL;
 	}

 	if (m->flags & MOPT_NOSUPPORT) {
-		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
-	} else if (token == Opt_commit) {
-		if (arg == 0)
-			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
-		else if (arg > INT_MAX / HZ) {
-			ext4_msg(sb, KERN_ERR,
+		ext4_msg(NULL, KERN_ERR, "%s option not supported",
+			 param->key);
+		return 0;
+	}
+
+	switch (token) {
+#ifdef CONFIG_QUOTA
+	case Opt_usrjquota:
+		if (!*param->string)
+			return unnote_qf_name(fc, USRQUOTA);
+		else
+			return note_qf_name(fc, USRQUOTA, param);
+	case Opt_grpjquota:
+		if (!*param->string)
+			return unnote_qf_name(fc, GRPQUOTA);
+		else
+			return note_qf_name(fc, GRPQUOTA, param);
+#endif
+	case Opt_noacl:
+	case Opt_nouser_xattr:
+		ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
+		break;
+	case Opt_sb:
+		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Ignoring %s option on remount", param->key);
+		} else {
+			ctx->s_sb_block = result.uint_32;
+			ctx->spec |= EXT4_SPEC_s_sb_block;
+		}
+		return 0;
+	case Opt_removed:
+		ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
+			 param->key);
+		return 0;
+	case Opt_abort:
+		ctx_set_mount_flags(ctx, EXT4_MF_FS_ABORTED);
+		return 0;
+	case Opt_i_version:
+		ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
+		ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
+		ctx_set_flags(ctx, SB_I_VERSION);
+		return 0;
+	case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+		ctx_set_flags(ctx, SB_INLINECRYPT);
+#else
+		ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
+#endif
+		return 0;
+	case Opt_errors:
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
+		ctx_set_mount_opt(ctx, result.uint_32);
+		return 0;
+#ifdef CONFIG_QUOTA
+	case Opt_jqfmt:
+		ctx->s_jquota_fmt = result.uint_32;
+		ctx->spec |= EXT4_SPEC_JQFMT;
+		return 0;
+#endif
+	case Opt_data:
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+		ctx_set_mount_opt(ctx, result.uint_32);
+		ctx->spec |= EXT4_SPEC_DATAJ;
+		return 0;
+	case Opt_commit:
+		if (result.uint_32 == 0)
+			ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE;
+		else if (result.uint_32 > INT_MAX / HZ) {
+			ext4_msg(NULL, KERN_ERR,
 				 "Invalid commit interval %d, "
 				 "must be smaller than %d",
-				 arg, INT_MAX / HZ);
-			return -1;
+				 result.uint_32, INT_MAX / HZ);
+			return -EINVAL;
 		}
-		sbi->s_commit_interval = HZ * arg;
-	} else if (token ==
Opt_debug_want_extra_isize) { - if ((arg & 1) || - (arg < 4) || - (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { - ext4_msg(sb, KERN_ERR, - "Invalid want_extra_isize %d", arg); - return -1; + ctx->s_commit_interval = HZ * result.uint_32; + ctx->spec |= EXT4_SPEC_s_commit_interval; + return 0; + case Opt_debug_want_extra_isize: + if ((result.uint_32 & 1) || (result.uint_32 < 4)) { + ext4_msg(NULL, KERN_ERR, + "Invalid want_extra_isize %d", result.uint_32); + return -EINVAL; } - sbi->s_want_extra_isize = arg; - } else if (token == Opt_max_batch_time) { - sbi->s_max_batch_time = arg; - } else if (token == Opt_min_batch_time) { - sbi->s_min_batch_time = arg; - } else if (token == Opt_inode_readahead_blks) { - if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { - ext4_msg(sb, KERN_ERR, + ctx->s_want_extra_isize = result.uint_32; + ctx->spec |= EXT4_SPEC_s_want_extra_isize; + return 0; + case Opt_max_batch_time: + ctx->s_max_batch_time = result.uint_32; + ctx->spec |= EXT4_SPEC_s_max_batch_time; + return 0; + case Opt_min_batch_time: + ctx->s_min_batch_time = result.uint_32; + ctx->spec |= EXT4_SPEC_s_min_batch_time; + return 0; + case Opt_inode_readahead_blks: + if (result.uint_32 && + (result.uint_32 > (1 << 30) || + !is_power_of_2(result.uint_32))) { + ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); - return -1; + return -EINVAL; } - sbi->s_inode_readahead_blks = arg; - } else if (token == Opt_init_itable) { - set_opt(sb, INIT_INODE_TABLE); - if (!args->from) - arg = EXT4_DEF_LI_WAIT_MULT; - sbi->s_li_wait_mult = arg; - } else if (token == Opt_max_dir_size_kb) { - sbi->s_max_dir_size_kb = arg; + ctx->s_inode_readahead_blks = result.uint_32; + ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; + return 0; + case Opt_init_itable: + ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); + ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (param->type == fs_value_is_string) + ctx->s_li_wait_mult = result.uint_32; + ctx->spec |= EXT4_SPEC_s_li_wait_mult; + return 0; + case Opt_max_dir_size_kb: + ctx->s_max_dir_size_kb = result.uint_32; + ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; + return 0; #ifdef CONFIG_EXT4_DEBUG - } else if (token == Opt_fc_debug_max_replay) { - sbi->s_fc_debug_max_replay = arg; + case Opt_fc_debug_max_replay: + ctx->s_fc_debug_max_replay = result.uint_32; + ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; + return 0; #endif - } else if (token == Opt_stripe) { - sbi->s_stripe = arg; - } else if (token == Opt_resuid) { - uid = make_kuid(current_user_ns(), arg); + case Opt_stripe: + ctx->s_stripe = result.uint_32; + ctx->spec |= EXT4_SPEC_s_stripe; + return 0; + case Opt_resuid: + uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { - ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); - return -1; + ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", + result.uint_32); + return -EINVAL; } - sbi->s_resuid = uid; - } else if (token == Opt_resgid) { - gid = make_kgid(current_user_ns(), arg); + ctx->s_resuid = uid; + ctx->spec |= EXT4_SPEC_s_resuid; + return 0; + case Opt_resgid: + gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { - ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); - return -1; + ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", + result.uint_32); + return -EINVAL; } - sbi->s_resgid = gid; - } else if (token == Opt_journal_dev) { + ctx->s_resgid = gid; + ctx->spec |= EXT4_SPEC_s_resgid; + return 0; + case Opt_journal_dev: if (is_remount) { - ext4_msg(sb, 
KERN_ERR, + ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); - return -1; + return -EINVAL; } - parsed_opts->journal_devnum = arg; - } else if (token == Opt_journal_path) { - char *journal_path; + ctx->journal_devnum = result.uint_32; + ctx->spec |= EXT4_SPEC_JOURNAL_DEV; + return 0; + case Opt_journal_path: + { struct inode *journal_inode; struct path path; int error; if (is_remount) { - ext4_msg(sb, KERN_ERR, + ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); - return -1; - } - journal_path = match_strdup(&args[0]); - if (!journal_path) { - ext4_msg(sb, KERN_ERR, "error: could not dup " - "journal device string"); - return -1; + return -EINVAL; } - error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + error = fs_lookup_param(fc, param, 1, &path); if (error) { - ext4_msg(sb, KERN_ERR, "error: could not find " - "journal device path: error %d", error); - kfree(journal_path); - return -1; + ext4_msg(NULL, KERN_ERR, "error: could not find " + "journal device path"); + return -EINVAL; } journal_inode = d_inode(path.dentry); - if (!S_ISBLK(journal_inode->i_mode)) { - ext4_msg(sb, KERN_ERR, "error: journal path %s " - "is not a block device", journal_path); - path_put(&path); - kfree(journal_path); - return -1; - } - - parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); + ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); + ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); - kfree(journal_path); - } else if (token == Opt_journal_ioprio) { - if (arg > 7) { - ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + return 0; + } + case Opt_journal_ioprio: + if (result.uint_32 > 7) { + ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); - return -1; - } - parsed_opts->journal_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); - } else if (token == Opt_test_dummy_encryption) { - return ext4_set_test_dummy_encryption(sb, opt, &args[0], - is_remount); - } else if (m->flags & MOPT_DATAJ) { - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { - ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return -1; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= m->mount_opt; + return -EINVAL; } -#ifdef CONFIG_QUOTA - } else if (m->flags & MOPT_QFMT) { - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot change journaled " - "quota options when quota turned on"); - return -1; + ctx->journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); + ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; + return 0; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + if (param->type == fs_value_is_flag) { + ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; + ctx->test_dummy_enc_arg = NULL; + return 0; } - if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_INFO, - "Quota format mount options ignored " - "when QUOTA feature is enabled"); - return 1; + if (*param->string && + !(!strcmp(param->string, "v1") || + !strcmp(param->string, "v2"))) { + ext4_msg(NULL, KERN_WARNING, + "Value of option \"%s\" is unrecognized", + param->key); + return -EINVAL; } - sbi->s_jquota_fmt = m->mount_opt; + ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; + ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, + GFP_KERNEL); +#else + ext4_msg(NULL, KERN_WARNING, + "Test dummy encryption mount option ignored"); #endif - } 
else if (token == Opt_dax || token == Opt_dax_always ||
-		   token == Opt_dax_inode || token == Opt_dax_never) {
+		return 0;
+	case Opt_dax:
+	case Opt_dax_type:
 #ifdef CONFIG_FS_DAX
-		switch (token) {
+	{
+		int type = (token == Opt_dax) ?
+			   Opt_dax : result.uint_32;
+
+		switch (type) {
 		case Opt_dax:
 		case Opt_dax_always:
-			if (is_remount &&
-			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
-			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
-			fail_dax_change_remount:
-				ext4_msg(sb, KERN_ERR, "can't change "
-					 "dax mount option while remounting");
-				return -1;
-			}
-			if (is_remount &&
-			    (test_opt(sb, DATA_FLAGS) ==
-			     EXT4_MOUNT_JOURNAL_DATA)) {
-				ext4_msg(sb, KERN_ERR, "can't mount with "
-					 "both data=journal and dax");
-				return -1;
-			}
-			ext4_msg(sb, KERN_WARNING,
-				 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
-			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+			ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
 			break;
 		case Opt_dax_never:
-			if (is_remount &&
-			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
-			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
-				goto fail_dax_change_remount;
-			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
-			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
 			break;
 		case Opt_dax_inode:
-			if (is_remount &&
-			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
-			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
-			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
-				goto fail_dax_change_remount;
-			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
-			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
 			/* Strictly for printing options */
-			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
 			break;
 		}
+		return 0;
+	}
 #else
-		ext4_msg(sb, KERN_INFO, "dax option not supported");
-		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
-		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
-		return -1;
+		ext4_msg(NULL, KERN_INFO, "dax option not supported");
+		return -EINVAL;
 #endif
-	} else if (token == Opt_data_err_abort) {
-		sbi->s_mount_opt |= m->mount_opt;
-	} else if (token == Opt_data_err_ignore) {
-		sbi->s_mount_opt &= ~m->mount_opt;
-	} else if (token == Opt_mb_optimize_scan) {
-		if (arg != 0 && arg != 1) {
-			ext4_msg(sb, KERN_WARNING,
+	case Opt_data_err:
+		if (result.uint_32 == Opt_data_err_abort)
+			ctx_set_mount_opt(ctx, m->mount_opt);
+		else if (result.uint_32 == Opt_data_err_ignore)
+			ctx_clear_mount_opt(ctx, m->mount_opt);
+		return 0;
+	case Opt_mb_optimize_scan:
+		if (result.int_32 != 0 && result.int_32 != 1) {
+			ext4_msg(NULL, KERN_WARNING,
 				 "mb_optimize_scan should be set to 0 or 1.");
-			return -1;
+			return -EINVAL;
 		}
-		parsed_opts->mb_optimize_scan = arg;
-	} else {
-		if (!args->from)
-			arg = 1;
+		ctx->mb_optimize_scan = result.int_32;
+		return 0;
+	}
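
With ext4_parse_param() wired into the fs_context machinery, every option reaches the filesystem as an individual fs_parameter instead of one comma-separated string. From userspace that corresponds to the new mount syscalls; a sketch of the flow (assumes the fsopen/fsconfig wrappers from glibc 2.36+ in <sys/mount.h>, otherwise use syscall(2); error handling trimmed):

#include <sys/mount.h>
#include <fcntl.h>
#include <unistd.h>

int mount_ext4(const char *dev, const char *target)
{
	int fsfd, mntfd;

	fsfd = fsopen("ext4", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return -1;

	/* each call below arrives in ext4_parse_param() as one fs_parameter */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", dev, 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "commit", "30", 0);
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "discard", NULL, 0);

	/* triggers ->get_tree(), i.e. ext4_get_tree() later in this patch */
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0) {
		close(fsfd);
		return -1;
	}
	move_mount(mntfd, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH);
	close(mntfd);
	close(fsfd);
	return 0;
}
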
+
+	/*
+	 * At this point we should only be getting options requiring MOPT_SET
+	 * or MOPT_CLEAR. Anything else is a bug.
+	 */
+	if (m->token == Opt_err) {
+		ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
+			 param->key);
+		WARN_ON(1);
+		return -EINVAL;
+	} else {
+		unsigned int set = 0;
+
+		if ((param->type == fs_value_is_flag) ||
+		    result.uint_32 > 0)
+			set = 1;
+
 		if (m->flags & MOPT_CLEAR)
-			arg = !arg;
+			set = !set;
 		else if (unlikely(!(m->flags & MOPT_SET))) {
-			ext4_msg(sb, KERN_WARNING,
-				 "buggy handling of option %s", opt);
+			ext4_msg(NULL, KERN_WARNING,
+				 "buggy handling of option %s",
+				 param->key);
 			WARN_ON(1);
-			return -1;
+			return -EINVAL;
 		}
 		if (m->flags & MOPT_2) {
-			if (arg != 0)
-				sbi->s_mount_opt2 |= m->mount_opt;
+			if (set != 0)
+				ctx_set_mount_opt2(ctx, m->mount_opt);
 			else
-				sbi->s_mount_opt2 &= ~m->mount_opt;
+				ctx_clear_mount_opt2(ctx, m->mount_opt);
 		} else {
-			if (arg != 0)
-				sbi->s_mount_opt |= m->mount_opt;
+			if (set != 0)
+				ctx_set_mount_opt(ctx, m->mount_opt);
 			else
-				sbi->s_mount_opt &= ~m->mount_opt;
+				ctx_clear_mount_opt(ctx, m->mount_opt);
 		}
 	}
-	return 1;
+
+	return 0;
 }

-static int parse_options(char *options, struct super_block *sb,
-			 struct ext4_parsed_options *ret_opts,
-			 int is_remount)
+static int parse_options(struct fs_context *fc, char *options)
 {
-	struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
-	char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
-	substring_t args[MAX_OPT_ARGS];
-	int token;
+	struct fs_parameter param;
+	int ret;
+	char *key;

 	if (!options)
-		return 1;
+		return 0;

-	while ((p = strsep(&options, ",")) != NULL) {
-		if (!*p)
-			continue;
-		/*
-		 * Initialize args struct so we know whether arg was
-		 * found; some options take optional arguments.
-		 */
-		args[0].to = args[0].from = NULL;
-		token = match_token(p, tokens, args);
-		if (handle_mount_opt(sb, p, token, args, ret_opts,
-				     is_remount) < 0)
-			return 0;
+	while ((key = strsep(&options, ",")) != NULL) {
+		if (*key) {
+			size_t v_len = 0;
+			char *value = strchr(key, '=');
+
+			param.type = fs_value_is_flag;
+			param.string = NULL;
+
+			if (value) {
+				if (value == key)
+					continue;
+
+				*value++ = 0;
+				v_len = strlen(value);
+				param.string = kmemdup_nul(value, v_len,
+							   GFP_KERNEL);
+				if (!param.string)
+					return -ENOMEM;
+				param.type = fs_value_is_string;
+			}
+
+			param.key = key;
+			param.size = v_len;
+
+			ret = ext4_parse_param(fc, &param);
+			if (param.string)
+				kfree(param.string);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	ret = ext4_validate_options(fc);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int parse_apply_sb_mount_options(struct super_block *sb,
+					struct ext4_fs_context *m_ctx)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *s_mount_opts = NULL;
+	struct ext4_fs_context *s_ctx = NULL;
+	struct fs_context *fc = NULL;
+	int ret = -ENOMEM;
+
+	if (!sbi->s_es->s_mount_opts[0])
+		return 0;
+
+	s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+				sizeof(sbi->s_es->s_mount_opts),
+				GFP_KERNEL);
+	if (!s_mount_opts)
+		return ret;
+
+	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		goto out_free;
+
+	s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+	if (!s_ctx)
+		goto out_free;
+
+	fc->fs_private = s_ctx;
+	fc->s_fs_info = sbi;
+
+	ret = parse_options(fc, s_mount_opts);
+	if (ret < 0)
+		goto parse_failed;
+
+	ret = ext4_check_opt_consistency(fc, sb);
+	if (ret < 0) {
+parse_failed:
+		ext4_msg(sb, KERN_WARNING,
+			 "failed to parse options in superblock: %s",
+			 s_mount_opts);
+		ret = 0;
+		goto out_free;
+	}
+
+	if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
+		m_ctx->journal_devnum = s_ctx->journal_devnum;
+	if (s_ctx->spec &
EXT4_SPEC_JOURNAL_IOPRIO) + m_ctx->journal_ioprio = s_ctx->journal_ioprio; + + ret = ext4_apply_options(fc, sb); + +out_free: + kfree(s_ctx); + kfree(fc); + kfree(s_mount_opts); + return ret; +} + +static void ext4_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ #ifdef CONFIG_QUOTA + bool quota_feature = ext4_has_feature_quota(sb); + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + int i; + + if (quota_feature) + return; + + if (ctx->spec & EXT4_SPEC_JQUOTA) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (!(ctx->qname_spec & (1 << i))) + continue; + + qname = ctx->s_qf_names[i]; /* May be NULL */ + if (qname) + set_opt(sb, QUOTA); + ctx->s_qf_names[i] = NULL; + qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, + lockdep_is_held(&sb->s_umount)); + if (qname) + kfree_rcu(qname); + } + } + + if (ctx->spec & EXT4_SPEC_JQFMT) + sbi->s_jquota_fmt = ctx->s_jquota_fmt; +#endif +} + +/* + * Check quota settings consistency. + */ +static int ext4_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + bool quota_feature = ext4_has_feature_quota(sb); + bool quota_loaded = sb_any_quota_loaded(sb); + bool usr_qf_name, grp_qf_name, usrquota, grpquota; + int quota_flags, i; + /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ - if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { - ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && + !ext4_has_feature_project(sb)) { + ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. 
" "Cannot enable project quota enforcement."); - return 0; + return -EINVAL; } - usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); - grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); - if (usr_qf_name || grp_qf_name) { - if (test_opt(sb, USRQUOTA) && usr_qf_name) - clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && grp_qf_name) - clear_opt(sb, GRPQUOTA); + quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; + if (quota_loaded && + ctx->mask_s_mount_opt & quota_flags && + !ctx_test_mount_opt(ctx, quota_flags)) + goto err_quota_change; - if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { - ext4_msg(sb, KERN_ERR, "old and new quota " - "format mixing"); + if (ctx->spec & EXT4_SPEC_JQUOTA) { + + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (!(ctx->qname_spec & (1 << i))) + continue; + + if (quota_loaded && + !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) + goto err_jquota_change; + + if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && + strcmp(get_qf_name(sb, sbi, i), + ctx->s_qf_names[i]) != 0) + goto err_jquota_specified; + } + + if (quota_feature) { + ext4_msg(NULL, KERN_INFO, + "Journaled quota options ignored when " + "QUOTA feature is enabled"); return 0; } + } - if (!sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "not specified"); + if (ctx->spec & EXT4_SPEC_JQFMT) { + if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) + goto err_jquota_change; + if (quota_feature) { + ext4_msg(NULL, KERN_INFO, "Quota format mount options " + "ignored when QUOTA feature is enabled"); return 0; } } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || + ctx->s_qf_names[USRQUOTA]); + grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || + ctx->s_qf_names[GRPQUOTA]); + + usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || + test_opt(sb, USRQUOTA)); + + grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || + test_opt(sb, GRPQUOTA)); + + if (usr_qf_name) { + ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); + grpquota = false; + } + + if (usr_qf_name || grp_qf_name) { + if (usrquota || grpquota) { + ext4_msg(NULL, KERN_ERR, "old and new quota " + "format mixing"); + return -EINVAL; + } + + if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { + ext4_msg(NULL, KERN_ERR, "journaled quota format " + "not specified"); + return -EINVAL; + } + } + + return 0; + +err_quota_change: + ext4_msg(NULL, KERN_ERR, + "Cannot change quota options when quota turned on"); + return -EINVAL; +err_jquota_change: + ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " + "options when quota turned on"); + return -EINVAL; +err_jquota_specified: + ext4_msg(NULL, KERN_ERR, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; +#else + return 0; #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { +} + +static int ext4_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = fc->s_fs_info; + int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + + if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(NULL, KERN_ERR, + "Mount option(s) incompatible with ext2"); + return -EINVAL; + } + if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(NULL, KERN_ERR, + "Mount option(s) incompatible with ext3"); + return -EINVAL; + } + + if (ctx->s_want_extra_isize > + 
(sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Invalid want_extra_isize %d",
+			 ctx->s_want_extra_isize);
+		return -EINVAL;
+	}
+
+	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
 		int blocksize =
 			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 		if (blocksize < PAGE_SIZE)
-			ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
+			ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
 				 "experimental mount option 'dioread_nolock' "
 				 "for blocksize < PAGE_SIZE");
 	}
+
+#ifdef CONFIG_FS_ENCRYPTION
+	/*
+	 * This mount option is just for testing, and it's not worthwhile to
+	 * implement the extra complexity (e.g. RCU protection) that would be
+	 * needed to allow it to be set or changed during remount. We do allow
+	 * it to be specified during remount, but only if there is no change.
+	 */
+	if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) &&
+	    is_remount && !sbi->s_dummy_enc_policy.policy) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "Can't set test_dummy_encryption on remount");
+		return -EINVAL;
+	}
+#endif
+
+	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
+		if (!sbi->s_journal) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Remounting file system with no journal "
+				 "so ignoring journalled data option");
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
+			   test_opt(sb, DATA_FLAGS)) {
+			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
+				 "on remount");
+			return -EINVAL;
+		}
+	}
+
+	if (is_remount) {
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
+			ext4_msg(NULL, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			return -EINVAL;
+		}
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+fail_dax_change_remount:
+			ext4_msg(NULL, KERN_ERR, "can't change "
+				 "dax mount option while remounting");
+			return -EINVAL;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
+			   (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			    (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
+			goto fail_dax_change_remount;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
+			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
+			goto fail_dax_change_remount;
+		}
+	}
+
+	return ext4_check_quota_consistency(fc, sb);
+}
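
ext4_check_opt_consistency() above can veto a reconfiguration before ext4_apply_options() has mutated anything, because the staged mask/vals pair makes "does this request change bit X?" a pure bit test. Reduced to its essence (hypothetical standalone helper, not part of the patch):

#include <errno.h>

/*
 * Reject a staged reconfiguration that would flip any bit in
 * 'immutable' away from its current live value.
 */
static int check_immutable(unsigned int staged_mask, unsigned int staged_vals,
			   unsigned int live_opts, unsigned int immutable)
{
	unsigned int touched = staged_mask & immutable;
	unsigned int changed = (staged_vals ^ live_opts) & touched;

	return changed ? -EINVAL : 0;
}
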
+
+static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = fc->s_fs_info;
+	int ret = 0;
+
+	sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
+	sbi->s_mount_opt |= ctx->vals_s_mount_opt;
+	sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
+	sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
+	sbi->s_mount_flags &= ~ctx->mask_s_mount_flags;
+	sbi->s_mount_flags |= ctx->vals_s_mount_flags;
+	sb->s_flags &= ~ctx->mask_s_flags;
+	sb->s_flags |= ctx->vals_s_flags;
+
+	/*
+	 * i_version differs from common mount option iversion so we have
+	 * to let vfs know that it was set, otherwise it would get cleared
+	 * on remount
+	 */
+	if (ctx->mask_s_flags & SB_I_VERSION)
+		fc->sb_flags |= SB_I_VERSION;
+
+#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
+	APPLY(s_commit_interval);
+	APPLY(s_stripe);
+	APPLY(s_max_batch_time);
+	APPLY(s_min_batch_time);
+	APPLY(s_want_extra_isize);
+	APPLY(s_inode_readahead_blks);
+	APPLY(s_max_dir_size_kb);
+	APPLY(s_li_wait_mult);
+	APPLY(s_resgid);
+	APPLY(s_resuid);
+
+#ifdef CONFIG_EXT4_DEBUG
+	APPLY(s_fc_debug_max_replay);
+#endif
+
+	ext4_apply_quota_options(fc, sb);
+
+	if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)
+		ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg);
+
+	return ret;
+}
+
+static int ext4_validate_options(struct fs_context *fc)
+{
+#ifdef CONFIG_QUOTA
+	struct ext4_fs_context *ctx = fc->fs_private;
+	char *usr_qf_name, *grp_qf_name;
+
+	usr_qf_name = ctx->s_qf_names[USRQUOTA];
+	grp_qf_name = ctx->s_qf_names[GRPQUOTA];
+
+	if (usr_qf_name || grp_qf_name) {
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name)
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name)
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+		    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
+			ext4_msg(NULL, KERN_ERR, "old and new quota "
+				 "format mixing");
+			return -EINVAL;
+		}
+	}
+#endif
 	return 1;
 }

@@ -2533,12 +2970,12 @@ static inline void ext4_show_quota_options(struct seq_file *seq,

 static const char *token2str(int token)
 {
-	const struct match_token *t;
+	const struct fs_parameter_spec *spec;

-	for (t = tokens; t->token != Opt_err; t++)
-		if (t->token == token && !strchr(t->pattern, '='))
+	for (spec = ext4_param_specs; spec->name != NULL; spec++)
+		if (spec->opt == token && !spec->type)
 			break;
-	return t->pattern;
+	return spec->name;
 }

 /*
@@ -2564,7 +3001,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
 		int want_set = m->flags & MOPT_SET;
 		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
-		    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
+		    m->flags & MOPT_SKIP)
 			continue;
 		if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
 			continue; /* skip if same as the default */
@@ -3876,21 +4313,52 @@ static void ext4_setup_csum_trigger(struct super_block *sb,
 	sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
 }

-static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+static void ext4_free_sbi(struct ext4_sb_info *sbi)
+{
+	if (!sbi)
+		return;
+
+	kfree(sbi->s_blockgroup_lock);
+	fs_put_dax(sbi->s_daxdev);
+	kfree(sbi);
+}
+
+static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return NULL;
+
+	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+
+	if (!sbi->s_blockgroup_lock)
+		goto err_out;
+
+	sb->s_fs_info = sbi;
+	sbi->s_sb = sb;
+	return sbi;
+err_out:
+	fs_put_dax(sbi->s_daxdev);
+	kfree(sbi);
+	return NULL;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 {
-	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
-	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh, **group_desc;
 	struct ext4_super_block *es = NULL;
-	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct flex_groups **flex_groups;
 	ext4_fsblk_t block;
-	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
 	unsigned long offset = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
-	const char *descr;
 	int ret = -ENOMEM;
 	int blocksize, clustersize;
 	unsigned int db_count;
@@ -3899,32 +4367,17
@@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __u64 blocks_count; int err = 0; ext4_group_t first_not_zeroed; - struct ext4_parsed_options parsed_opts; + struct ext4_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ - parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - parsed_opts.journal_devnum = 0; - parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; - if ((data && !orig_data) || !sbi) - goto out_free_base; - - sbi->s_daxdev = dax_dev; - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) - goto out_free_base; - - sb->s_fs_info = sbi; - sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; - sbi->s_sb_block = sb_block; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - /* Cleanup superblock name */ - strreplace(sb->s_id, '/', '!'); - /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); @@ -3938,10 +4391,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { - logical_sb_block = sb_block; + logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); @@ -4146,31 +4599,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - if (sbi->s_es->s_mount_opts[0]) { - char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, - sizeof(sbi->s_es->s_mount_opts), - GFP_KERNEL); - if (!s_mount_opts) - goto failed_mount; - if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - s_mount_opts); - } - kfree(s_mount_opts); - } + err = parse_apply_sb_mount_options(sb, ctx); + if (err < 0) + goto failed_mount; + sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &parsed_opts, 0)) + + err = ext4_check_opt_consistency(fc, sb); + if (err < 0) + goto failed_mount; + + err = ext4_apply_options(fc, sb); + if (err < 0) goto failed_mount; #ifdef CONFIG_UNICODE if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; - __u16 encoding_flags; + __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); - if (ext4_sb_read_encoding(es, &encoding_info, - &encoding_flags)) { + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); goto failed_mount; @@ -4179,15 +4629,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, - "can't mount with superblock charset: %s-%s " + "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. 
flags: 0x%x.", - encoding_info->name, encoding_info->version, + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), encoding_flags); goto failed_mount; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " - "%s-%s with flags 0x%hx", encoding_info->name, - encoding_info->version?:"\b", encoding_flags); + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; @@ -4299,9 +4755,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0, - bdev_nr_sectors(sb->s_bdev))) - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + if (sbi->s_daxdev) { + if (blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { @@ -4337,7 +4796,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { @@ -4620,7 +5079,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); - atomic_set(&sbi->s_fc_ineligible_updates, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); @@ -4653,7 +5111,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); + err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && @@ -4753,7 +5211,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; @@ -4865,9 +5323,9 @@ no_journal: * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". 
 */
-	if (parsed_opts.mb_optimize_scan == 1)
+	if (ctx->mb_optimize_scan == 1)
 		set_opt2(sb, MB_OPTIMIZE_SCAN);
-	else if (parsed_opts.mb_optimize_scan == 0)
+	else if (ctx->mb_optimize_scan == 0)
 		clear_opt2(sb, MB_OPTIMIZE_SCAN);
 	else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
 		set_opt2(sb, MB_OPTIMIZE_SCAN);
@@ -4969,15 +5427,6 @@ no_journal:
 		if (err)
 			goto failed_mount9;
 	}
-	if (EXT4_SB(sb)->s_journal) {
-		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-			descr = " journalled data mode";
-		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-			descr = " ordered data mode";
-		else
-			descr = " writeback data mode";
-	} else
-		descr = "out journal";

 	if (test_opt(sb, DISCARD)) {
 		struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -4987,14 +5436,6 @@ no_journal:
 			 "the device does not support discard");
 	}

-	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
-		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-			 "Opts: %.*s%s%s. Quota mode: %s.", descr,
-			 (int) sizeof(sbi->s_es->s_mount_opts),
-			 sbi->s_es->s_mount_opts,
-			 *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
-			 ext4_quota_mode(sb));
-
 	if (es->s_error_count)
 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5005,7 +5446,6 @@ no_journal:
 	atomic_set(&sbi->s_warning_count, 0);
 	atomic_set(&sbi->s_msg_count, 0);

-	kfree(orig_data);
 	return 0;

 cantfind_ext4:
@@ -5091,14 +5531,60 @@ failed_mount:
 	ext4_blkdev_remove(sbi);
 out_fail:
 	sb->s_fs_info = NULL;
-	kfree(sbi->s_blockgroup_lock);
-out_free_base:
-	kfree(sbi);
-	kfree(orig_data);
-	fs_put_dax(dax_dev);
 	return err ? err : ret;
 }

+static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi;
+	const char *descr;
+	int ret;
+
+	sbi = ext4_alloc_sbi(sb);
+	if (!sbi)
+		return -ENOMEM;
+
+	fc->s_fs_info = sbi;
+
+	/* Cleanup superblock name */
+	strreplace(sb->s_id, '/', '!');
+
+	sbi->s_sb_block = 1;	/* Default super block location */
+	if (ctx->spec & EXT4_SPEC_s_sb_block)
+		sbi->s_sb_block = ctx->s_sb_block;
+
+	ret = __ext4_fill_super(fc, sb);
+	if (ret < 0)
+		goto free_sbi;
+
+	if (sbi->s_journal) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			descr = " journalled data mode";
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			descr = " ordered data mode";
+		else
+			descr = " writeback data mode";
+	} else
+		descr = "out journal";
+
+	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+			 "Quota mode: %s.", descr, ext4_quota_mode(sb));
+
+	return 0;
+
+free_sbi:
+	ext4_free_sbi(sbi);
+	fc->s_fs_info = NULL;
+	return ret;
+}
+
+static int ext4_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, ext4_fill_super);
+}
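
ext4_fill_super() is now only reachable through get_tree_bdev(), which opens the block device named by the "source" parameter and calls back into the filesystem. The ext4_context_ops table that ext4_init_fs_context() installs is not visible in this hunk; based on the operations added in this patch it presumably wires the pieces together like so (reconstruction, not quoted from the diff):

static const struct fs_context_operations ext4_context_ops = {
	.parse_param	= ext4_parse_param,	/* one fs_parameter at a time */
	.get_tree	= ext4_get_tree,	/* FSCONFIG_CMD_CREATE / mount(2) */
	.reconfigure	= ext4_reconfigure,	/* remount path, defined below */
	.free		= ext4_fc_free,		/* tear down the staging context */
};
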
+
 /*
  * Setup any per-fs journal parameters now.  We'll do this both on
  * initial mount, once the journal has been initialised but before we've
@@ -5727,11 +6213,12 @@ struct ext4_mount_options {
 #endif
 };

-static int ext4_remount(struct super_block *sb, int *flags, char *data)
+static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
 {
+	struct ext4_fs_context *ctx = fc->fs_private;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned long old_sb_flags, vfs_flags;
+	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
 	ext4_group_t g;
 	int err = 0;
@@ -5740,14 +6227,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i, j;
 	char *to_free[EXT4_MAXQUOTAS];
 #endif
-	char *orig_data = kstrdup(data, GFP_KERNEL);
-	struct ext4_parsed_options parsed_opts;
-	parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	parsed_opts.journal_devnum = 0;
-
-	if (data && !orig_data)
-		return -ENOMEM;
+	ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

 	/* Store the original options */
 	old_sb_flags = sb->s_flags;
@@ -5768,28 +6249,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			if (!old_opts.s_qf_names[i]) {
 				for (j = 0; j < i; j++)
 					kfree(old_opts.s_qf_names[j]);
-				kfree(orig_data);
 				return -ENOMEM;
 			}
 		} else
 			old_opts.s_qf_names[i] = NULL;
 #endif
 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
-		parsed_opts.journal_ioprio =
+		ctx->journal_ioprio =
 			sbi->s_journal->j_task->io_context->ioprio;

-	/*
-	 * Some options can be enabled by ext4 and/or by VFS mount flag
-	 * either way we need to make sure it matches in both *flags and
-	 * s_flags. Copy those selected flags from *flags to s_flags
-	 */
-	vfs_flags = SB_LAZYTIME | SB_I_VERSION;
-	sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
-
-	if (!parse_options(data, sb, &parsed_opts, 1)) {
-		err = -EINVAL;
-		goto restore_opts;
-	}
+	ext4_apply_options(fc, sb);

 	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
 	    test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -5836,19 +6305,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)

 	if (sbi->s_journal) {
 		ext4_init_journal_params(sb, sbi->s_journal);
-		set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
+		set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
 	}

 	/* Flush outstanding errors before changing fs state */
 	flush_work(&sbi->s_error_work);

-	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
+	if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
 		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
 			err = -EROFS;
 			goto restore_opts;
 		}

-		if (*flags & SB_RDONLY) {
+		if (fc->sb_flags & SB_RDONLY) {
 			err = sync_filesystem(sb);
 			if (err < 0)
 				goto restore_opts;
@@ -5996,16 +6465,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
 		ext4_stop_mmpd(sbi);

-	/*
-	 * Some options can be enabled by ext4 and/or by VFS mount flag
-	 * either way we need to make sure it matches in both *flags and
-	 * s_flags. Copy those selected flags from s_flags to *flags
-	 */
-	*flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
-
-	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s.
Quota mode: %s.", - orig_data, ext4_quota_mode(sb)); - kfree(orig_data); return 0; restore_opts: @@ -6031,10 +6490,30 @@ restore_opts: #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); - kfree(orig_data); return err; } +static int ext4_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + int ret; + + fc->s_fs_info = EXT4_SB(sb); + + ret = ext4_check_opt_consistency(fc, sb); + if (ret < 0) + return ret; + + ret = __ext4_remount(fc, sb); + if (ret < 0) + return ret; + + ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", + ext4_quota_mode(sb)); + + return 0; +} + #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) @@ -6275,10 +6754,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) { - lockdep_set_quota_inode(path->dentry->d_inode, - I_DATA_SEM_NORMAL); - } else { + if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; @@ -6298,7 +6774,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); return err; } @@ -6361,8 +6842,19 @@ int ext4_enable_quotas(struct super_block *sb) "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); - for (type--; type >= 0; type--) + for (type--; type >= 0; type--) { + struct inode *inode; + + inode = sb_dqopt(sb)->files[type]; + if (inode) + inode = igrab(inode); dquot_quota_off(sb, type); + if (inode) { + lockdep_set_quota_inode(inode, + I_DATA_SEM_NORMAL); + iput(inode); + } + } return err; } @@ -6466,7 +6958,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (EXT4_SB(sb)->s_journal && !handle) { + if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); @@ -6517,12 +7009,6 @@ out: } #endif -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); -} - #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { @@ -6580,11 +7066,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) } static struct file_system_type ext4_fs_type = { - .owner = THIS_MODULE, - .name = "ext4", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .owner = THIS_MODULE, + .name = "ext4", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); @@ -6649,6 +7136,7 @@ static int __init ext4_init_fs(void) out: unregister_as_ext2(); unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: @@ -6675,6 +7163,7 @@ static void __exit ext4_exit_fs(void) unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); 
ext4_exit_sysfs(); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 2314f7446592..f61e65ae27d8 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -63,7 +63,7 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - return snprintf(buf, PAGE_SIZE, "%lu\n", + return sysfs_emit(buf, "%lu\n", (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -72,7 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); @@ -130,8 +130,8 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) - return snprintf(buf, PAGE_SIZE, "<none>\n"); - return snprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "<none>\n"); + return sysfs_emit(buf, "%d\n", task_pid_vnr(sbi->s_journal->j_task)); } @@ -245,6 +245,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); +EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -295,6 +296,7 @@ static struct attribute *ext4_attrs[] = { #endif ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), + ATTR_LIST(last_trim_minblks), NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -357,7 +359,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -374,7 +376,7 @@ static ssize_t ext4_attr_show(struct kobject *kobj, switch (a->attr_id) { case attr_delayed_allocation_blocks: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: @@ -382,11 +384,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj, case attr_lifetime_write_kbytes: return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); case attr_sra_exceeded_retry_limit: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_inode_readahead: @@ -394,42 +396,42 @@ static ssize_t ext4_attr_show(struct kobject *kobj, if (!ptr) return 0; if (a->attr_ptr == ptr_ext4_super_block_offset) - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); else - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%lu\n", + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", *((unsigned char *) 
ptr)); case attr_pointer_u64: if (!ptr) return 0; if (a->attr_ptr == ptr_ext4_super_block_offset) - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); else - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); case attr_pointer_string: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); case attr_feature: - return snprintf(buf, PAGE_SIZE, "supported\n"); + return sysfs_emit(buf, "supported\n"); case attr_first_error_time: return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0fc6e0245732..0a1d236212f8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -8,9 +8,9 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> +#include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> @@ -2450,7 +2450,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 842020311f83..eb22fa91c2b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -28,6 +28,8 @@ #include <linux/fscrypt.h> #include <linux/fsverity.h> +struct pagevec; + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a6accec60d04..ee308a8de432 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -7,7 +7,6 @@ */ #include <linux/fs.h> #include <linux/module.h> -#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> @@ -15,6 +14,7 @@ #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" @@ -1390,8 +1390,7 @@ retry: if (err) { clear_page_private_gcing(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1d85f1e58d32..0ec8e32a00b4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -8,8 +8,8 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" @@ -575,7 +575,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e0b5eb28d383..50b2874e758c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -8,7 +8,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/mpage.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/swap.h> @@ -2755,7 +2755,7 @@ int f2fs_recover_inode_page(struct 
f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e65c73c4411d..9683c80ff8c2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -8,6 +8,7 @@ #include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" @@ -587,7 +588,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; @@ -670,8 +671,7 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b4a2f8c36149..575d3dc418d0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -9,6 +9,7 @@ #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> @@ -245,9 +246,7 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } err = -EAGAIN; @@ -424,9 +423,7 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } unlock_page(page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 053b508d1e4f..76e6a3df9aba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,9 +8,9 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> @@ -261,29 +261,22 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) 
static const struct f2fs_sb_encodings { __u16 magic; char *name; - char *version; + unsigned int version; } f2fs_sb_encoding_map[] = { - {F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"}, + {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; -static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, - const struct f2fs_sb_encodings **encoding, - __u16 *flags) +static const struct f2fs_sb_encodings * +f2fs_sb_read_encoding(const struct f2fs_super_block *sb) { __u16 magic = le16_to_cpu(sb->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) if (magic == f2fs_sb_encoding_map[i].magic) - break; - - if (i >= ARRAY_SIZE(f2fs_sb_encoding_map)) - return -EINVAL; - - *encoding = &f2fs_sb_encoding_map[i]; - *flags = le16_to_cpu(sb->s_encoding_flags); + return &f2fs_sb_encoding_map[i]; - return 0; + return NULL; } struct kmem_cache *f2fs_cf_name_slab; @@ -2456,8 +2449,7 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -3917,25 +3909,32 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) struct unicode_map *encoding; __u16 encoding_flags; - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, - &encoding_flags)) { + encoding_info = f2fs_sb_read_encoding(sbi->raw_super); + if (!encoding_info) { f2fs_err(sbi, "Encoding requested by superblock is unknown"); return -EINVAL; } + encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { f2fs_err(sbi, - "can't mount with superblock charset: %s-%s " + "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. 
flags: 0x%x.", - encoding_info->name, encoding_info->version, + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), encoding_flags); return PTR_ERR(encoding); } f2fs_info(sbi, "Using encoding defined by superblock: " - "%s-%s with flags 0x%hx", encoding_info->name, - encoding_info->version?:"\b", encoding_flags); + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f8a14b1e2ef7..df406c16b2eb 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -205,8 +205,7 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) - return sysfs_emit(buf, "%s (%d.%d.%d)\n", - sb->s_encoding->charset, + return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); diff --git a/fs/file.c b/fs/file.c index 8627dacfc424..97d212a9b814 100644 --- a/fs/file.c +++ b/fs/file.c @@ -841,24 +841,68 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static inline struct file *__fget_files_rcu(struct files_struct *files, + unsigned int fd, fmode_t mask, unsigned int refs) +{ + for (;;) { + struct file *file; + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + struct file __rcu **fdentry; + + if (unlikely(fd >= fdt->max_fds)) + return NULL; + + fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + file = rcu_dereference_raw(*fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(file->f_mode & mask)) + return NULL; + + /* + * Ok, we have a file pointer. However, because we do + * this all locklessly under RCU, we may be racing with + * that file being closed. + * + * Such a race can take two forms: + * + * (a) the file ref already went down to zero, + * and get_file_rcu_many() fails. Just try + * again: + */ + if (unlikely(!get_file_rcu_many(file, refs))) + continue; + + /* + * (b) the file table entry has changed under us. + * Note that we don't need to re-check the 'fdt->fd' + * pointer having changed, because it always goes + * hand-in-hand with 'fdt'. + * + * If so, we need to put our refs and try again. + */ + if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || + unlikely(rcu_dereference_raw(*fdentry) != file)) { + fput_many(file, refs); + continue; + } + + /* + * Ok, we have a ref to the file, and checked that it + * still exists. + */ + return file; + } +} + static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask, unsigned int refs) { struct file *file; rcu_read_lock(); -loop: - file = files_lookup_fd_rcu(files, fd); - if (file) { - /* File object ref couldn't be taken. 
- * dup2() atomicity guarantee is the reason - * we loop to catch the new file (or NULL pointer) - */ - if (file->f_mode & mask) - file = NULL; - else if (!get_file_rcu_many(file, refs)) - goto loop; - } + file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 67f0e88eed01..f8d7fe6db989 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -372,7 +372,7 @@ static bool inode_do_switch_wbs(struct inode *inode, { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; bool switched = false; spin_lock(&inode->i_lock); @@ -389,21 +389,23 @@ static bool inode_do_switch_wbs(struct inode *inode, /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points - * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under writeback. + * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to + * folios actually under writeback. */ - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { - if (PageDirty(page)) { - dec_wb_stat(old_wb, WB_RECLAIMABLE); - inc_wb_stat(new_wb, WB_RECLAIMABLE); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (folio_test_dirty(folio)) { + long nr = folio_nr_pages(folio); + wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); + wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + long nr = folio_nr_pages(folio); + WARN_ON_ONCE(!folio_test_writeback(folio)); + wb_stat_mod(old_wb, WB_WRITEBACK, -nr); + wb_stat_mod(new_wb, WB_WRITEBACK, nr); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { @@ -1666,6 +1668,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; + else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + if (!(inode->i_state & I_DIRTY_PAGES)) { + inode->i_state &= ~I_PINNING_FSCACHE_WB; + wbc->unpinned_fscache_wb = true; + dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + } + } spin_unlock(&inode->i_lock); @@ -1675,6 +1684,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } + wbc->unpinned_fscache_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/fs_context.c b/fs/fs_context.c index b7e43a780a62..24ce12f0db32 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -548,7 +548,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) param->key); } - if (len > PAGE_SIZE - 2 - size) + if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 3df07c0e32b3..ed40ce5742fd 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -199,6 +199,8 @@ int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, 
param); @@ -211,8 +213,11 @@ int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; - if (param->type != fs_value_is_string || - kstrtouint(param->string, base, &result->uint_32) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtouint(param->string, base, &result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } @@ -221,8 +226,11 @@ EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || - kstrtoint(param->string, 0, &result->int_32) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } @@ -231,8 +239,11 @@ EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || - kstrtoull(param->string, 0, &result->uint_64) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } @@ -244,6 +255,8 @@ int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); @@ -255,7 +268,8 @@ EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || !*param->string) + if (param->type != fs_value_is_string || + (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } @@ -275,7 +289,8 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, { switch (param->type) { case fs_value_is_string: - if (kstrtouint(param->string, 0, &result->uint_32) < 0) + if ((!*param->string && !(p->flags & fs_param_can_be_empty)) || + kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index b313a978ae0a..76316c4a3fb7 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -38,3 +38,6 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. 
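The __fget_files_rcu() rewrite in fs/file.c above follows a general lockless-lookup shape: dereference a slot under RCU, speculatively take a reference, then re-read the slot to confirm nothing was swapped in behind us. A condensed, self-contained sketch of that shape with invented types (the real code additionally clamps the fd index with array_index_nospec() and uses get_file_rcu_many()/fput_many()):

	#include <linux/rcupdate.h>
	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t	ref;
		struct rcu_head	rcu;
	};

	static void obj_put(struct obj *obj)
	{
		if (refcount_dec_and_test(&obj->ref))
			kfree_rcu(obj, rcu);	/* free after a grace period */
	}

	/* Return the object in *slot with a reference held, or NULL.  A failed
	 * refcount_inc_not_zero() means we raced with the final put; a changed
	 * slot means we raced with a dup2()-style replacement and our
	 * reference is to the wrong object.  Either way, look again.
	 */
	static struct obj *obj_get_rcu(struct obj __rcu **slot)
	{
		struct obj *obj;

		rcu_read_lock();
		for (;;) {
			obj = rcu_dereference(*slot);
			if (!obj)
				break;
			if (!refcount_inc_not_zero(&obj->ref))
				continue;
			if (rcu_dereference(*slot) != obj) {
				obj_put(obj);
				continue;
			}
			break;			/* stable reference obtained */
		}
		rcu_read_unlock();
		return obj;
	}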
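The fs_parser.c hunks just above give every typed parser the same early-out: when the spec carries fs_param_can_be_empty, an empty value string parses successfully and the helper returns 0 without filling in the result. A sketch of a parameter table that opts in; the option name and table are invented for illustration:

	#include <linux/fs_parser.h>

	enum { Opt_timeout };

	/* "timeout=<u32>", where a bare "timeout=" is also accepted. */
	static const struct fs_parameter_spec myfs_fs_parameters[] = {
		__fsparam(fs_param_is_u32, "timeout", Opt_timeout,
			  fs_param_can_be_empty, NULL),
		{}
	};

Since the empty form leaves result.uint_32 untouched, a .parse_param handler using such a table would check param->string[0] before trusting the parsed value.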
+ +config FSCACHE_OLD_API + bool diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 03a871d689bb..afb090ea16c4 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -6,13 +6,9 @@ fscache-y := \ cache.o \ cookie.o \ - fsdef.o \ io.o \ main.o \ - netfs.o \ - object.o \ - operation.o \ - page.o + volume.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index bd4f44c1cce0..2749933852a9 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -1,209 +1,229 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache cache handling * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include "internal.h" -LIST_HEAD(fscache_cache_list); +static LIST_HEAD(fscache_caches); DECLARE_RWSEM(fscache_addremove_sem); -DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); -EXPORT_SYMBOL(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_clearance_waiters); +EXPORT_SYMBOL(fscache_clearance_waiters); -static LIST_HEAD(fscache_cache_tag_list); +static atomic_t fscache_cache_debug_id; /* - * look up a cache tag + * Allocate a cache cookie. */ -struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +static struct fscache_cache *fscache_alloc_cache(const char *name) { - struct fscache_cache_tag *tag, *xtag; - - /* firstly check for the existence of the tag under read lock */ - down_read(&fscache_addremove_sem); - - list_for_each_entry(tag, &fscache_cache_tag_list, link) { - if (strcmp(tag->name, name) == 0) { - atomic_inc(&tag->usage); - up_read(&fscache_addremove_sem); - return tag; - } - } - - up_read(&fscache_addremove_sem); - - /* the tag does not exist - create a candidate */ - xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); - if (!xtag) - /* return a dummy tag if out of memory */ - return ERR_PTR(-ENOMEM); - - atomic_set(&xtag->usage, 1); - strcpy(xtag->name, name); - - /* write lock, search again and add if still not present */ - down_write(&fscache_addremove_sem); + struct fscache_cache *cache; - list_for_each_entry(tag, &fscache_cache_tag_list, link) { - if (strcmp(tag->name, name) == 0) { - atomic_inc(&tag->usage); - up_write(&fscache_addremove_sem); - kfree(xtag); - return tag; + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (cache) { + if (name) { + cache->name = kstrdup(name, GFP_KERNEL); + if (!cache->name) { + kfree(cache); + return NULL; + } } + refcount_set(&cache->ref, 1); + INIT_LIST_HEAD(&cache->cache_link); + cache->debug_id = atomic_inc_return(&fscache_cache_debug_id); } - - list_add_tail(&xtag->link, &fscache_cache_tag_list); - up_write(&fscache_addremove_sem); - return xtag; + return cache; } -/* - * release a reference to a cache tag - */ -void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +static bool fscache_get_cache_maybe(struct fscache_cache *cache, + enum fscache_cache_trace where) { - if (tag != ERR_PTR(-ENOMEM)) { - down_write(&fscache_addremove_sem); + bool success; + int ref; - if (atomic_dec_and_test(&tag->usage)) - list_del_init(&tag->link); - else - tag = NULL; - - up_write(&fscache_addremove_sem); - - kfree(tag); - } + success = __refcount_inc_not_zero(&cache->ref, &ref); + if (success) + trace_fscache_cache(cache->debug_id, ref + 1, where); + return 
success; } /* - * select a cache in which to store an object - * - the cache addremove semaphore must be at least read-locked by the caller - * - the object will never be an index + * Look up a cache cookie. */ -struct fscache_cache *fscache_select_cache_for_object( - struct fscache_cookie *cookie) +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache) { - struct fscache_cache_tag *tag; - struct fscache_object *object; - struct fscache_cache *cache; + struct fscache_cache *candidate, *cache, *unnamed = NULL; - _enter(""); + /* firstly check for the existence of the cache under read lock */ + down_read(&fscache_addremove_sem); - if (list_empty(&fscache_cache_list)) { - _leave(" = NULL [no cache]"); - return NULL; + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && name && strcmp(cache->name, name) == 0 && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; + if (!cache->name && !name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; } - /* we check the parent to determine the cache to use */ - spin_lock(&cookie->lock); + if (!name) { + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; + } + } - /* the first in the parent's backing list should be the preferred - * cache */ - if (!hlist_empty(&cookie->backing_objects)) { - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); + up_read(&fscache_addremove_sem); - cache = object->cache; - if (fscache_object_is_dying(object) || - test_bit(FSCACHE_IOERROR, &cache->flags)) - cache = NULL; + /* the cache does not exist - create a candidate */ + candidate = fscache_alloc_cache(name); + if (!candidate) + return ERR_PTR(-ENOMEM); - spin_unlock(&cookie->lock); - _leave(" = %s [parent]", cache ? 
cache->tag->name : "NULL"); - return cache; - } + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); - /* the parent is unbacked */ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - /* cookie not an index and is unbacked */ - spin_unlock(&cookie->lock); - _leave(" = NULL [cookie ub,ni]"); - return NULL; + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && name && strcmp(cache->name, name) == 0 && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + if (!cache->name) { + unnamed = cache; + if (!name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + } } - spin_unlock(&cookie->lock); + if (unnamed && is_cache && + fscache_get_cache_maybe(unnamed, fscache_cache_get_acquire)) + goto use_unnamed_cache; - if (!cookie->def->select_cache) - goto no_preference; + if (!name) { + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + } + } - /* ask the netfs for its preference */ - tag = cookie->def->select_cache(cookie->parent->netfs_data, - cookie->netfs_data); - if (!tag) - goto no_preference; + list_add_tail(&candidate->cache_link, &fscache_caches); + trace_fscache_cache(candidate->debug_id, + refcount_read(&candidate->ref), + fscache_cache_new_acquire); + up_write(&fscache_addremove_sem); + return candidate; - if (tag == ERR_PTR(-ENOMEM)) { - _leave(" = NULL [nomem tag]"); - return NULL; - } +got_cache_r: + up_read(&fscache_addremove_sem); + return cache; +use_unnamed_cache: + cache = unnamed; + cache->name = candidate->name; + candidate->name = NULL; +got_cache_w: + up_write(&fscache_addremove_sem); + kfree(candidate->name); + kfree(candidate); + return cache; +} - if (!tag->cache) { - _leave(" = NULL [unbacked tag]"); - return NULL; - } +/** + * fscache_acquire_cache - Acquire a cache-level cookie. + * @name: The name of the cache. + * + * Get a cookie to represent an actual cache. If a name is given and there is + * a nameless cache record available, this will acquire that and set its name, + * directing all the volumes using it to this cache. + * + * The cache will be switched over to the preparing state if not currently in + * use, otherwise -EBUSY will be returned. + */ +struct fscache_cache *fscache_acquire_cache(const char *name) +{ + struct fscache_cache *cache; - if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) - return NULL; + ASSERT(name); + cache = fscache_lookup_cache(name, true); + if (IS_ERR(cache)) + return cache; - _leave(" = %s [specific]", tag->name); - return tag->cache; + if (!fscache_set_cache_state_maybe(cache, + FSCACHE_CACHE_IS_NOT_PRESENT, + FSCACHE_CACHE_IS_PREPARING)) { + pr_warn("Cache tag %s in use\n", name); + fscache_put_cache(cache, fscache_cache_put_cache); + return ERR_PTR(-EBUSY); + } -no_preference: - /* netfs has no preference - just select first cache */ - cache = list_entry(fscache_cache_list.next, - struct fscache_cache, link); - _leave(" = %s [first]", cache->tag->name); return cache; } +EXPORT_SYMBOL(fscache_acquire_cache); /** - * fscache_init_cache - Initialise a cache record - * @cache: The cache record to be initialised - * @ops: The cache operations to be installed in that record - * @idfmt: Format string to define identifier - * @...: sprintf-style arguments + * fscache_put_cache - Release a cache-level cookie. 
+ * @cache: The cache cookie to be released + * @where: An indication of where the release happened * - * Initialise a record of a cache and fill in the name. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. + * Release the caller's reference on a cache-level cookie. The @where + * indication should give information about the circumstances in which the call + * occurs and will be logged through a tracepoint. */ -void fscache_init_cache(struct fscache_cache *cache, - const struct fscache_cache_ops *ops, - const char *idfmt, - ...) +void fscache_put_cache(struct fscache_cache *cache, + enum fscache_cache_trace where) { - va_list va; + unsigned int debug_id = cache->debug_id; + bool zero; + int ref; - memset(cache, 0, sizeof(*cache)); + if (IS_ERR_OR_NULL(cache)) + return; - cache->ops = ops; + zero = __refcount_dec_and_test(&cache->ref, &ref); + trace_fscache_cache(debug_id, ref - 1, where); - va_start(va, idfmt); - vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); - va_end(va); + if (zero) { + down_write(&fscache_addremove_sem); + list_del_init(&cache->cache_link); + up_write(&fscache_addremove_sem); + kfree(cache->name); + kfree(cache); + } +} - INIT_WORK(&cache->op_gc, fscache_operation_gc); - INIT_LIST_HEAD(&cache->link); - INIT_LIST_HEAD(&cache->object_list); - INIT_LIST_HEAD(&cache->op_gc_list); - spin_lock_init(&cache->object_list_lock); - spin_lock_init(&cache->op_gc_list_lock); +/** + * fscache_relinquish_cache - Reset cache state and release cookie + * @cache: The cache cookie to be released + * + * Reset the state of a cache and release the caller's reference on a cache + * cookie. + */ +void fscache_relinquish_cache(struct fscache_cache *cache) +{ + enum fscache_cache_trace where = + (cache->state == FSCACHE_CACHE_IS_PREPARING) ? + fscache_cache_put_prep_failed : + fscache_cache_put_relinquish; + + cache->ops = NULL; + cache->cache_priv = NULL; + smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT); + fscache_put_cache(cache, where); } -EXPORT_SYMBOL(fscache_init_cache); +EXPORT_SYMBOL(fscache_relinquish_cache); /** * fscache_add_cache - Declare a cache as being open for business - * @cache: The record describing the cache - * @ifsdef: The record of the cache object describing the top-level index - * @tagname: The tag describing this cache + * @cache: The cache-level cookie representing the cache + * @ops: Table of cache operations to use + * @cache_priv: Private data for the cache record * * Add a cache to the system, making it available for netfs's to use. * @@ -211,93 +231,97 @@ EXPORT_SYMBOL(fscache_init_cache); * description. 
*/ int fscache_add_cache(struct fscache_cache *cache, - struct fscache_object *ifsdef, - const char *tagname) + const struct fscache_cache_ops *ops, + void *cache_priv) { - struct fscache_cache_tag *tag; - - ASSERTCMP(ifsdef->cookie, ==, &fscache_fsdef_index); - BUG_ON(!cache->ops); - BUG_ON(!ifsdef); + int n_accesses; - cache->flags = 0; - ifsdef->event_mask = - ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); + _enter("{%s,%s}", ops->name, cache->name); - if (!tagname) - tagname = cache->identifier; + BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); - BUG_ON(!tagname[0]); - - _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); - - /* we use the cache tag to uniquely identify caches */ - tag = __fscache_lookup_cache_tag(tagname); - if (IS_ERR(tag)) - goto nomem; - - if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) - goto tag_in_use; - - cache->kobj = kobject_create_and_add(tagname, fscache_root); - if (!cache->kobj) - goto error; - - ifsdef->cache = cache; - cache->fsdef = ifsdef; + /* Get a ref on the cache cookie and keep its n_accesses counter raised + * by 1 to prevent wakeups from transitioning it to 0 until we're + * withdrawing caching services from it. + */ + n_accesses = atomic_inc_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, fscache_access_cache_pin); down_write(&fscache_addremove_sem); - tag->cache = cache; - cache->tag = tag; - - /* add the cache to the list */ - list_add(&cache->link, &fscache_cache_list); - - /* add the cache's netfs definition index object to the cache's - * list */ - spin_lock(&cache->object_list_lock); - list_add_tail(&ifsdef->cache_link, &cache->object_list); - spin_unlock(&cache->object_list_lock); - - /* add the cache's netfs definition index object to the top level index - * cookie as a known backing object */ - spin_lock(&fscache_fsdef_index.lock); - - hlist_add_head(&ifsdef->cookie_link, - &fscache_fsdef_index.backing_objects); - - refcount_inc(&fscache_fsdef_index.ref); + cache->ops = ops; + cache->cache_priv = cache_priv; + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_ACTIVE); - /* done */ - spin_unlock(&fscache_fsdef_index.lock); up_write(&fscache_addremove_sem); - - pr_notice("Cache \"%s\" added (type %s)\n", - cache->tag->name, cache->ops->name); - kobject_uevent(cache->kobj, KOBJ_ADD); - - _leave(" = 0 [%s]", cache->identifier); + pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); + _leave(" = 0 [%s]", cache->name); return 0; +} +EXPORT_SYMBOL(fscache_add_cache); -tag_in_use: - pr_err("Cache tag '%s' already in use\n", tagname); - __fscache_release_cache_tag(tag); - _leave(" = -EXIST"); - return -EEXIST; - -error: - __fscache_release_cache_tag(tag); - _leave(" = -EINVAL"); - return -EINVAL; +/** + * fscache_begin_cache_access - Pin a cache so it can be accessed + * @cache: The cache-level cookie + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing it, returning true if successful. This works as follows: + * + * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), + * then we return false to indicate access was not permitted. + * + * (2) If the cache tests as live, then we increment the n_accesses count and + * then recheck the liveness, ending the access if it ceased to be live.
+ * + * (3) When we end the access, we decrement n_accesses and wake up any + * waiters if it reaches 0. + * + * (4) Whilst the cache is caching, n_accesses is kept artificially + * incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline, the state is changed to prevent new + * accesses, n_accesses is decremented and we wait for n_accesses to + * become 0. + */ +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) +{ + int n_accesses; + + if (!fscache_cache_is_live(cache)) + return false; + + n_accesses = atomic_inc_return(&cache->n_accesses); + smp_mb__after_atomic(); /* Reread live flag after n_accesses */ + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, why); + if (!fscache_cache_is_live(cache)) { + fscache_end_cache_access(cache, fscache_access_unlive); + return false; + } + return true; +} -nomem: - _leave(" = -ENOMEM"); - return -ENOMEM; +/** + * fscache_end_cache_access - Unpin a cache at the end of an access. + * @cache: The cache-level cookie + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache after we've accessed it. The @why indicator is merely + * provided for tracing purposes. + */ +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, why); + if (n_accesses == 0) + wake_up_var(&cache->n_accesses); } -EXPORT_SYMBOL(fscache_add_cache); /** * fscache_io_error - Note a cache I/O error @@ -311,106 +335,94 @@ EXPORT_SYMBOL(fscache_add_cache); */ void fscache_io_error(struct fscache_cache *cache) { - if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) + if (fscache_set_cache_state_maybe(cache, + FSCACHE_CACHE_IS_ACTIVE, + FSCACHE_CACHE_GOT_IOERROR)) pr_err("Cache '%s' stopped due to I/O error\n", - cache->ops->name); + cache->name); } EXPORT_SYMBOL(fscache_io_error); -/* - * request withdrawal of all the objects in a cache - * - all the objects being withdrawn are moved onto the supplied list +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The cache cookie + * + * Begin the process of withdrawing a cache from service. This stops new + * cache-level and volume-level accesses from taking place and waits for + * currently ongoing cache-level accesses to end. */ -static void fscache_withdraw_all_objects(struct fscache_cache *cache, - struct list_head *dying_objects) +void fscache_withdraw_cache(struct fscache_cache *cache) { - struct fscache_object *object; + int n_accesses; - while (!list_empty(&cache->object_list)) { - spin_lock(&cache->object_list_lock); + pr_notice("Withdrawing cache \"%s\" (%u objs)\n", + cache->name, atomic_read(&cache->object_count)); - if (!list_empty(&cache->object_list)) { - object = list_entry(cache->object_list.next, - struct fscache_object, cache_link); - list_move_tail(&object->cache_link, dying_objects); + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_WITHDRAWN); - _debug("withdraw %x", object->cookie->debug_id); + /* Allow wakeups on dec-to-0 */ + n_accesses = atomic_dec_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, fscache_access_cache_unpin); - /* This must be done under object_list_lock to prevent - * a race with fscache_drop_object().
- */ - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); - } - - spin_unlock(&cache->object_list_lock); - cond_resched(); - } + wait_var_event(&cache->n_accesses, + atomic_read(&cache->n_accesses) == 0); } +EXPORT_SYMBOL(fscache_withdraw_cache); -/** - * fscache_withdraw_cache - Withdraw a cache from the active service - * @cache: The record describing the cache - * - * Withdraw a cache from service, unbinding all its cache objects from the - * netfs cookies they're currently representing. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. +#ifdef CONFIG_PROC_FS +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; + +/* + * Generate a list of caches in /proc/fs/fscache/caches */ -void fscache_withdraw_cache(struct fscache_cache *cache) +static int fscache_caches_seq_show(struct seq_file *m, void *v) { - LIST_HEAD(dying_objects); + struct fscache_cache *cache; - _enter(""); + if (v == &fscache_caches) { + seq_puts(m, + "CACHE REF VOLS OBJS ACCES S NAME\n" + "======== ===== ===== ===== ===== = ===============\n" + ); + return 0; + } - pr_notice("Withdrawing cache \"%s\"\n", - cache->tag->name); + cache = list_entry(v, struct fscache_cache, cache_link); + seq_printf(m, + "%08x %5d %5d %5d %5d %c %s\n", + cache->debug_id, + refcount_read(&cache->ref), + atomic_read(&cache->n_volumes), + atomic_read(&cache->object_count), + atomic_read(&cache->n_accesses), + fscache_cache_states[cache->state], + cache->name ?: "-"); + return 0; +} - /* make the cache unavailable for cookie acquisition */ - if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) - BUG(); +static void *fscache_caches_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(fscache_addremove_sem) +{ + down_read(&fscache_addremove_sem); + return seq_list_start_head(&fscache_caches, *_pos); +} - down_write(&fscache_addremove_sem); - list_del_init(&cache->link); - cache->tag->cache = NULL; - up_write(&fscache_addremove_sem); +static void *fscache_caches_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_caches, _pos); +} - /* make sure all pages pinned by operations on behalf of the netfs are - * written to disk */ - fscache_stat(&fscache_n_cop_sync_cache); - cache->ops->sync_cache(cache); - fscache_stat_d(&fscache_n_cop_sync_cache); - - /* dissociate all the netfs pages backed by this cache from the block - * mappings in the cache */ - fscache_stat(&fscache_n_cop_dissociate_pages); - cache->ops->dissociate_pages(cache); - fscache_stat_d(&fscache_n_cop_dissociate_pages); - - /* we now have to destroy all the active objects pertaining to this - * cache - which we do by passing them off to thread pool to be - * disposed of */ - _debug("destroy"); - - fscache_withdraw_all_objects(cache, &dying_objects); - - /* wait for all extant objects to finish their outstanding operations - * and go away */ - _debug("wait for finish"); - wait_event(fscache_cache_cleared_wq, - atomic_read(&cache->object_count) == 0); - _debug("wait for clearance"); - wait_event(fscache_cache_cleared_wq, - list_empty(&cache->object_list)); - _debug("cleared"); - ASSERT(list_empty(&dying_objects)); - - kobject_put(cache->kobj); - - clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); - fscache_release_cache_tag(cache->tag); - cache->tag = NULL; - - _leave(""); +static void fscache_caches_seq_stop(struct seq_file *m, void *v) + __releases(fscache_addremove_sem) +{ + up_read(&fscache_addremove_sem); } -EXPORT_SYMBOL(fscache_withdraw_cache); + +const struct 
seq_operations fscache_caches_seq_ops = { + .start = fscache_caches_seq_start, + .next = fscache_caches_seq_next, + .stop = fscache_caches_seq_stop, + .show = fscache_caches_seq_show, +}; +#endif /* CONFIG_PROC_FS */ diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index cd42be646ed3..9bb1ab5fe5ed 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* netfs cookie management * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/filesystems/caching/netfs-api.rst for more information on @@ -15,70 +15,258 @@ struct kmem_cache *fscache_cookie_jar; -static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); +static void fscache_cookie_lru_timed_out(struct timer_list *timer); +static void fscache_cookie_lru_worker(struct work_struct *work); +static void fscache_cookie_worker(struct work_struct *work); +static void fscache_unhash_cookie(struct fscache_cookie *cookie); +static void fscache_perform_invalidation(struct fscache_cookie *cookie); #define fscache_cookie_hash_shift 15 static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; static LIST_HEAD(fscache_cookies); static DEFINE_RWLOCK(fscache_cookies_lock); - -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, - loff_t object_size); -static int fscache_alloc_object(struct fscache_cache *cache, - struct fscache_cookie *cookie); -static int fscache_attach_object(struct fscache_cookie *cookie, - struct fscache_object *object); - -static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) +static LIST_HEAD(fscache_cookie_lru); +static DEFINE_SPINLOCK(fscache_cookie_lru_lock); +DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); +static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +unsigned int fscache_lru_cookie_timeout = 10 * HZ; + +void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { - struct fscache_object *object; - struct hlist_node *o; const u8 *k; - unsigned loop; - pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n", + pr_err("%c-cookie c=%08x [fl=%lx na=%u nA=%u s=%c]\n", prefix, cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, cookie->flags, - atomic_read(&cookie->n_children), - atomic_read(&cookie->n_active)); - pr_err("%c-cookie d=%p{%s} n=%p\n", + atomic_read(&cookie->n_active), + atomic_read(&cookie->n_accesses), + fscache_cookie_states[cookie->state]); + pr_err("%c-cookie V=%08x [%s]\n", prefix, - cookie->def, - cookie->def ? cookie->def->name : "?", - cookie->netfs_data); - - o = READ_ONCE(cookie->backing_objects.first); - if (o) { - object = hlist_entry(o, struct fscache_object, cookie_link); - pr_err("%c-cookie o=%u\n", prefix, object->debug_id); - } + cookie->volume->debug_id, + cookie->volume->key); - pr_err("%c-key=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
cookie->inline_key : cookie->key; - for (loop = 0; loop < cookie->key_len; loop++) - pr_cont("%02x", k[loop]); - pr_cont("'\n"); + pr_err("%c-key=[%u] '%*phN'\n", prefix, cookie->key_len, cookie->key_len, k); } -void fscache_free_cookie(struct fscache_cookie *cookie) +static void fscache_free_cookie(struct fscache_cookie *cookie) { - if (cookie) { - BUG_ON(!hlist_empty(&cookie->backing_objects)); - write_lock(&fscache_cookies_lock); - list_del(&cookie->proc_link); - write_unlock(&fscache_cookies_lock); - if (cookie->aux_len > sizeof(cookie->inline_aux)) - kfree(cookie->aux); - if (cookie->key_len > sizeof(cookie->inline_key)) - kfree(cookie->key); - kmem_cache_free(fscache_cookie_jar, cookie); + if (WARN_ON_ONCE(!list_empty(&cookie->commit_link))) { + spin_lock(&fscache_cookie_lru_lock); + list_del_init(&cookie->commit_link); + spin_unlock(&fscache_cookie_lru_lock); + fscache_stat_d(&fscache_n_cookies_lru); + fscache_stat(&fscache_n_cookies_lru_removed); + } + + if (WARN_ON_ONCE(test_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags))) { + fscache_print_cookie(cookie, 'F'); + return; } + + write_lock(&fscache_cookies_lock); + list_del(&cookie->proc_link); + write_unlock(&fscache_cookies_lock); + if (cookie->aux_len > sizeof(cookie->inline_aux)) + kfree(cookie->aux); + if (cookie->key_len > sizeof(cookie->inline_key)) + kfree(cookie->key); + fscache_stat_d(&fscache_n_cookies); + kmem_cache_free(fscache_cookie_jar, cookie); +} + +static void __fscache_queue_cookie(struct fscache_cookie *cookie) +{ + if (!queue_work(fscache_wq, &cookie->work)) + fscache_put_cookie(cookie, fscache_cookie_put_over_queued); +} + +static void fscache_queue_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + fscache_get_cookie(cookie, where); + __fscache_queue_cookie(cookie); } /* + * Initialise the access gate on a cookie by setting a flag to prevent the + * state machine from being queued when the access counter transitions to 0. + * We're only interested in this when we withdraw caching services from the + * cookie. + */ +static void fscache_init_access_gate(struct fscache_cookie *cookie) +{ + int n_accesses; + + n_accesses = atomic_read(&cookie->n_accesses); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, fscache_access_cache_pin); + set_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags); +} + +/** + * fscache_end_cookie_access - Unpin a cache at the end of an access. + * @cookie: A data file cookie + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache cookie after we've accessed it and bring a deferred + * relinquishment or withdrawal state into effect. + * + * The @why indicator is provided for tracing purposes. + */ +void fscache_end_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&cookie->n_accesses); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, why); + if (n_accesses == 0 && + !test_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags)) + fscache_queue_cookie(cookie, fscache_cookie_get_end_access); +} +EXPORT_SYMBOL(fscache_end_cookie_access); + +/* + * Pin the cache behind a cookie so that we can access it. + */ +static void __fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + n_accesses = atomic_inc_return(&cookie->n_accesses); + smp_mb__after_atomic(); /* (Future) read state after is-caching. 
+ * Reread n_accesses after is-caching + */ + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, why); +} + +/** + * fscache_begin_cookie_access - Pin a cache so data can be accessed + * @cookie: A data file cookie + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing data, returning true if successful. This works as follows: + * + * (1) If the cookie is not being cached (ie. FSCACHE_COOKIE_IS_CACHING is not + * set), we return false to indicate access was not permitted. + * + * (2) If the cookie is being cached, we increment its n_accesses count and + * then recheck the IS_CACHING flag, ending the access if it got cleared. + * + * (3) When we end the access, we decrement the cookie's n_accesses and wake + * up any waiters if it reaches 0. + * + * (4) Whilst the cookie is actively being cached, its n_accesses is kept + * artificially incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline or if the cookie is culled, the flag is + * cleared to prevent new accesses, the cookie's n_accesses is decremented + * and we wait for it to become 0. + * + * The @why indicator is merely provided for tracing purposes. + */ +bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) + return false; + __fscache_begin_cookie_access(cookie, why); + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags) || + !fscache_cache_is_live(cookie->volume->cache)) { + fscache_end_cookie_access(cookie, fscache_access_unlive); + return false; + } + return true; +} + +static inline void wake_up_cookie_state(struct fscache_cookie *cookie) +{ + /* Use a barrier to ensure that waiters see the state variable + * change, as spin_unlock doesn't guarantee a barrier. + * + * See comments over wake_up_bit() and waitqueue_active(). + */ + smp_mb(); + wake_up_var(&cookie->state); +} + +/* + * Change the state a cookie is at and wake up anyone waiting for that. Impose + * an ordering between the stuff stored in the cookie and the state member. + * Paired with fscache_cookie_state(). + */ +static void __fscache_set_cookie_state(struct fscache_cookie *cookie, + enum fscache_cookie_state state) +{ + smp_store_release(&cookie->state, state); +} + +static void fscache_set_cookie_state(struct fscache_cookie *cookie, + enum fscache_cookie_state state) +{ + spin_lock(&cookie->lock); + __fscache_set_cookie_state(cookie, state); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); +} + +/** + * fscache_cookie_lookup_negative - Note negative lookup + * @cookie: The cookie that was being looked up + * + * Note that some part of the metadata path in the cache doesn't exist and so + * we can release any waiting readers in the certain knowledge that there's + * nothing for them to actually read. + * + * This function uses no locking and must only be called from the state machine. + */ +void fscache_cookie_lookup_negative(struct fscache_cookie *cookie) +{ + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_CREATING); +} +EXPORT_SYMBOL(fscache_cookie_lookup_negative); + +/** + * fscache_resume_after_invalidation - Allow I/O to resume after invalidation + * @cookie: The cookie that was invalidated + * + * Tell fscache that invalidation is sufficiently complete that I/O can be + * allowed again.
+ */ +void fscache_resume_after_invalidation(struct fscache_cookie *cookie) +{ + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); +} +EXPORT_SYMBOL(fscache_resume_after_invalidation); + +/** + * fscache_caching_failed - Report that a failure stopped caching on a cookie + * @cookie: The cookie that was affected + * + * Tell fscache that caching on a cookie needs to be stopped due to some sort + * of failure. + * + * This function uses no locking and must only be called from the state machine. + */ +void fscache_caching_failed(struct fscache_cookie *cookie) +{ + clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED); +} +EXPORT_SYMBOL(fscache_caching_failed); + +/* * Set the index key in a cookie. The cookie struct has space for a 16-byte * key plus length and hash, but if that's not big enough, it's instead a * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then @@ -87,38 +275,35 @@ void fscache_free_cookie(struct fscache_cookie *cookie) static int fscache_set_key(struct fscache_cookie *cookie, const void *index_key, size_t index_key_len) { - u32 *buf; - int bufs; + void *buf; + size_t buf_size; - bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); + buf_size = round_up(index_key_len, sizeof(__le32)); if (index_key_len > sizeof(cookie->inline_key)) { - buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL); + buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; cookie->key = buf; } else { - buf = (u32 *)cookie->inline_key; + buf = cookie->inline_key; } memcpy(buf, index_key, index_key_len); - cookie->key_hash = fscache_hash(0, buf, bufs); + cookie->key_hash = fscache_hash(cookie->volume->key_hash, + buf, buf_size); return 0; } -static long fscache_compare_cookie(const struct fscache_cookie *a, - const struct fscache_cookie *b) +static bool fscache_cookie_same(const struct fscache_cookie *a, + const struct fscache_cookie *b) { const void *ka, *kb; - if (a->key_hash != b->key_hash) - return (long)a->key_hash - (long)b->key_hash; - if (a->parent != b->parent) - return (long)a->parent - (long)b->parent; - if (a->key_len != b->key_len) - return (long)a->key_len - (long)b->key_len; - if (a->type != b->type) - return (long)a->type - (long)b->type; + if (a->key_hash != b->key_hash || + a->volume != b->volume || + a->key_len != b->key_len) + return false; if (a->key_len <= sizeof(a->inline_key)) { ka = &a->inline_key; @@ -127,7 +312,7 @@ static long fscache_compare_cookie(const struct fscache_cookie *a, ka = a->key; kb = b->key; } - return memcmp(ka, kb, a->key_len); + return memcmp(ka, kb, a->key_len) == 0; } static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); @@ -135,12 +320,11 @@ static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); /* * Allocate a cookie. 
*/ -struct fscache_cookie *fscache_alloc_cookie( - struct fscache_cookie *parent, - const struct fscache_cookie_def *def, +static struct fscache_cookie *fscache_alloc_cookie( + struct fscache_volume *volume, + u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, - void *netfs_data, loff_t object_size) { struct fscache_cookie *cookie; @@ -149,9 +333,15 @@ struct fscache_cookie *fscache_alloc_cookie( cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); if (!cookie) return NULL; + fscache_stat(&fscache_n_cookies); - cookie->key_len = index_key_len; - cookie->aux_len = aux_data_len; + cookie->volume = volume; + cookie->advice = advice; + cookie->key_len = index_key_len; + cookie->aux_len = aux_data_len; + cookie->object_size = object_size; + if (object_size == 0) + __set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); if (fscache_set_key(cookie, index_key, index_key_len) < 0) goto nomem; @@ -165,30 +355,16 @@ struct fscache_cookie *fscache_alloc_cookie( } refcount_set(&cookie->ref, 1); - atomic_set(&cookie->n_children, 0); cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id); - - /* We keep the active count elevated until relinquishment to prevent an - * attempt to wake up every time the object operations queue quiesces. - */ - atomic_set(&cookie->n_active, 1); - - cookie->def = def; - cookie->parent = parent; - cookie->netfs_data = netfs_data; - cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); - cookie->type = def->type; spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); - - /* radix tree insertion won't use the preallocation pool unless it's - * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_LIST_HEAD(&cookie->commit_link); + INIT_WORK(&cookie->work, fscache_cookie_worker); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); write_lock(&fscache_cookies_lock); list_add_tail(&cookie->proc_link, &fscache_cookies); write_unlock(&fscache_cookies_lock); + fscache_see_cookie(cookie, fscache_cookie_new_acquire); return cookie; nomem: @@ -196,13 +372,28 @@ nomem: return NULL; } +static void fscache_wait_on_collision(struct fscache_cookie *candidate, + struct fscache_cookie *wait_for) +{ + enum fscache_cookie_state *statep = &wait_for->state; + + wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED, + 20 * HZ); + if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) { + pr_notice("Potential collision c=%08x old: c=%08x", + candidate->debug_id, wait_for->debug_id); + wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED); + } +} + /* * Attempt to insert the new cookie into the hash. If there's a collision, we - * return the old cookie if it's not in use and an error otherwise. + * wait for the old cookie to complete if it's being relinquished, and fail + * otherwise.
*/ -struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) +static bool fscache_hash_cookie(struct fscache_cookie *candidate) { - struct fscache_cookie *cursor; + struct fscache_cookie *cursor, *wait_for = NULL; struct hlist_bl_head *h; struct hlist_bl_node *p; unsigned int bucket; @@ -212,64 +403,53 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) hlist_bl_lock(h); hlist_bl_for_each_entry(cursor, p, h, hash_link) { - if (fscache_compare_cookie(candidate, cursor) == 0) - goto collision; + if (fscache_cookie_same(candidate, cursor)) { + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cursor->flags)) + goto collision; + wait_for = fscache_get_cookie(cursor, + fscache_cookie_get_hash_collision); + break; + } } - __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags); - fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent); - atomic_inc(&candidate->parent->n_children); + fscache_get_volume(candidate->volume, fscache_volume_get_cookie); + atomic_inc(&candidate->volume->n_cookies); hlist_bl_add_head(&candidate->hash_link, h); + set_bit(FSCACHE_COOKIE_IS_HASHED, &candidate->flags); hlist_bl_unlock(h); - return candidate; -collision: - if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { - trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), - fscache_cookie_collision); - pr_err("Duplicate cookie detected\n"); - fscache_print_cookie(cursor, 'O'); - fscache_print_cookie(candidate, 'N'); - hlist_bl_unlock(h); - return NULL; + if (wait_for) { + fscache_wait_on_collision(candidate, wait_for); + fscache_put_cookie(wait_for, fscache_cookie_put_hash_collision); } + return true; - fscache_cookie_get(cursor, fscache_cookie_get_reacquire); +collision: + trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), + fscache_cookie_collision); + pr_err("Duplicate cookie detected\n"); + fscache_print_cookie(cursor, 'O'); + fscache_print_cookie(candidate, 'N'); hlist_bl_unlock(h); - return cursor; + return false; } /* - * request a cookie to represent an object (index, datafile, xattr, etc) - * - parent specifies the parent object - * - the top level index cookie for each netfs is stored in the fscache_netfs - * struct upon registration - * - def points to the definition - * - the netfs_data will be passed to the functions pointed to in *def - * - all attached caches will be searched to see if they contain this object - * - index objects aren't stored on disk until there's a dependent file that - * needs storing - * - other objects are stored in a selected cache immediately, and all the - * indices forming the path to it are instantiated if necessary - * - we never let on to the netfs about errors - * - we may set a negative cookie pointer, but that's okay + * Request a cookie to represent a data storage object within a volume. + * + * We never let on to the netfs about errors. We may set a negative cookie + * pointer, but that's okay */ struct fscache_cookie *__fscache_acquire_cookie( - struct fscache_cookie *parent, - const struct fscache_cookie_def *def, + struct fscache_volume *volume, + u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, - void *netfs_data, - loff_t object_size, - bool enable) + loff_t object_size) { - struct fscache_cookie *candidate, *cookie; - - BUG_ON(!def); + struct fscache_cookie *cookie; - _enter("{%s},{%s},%p,%u", - parent ? 
(char *) parent->def->name : "<no-parent>", - def->name, netfs_data, enable); + _enter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -280,563 +460,440 @@ struct fscache_cookie *__fscache_acquire_cookie( fscache_stat(&fscache_n_acquires); - /* if there's no parent cookie, then we don't create one here either */ - if (!parent) { - fscache_stat(&fscache_n_acquires_null); - _leave(" [no parent]"); - return NULL; - } - - /* validate the definition */ - BUG_ON(!def->name[0]); - - BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && - parent->type != FSCACHE_COOKIE_TYPE_INDEX); - - candidate = fscache_alloc_cookie(parent, def, - index_key, index_key_len, - aux_data, aux_data_len, - netfs_data, object_size); - if (!candidate) { + cookie = fscache_alloc_cookie(volume, advice, + index_key, index_key_len, + aux_data, aux_data_len, + object_size); + if (!cookie) { fscache_stat(&fscache_n_acquires_oom); - _leave(" [ENOMEM]"); return NULL; } - cookie = fscache_hash_cookie(candidate); - if (!cookie) { - trace_fscache_cookie(candidate->debug_id, 1, - fscache_cookie_discard); - goto out; - } - - if (cookie == candidate) - candidate = NULL; - - switch (cookie->type) { - case FSCACHE_COOKIE_TYPE_INDEX: - fscache_stat(&fscache_n_cookie_index); - break; - case FSCACHE_COOKIE_TYPE_DATAFILE: - fscache_stat(&fscache_n_cookie_data); - break; - default: - fscache_stat(&fscache_n_cookie_special); - break; + if (!fscache_hash_cookie(cookie)) { + fscache_see_cookie(cookie, fscache_cookie_discard); + fscache_free_cookie(cookie); + return NULL; } trace_fscache_acquire(cookie); - - if (enable) { - /* if the object is an index then we need do nothing more here - * - we create indices on disk when we need them as an index - * may exist in multiple caches */ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } else { - atomic_dec(&parent->n_children); - fscache_cookie_put(cookie, - fscache_cookie_put_acquire_nobufs); - fscache_stat(&fscache_n_acquires_nobufs); - _leave(" = NULL"); - return NULL; - } - } else { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } - } - fscache_stat(&fscache_n_acquires_ok); - -out: - fscache_free_cookie(candidate); + _leave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); /* - * Enable a cookie to permit it to accept new operations. + * Prepare a cache object to be written to. 
*/ -void __fscache_enable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - loff_t object_size, - bool (*can_enable)(void *data), - void *data) +static void fscache_prepare_to_write(struct fscache_cookie *cookie) { - _enter("%x", cookie->debug_id); - - trace_fscache_enable(cookie); - - wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - TASK_UNINTERRUPTIBLE); - - fscache_update_aux(cookie, aux_data); - - if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) - goto out_unlock; - - if (can_enable && !can_enable(data)) { - /* The netfs decided it didn't want to enable after all */ - } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - /* Wait for outstanding disablement to complete */ - __fscache_wait_on_invalidate(cookie); - - if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } else { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } - -out_unlock: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + cookie->volume->cache->ops->prepare_to_write(cookie); } -EXPORT_SYMBOL(__fscache_enable_cookie); /* - * acquire a non-index cookie - * - this must make sure the index chain is instantiated and instantiate the - * object representation too + * Look up a cookie in the cache. */ -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, - loff_t object_size) +static void fscache_perform_lookup(struct fscache_cookie *cookie) { - struct fscache_object *object; - struct fscache_cache *cache; - int ret; + enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; + bool need_withdraw = false; _enter(""); - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - /* now we need to see whether the backing objects for this cookie yet - * exist, if not there'll be nothing to search */ - down_read(&fscache_addremove_sem); - - if (list_empty(&fscache_cache_list)) { - up_read(&fscache_addremove_sem); - _leave(" = 0 [no caches]"); - return 0; - } - - /* select a cache in which to store the object */ - cache = fscache_select_cache_for_object(cookie->parent); - if (!cache) { - up_read(&fscache_addremove_sem); - fscache_stat(&fscache_n_acquires_no_cache); - _leave(" = -ENOMEDIUM [no cache]"); - return -ENOMEDIUM; - } - - _debug("cache %s", cache->tag->name); - - set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - - /* ask the cache to allocate objects for this cookie and its parent - * chain */ - ret = fscache_alloc_object(cache, cookie); - if (ret < 0) { - up_read(&fscache_addremove_sem); - _leave(" = %d", ret); - return ret; - } - - spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) { - spin_unlock(&cookie->lock); - goto unavailable; + if (!cookie->volume->cache_priv) { + fscache_create_volume(cookie->volume, true); + if (!cookie->volume->cache_priv) { + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + goto out; + } } - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - fscache_set_store_limit(object, object_size); - - /* initiate the process of looking up all the objects in the chain - * (done by fscache_initialise_object()) */ - fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); - - spin_unlock(&cookie->lock); - - /* we may be required to wait for lookup to complete at this point */ - if (!fscache_defer_lookup) { - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_UNINTERRUPTIBLE); - if 
(test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) - goto unavailable; + if (!cookie->volume->cache->ops->lookup_cookie(cookie)) { + if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + need_withdraw = true; + _leave(" [fail]"); + goto out; } - up_read(&fscache_addremove_sem); - _leave(" = 0 [deferred]"); - return 0; + fscache_see_cookie(cookie, fscache_cookie_see_active); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + trace = fscache_access_lookup_cookie_end; -unavailable: - up_read(&fscache_addremove_sem); - _leave(" = -ENOBUFS"); - return -ENOBUFS; +out: + fscache_end_cookie_access(cookie, trace); + if (need_withdraw) + fscache_withdraw_cookie(cookie); + fscache_end_volume_access(cookie->volume, cookie, trace); } /* - * recursively allocate cache object records for a cookie/cache combination - * - caller must be holding the addremove sem + * Begin the process of looking up a cookie. We offload the actual process to + * a worker thread. */ -static int fscache_alloc_object(struct fscache_cache *cache, - struct fscache_cookie *cookie) +static bool fscache_begin_lookup(struct fscache_cookie *cookie, bool will_modify) { - struct fscache_object *object; - int ret; - - _enter("%s,%x{%s}", cache->tag->name, cookie->debug_id, cookie->def->name); - - spin_lock(&cookie->lock); - hlist_for_each_entry(object, &cookie->backing_objects, - cookie_link) { - if (object->cache == cache) - goto object_already_extant; + if (will_modify) { + set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags); + set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); } - spin_unlock(&cookie->lock); - - /* ask the cache to allocate an object (we may end up with duplicate - * objects at this stage, but we sort that out later) */ - fscache_stat(&fscache_n_cop_alloc_object); - object = cache->ops->alloc_object(cache, cookie); - fscache_stat_d(&fscache_n_cop_alloc_object); - if (IS_ERR(object)) { - fscache_stat(&fscache_n_object_no_alloc); - ret = PTR_ERR(object); - goto error; - } - - ASSERTCMP(object->cookie, ==, cookie); - fscache_stat(&fscache_n_object_alloc); - - object->debug_id = atomic_inc_return(&fscache_object_debug_id); - - _debug("ALLOC OBJ%x: %s {%lx}", - object->debug_id, cookie->def->name, object->events); - - ret = fscache_alloc_object(cache, cookie->parent); - if (ret < 0) - goto error_put; - - /* only attach if we managed to allocate all we needed, otherwise - * discard the object we just allocated and instead use the one - * attached to the cookie */ - if (fscache_attach_object(cookie, object) < 0) { - fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object, fscache_obj_put_attach_fail); - fscache_stat_d(&fscache_n_cop_put_object); - } - - _leave(" = 0"); - return 0; - -object_already_extant: - ret = -ENOBUFS; - if (fscache_object_is_dying(object) || - fscache_cache_is_broken(object)) { - spin_unlock(&cookie->lock); - goto error; - } - spin_unlock(&cookie->lock); - _leave(" = 0 [found]"); - return 0; - -error_put: - fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object, fscache_obj_put_alloc_fail); - fscache_stat_d(&fscache_n_cop_put_object); -error: - _leave(" = %d", ret); - return ret; + if (!fscache_begin_volume_access(cookie->volume, cookie, + fscache_access_lookup_cookie)) + return false; + + __fscache_begin_cookie_access(cookie, fscache_access_lookup_cookie); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LOOKING_UP); + set_bit(FSCACHE_COOKIE_IS_CACHING, 
&cookie->flags); + set_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags); + return true; } /* - * attach a cache object to a cookie + * Start using the cookie for I/O. This prevents the backing object from being + * reaped by VM pressure. */ -static int fscache_attach_object(struct fscache_cookie *cookie, - struct fscache_object *object) +void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) { - struct fscache_object *p; - struct fscache_cache *cache = object->cache; - int ret; + enum fscache_cookie_state state; + bool queue = false; + int n_active; - _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + _enter("c=%08x", cookie->debug_id); - ASSERTCMP(object->cookie, ==, cookie); + if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Trying to use relinquished cookie\n")) + return; spin_lock(&cookie->lock); - /* there may be multiple initial creations of this object, but we only - * want one */ - ret = -EEXIST; - hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { - if (p->cache == object->cache) { - if (fscache_object_is_dying(p)) - ret = -ENOBUFS; - goto cant_attach_object; - } - } + n_active = atomic_inc_return(&cookie->n_active); + trace_fscache_active(cookie->debug_id, refcount_read(&cookie->ref), + n_active, atomic_read(&cookie->n_accesses), + will_modify ? + fscache_active_use_modify : fscache_active_use); + +again: + state = fscache_cookie_state(cookie); + switch (state) { + case FSCACHE_COOKIE_STATE_QUIESCENT: + queue = fscache_begin_lookup(cookie, will_modify); + break; - /* pin the parent object */ - spin_lock_nested(&cookie->parent->lock, 1); - hlist_for_each_entry(p, &cookie->parent->backing_objects, - cookie_link) { - if (p->cache == object->cache) { - if (fscache_object_is_dying(p)) { - ret = -ENOBUFS; - spin_unlock(&cookie->parent->lock); - goto cant_attach_object; - } - object->parent = p; - spin_lock(&p->lock); - p->n_children++; - spin_unlock(&p->lock); - break; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_CREATING: + if (will_modify) + set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags); + break; + case FSCACHE_COOKIE_STATE_ACTIVE: + case FSCACHE_COOKIE_STATE_INVALIDATING: + if (will_modify && + !test_and_set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags)) { + set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); + queue = true; } - } - spin_unlock(&cookie->parent->lock); - - /* attach to the cache's object list */ - if (list_empty(&object->cache_link)) { - spin_lock(&cache->object_list_lock); - list_add(&object->cache_link, &cache->object_list); - spin_unlock(&cache->object_list_lock); - } - - /* Attach to the cookie. The object already has a ref on it. */ - hlist_add_head(&object->cookie_link, &cookie->backing_objects); - ret = 0; - -cant_attach_object: - spin_unlock(&cookie->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * Invalidate an object. Callable with spinlocks held. - */ -void __fscache_invalidate(struct fscache_cookie *cookie) -{ - struct fscache_object *object; - - _enter("{%s}", cookie->def->name); - - fscache_stat(&fscache_n_invalidates); + break; - /* Only permit invalidation of data files. Invalidating an index will - * require the caller to release all its attachments to the tree rooted - * there, and if it's doing that, it may as well just retire the - * cookie. 
- */ - ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + case FSCACHE_COOKIE_STATE_FAILED: + case FSCACHE_COOKIE_STATE_WITHDRAWING: + break; - /* If there's an object, we tell the object state machine to handle the - * invalidation on our behalf, otherwise there's nothing to do. - */ - if (!hlist_empty(&cookie->backing_objects)) { + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + spin_unlock(&cookie->lock); + wait_var_event(&cookie->state, + fscache_cookie_state(cookie) != + FSCACHE_COOKIE_STATE_LRU_DISCARDING); spin_lock(&cookie->lock); + goto again; - if (fscache_cookie_enabled(cookie) && - !hlist_empty(&cookie->backing_objects) && - !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, - &cookie->flags)) { - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, - cookie_link); - if (fscache_object_is_live(object)) - fscache_raise_event( - object, FSCACHE_OBJECT_EV_INVALIDATE); - } - - spin_unlock(&cookie->lock); + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + WARN(1, "Can't use cookie in state %u\n", state); + break; } + spin_unlock(&cookie->lock); + if (queue) + fscache_queue_cookie(cookie, fscache_cookie_get_use_work); _leave(""); } -EXPORT_SYMBOL(__fscache_invalidate); +EXPORT_SYMBOL(__fscache_use_cookie); -/* - * Wait for object invalidation to complete. - */ -void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) +static void fscache_unuse_cookie_locked(struct fscache_cookie *cookie) { - _enter("%x", cookie->debug_id); + clear_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags); + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) + return; - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - TASK_UNINTERRUPTIBLE); + cookie->unused_at = jiffies; + spin_lock(&fscache_cookie_lru_lock); + if (list_empty(&cookie->commit_link)) { + fscache_get_cookie(cookie, fscache_cookie_get_lru); + fscache_stat(&fscache_n_cookies_lru); + } + list_move_tail(&cookie->commit_link, &fscache_cookie_lru); - _leave(""); + spin_unlock(&fscache_cookie_lru_lock); + timer_reduce(&fscache_cookie_lru_timer, + jiffies + fscache_lru_cookie_timeout); } -EXPORT_SYMBOL(__fscache_wait_on_invalidate); /* - * update the index entries backing a cookie + * Stop using the cookie for I/O. */ -void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) +void __fscache_unuse_cookie(struct fscache_cookie *cookie, + const void *aux_data, const loff_t *object_size) { - struct fscache_object *object; - - fscache_stat(&fscache_n_updates); - - if (!cookie) { - fscache_stat(&fscache_n_updates_null); - _leave(" [no cookie]"); + unsigned int debug_id = cookie->debug_id; + unsigned int r = refcount_read(&cookie->ref); + unsigned int a = atomic_read(&cookie->n_accesses); + unsigned int c; + + if (aux_data || object_size) + __fscache_update_cookie(cookie, aux_data, object_size); + + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + c = atomic_fetch_add_unless(&cookie->n_active, -1, 1); + if (c != 1) { + trace_fscache_active(debug_id, r, c - 1, a, fscache_active_unuse); return; } - _enter("{%s}", cookie->def->name); - spin_lock(&cookie->lock); - - fscache_update_aux(cookie, aux_data); - - if (fscache_cookie_enabled(cookie)) { - /* update the index entry on disk in each cache backing this - * cookie. 
- */ - hlist_for_each_entry(object, - &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); - } - } - + r = refcount_read(&cookie->ref); + a = atomic_read(&cookie->n_accesses); + c = atomic_dec_return(&cookie->n_active); + trace_fscache_active(debug_id, r, c, a, fscache_active_unuse); + if (c == 0) + fscache_unuse_cookie_locked(cookie); spin_unlock(&cookie->lock); - _leave(""); } -EXPORT_SYMBOL(__fscache_update_cookie); +EXPORT_SYMBOL(__fscache_unuse_cookie); /* - * Disable a cookie to stop it from accepting new requests from the netfs. + * Perform work upon the cookie, such as committing its cache state, + * relinquishing it or withdrawing the backing cache. We're protected from the + * cache going away under us as object withdrawal must come through this + * non-reentrant work item. */ -void __fscache_disable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool invalidate) +static void fscache_cookie_state_machine(struct fscache_cookie *cookie) { - struct fscache_object *object; - bool awaken = false; + enum fscache_cookie_state state; + bool wake = false; - _enter("%x,%u", cookie->debug_id, invalidate); + _enter("c=%x", cookie->debug_id); - trace_fscache_disable(cookie); - - ASSERTCMP(atomic_read(&cookie->n_active), >, 0); - - if (atomic_read(&cookie->n_children) != 0) { - pr_err("Cookie '%s' still has children\n", - cookie->def->name); - BUG(); - } +again: + spin_lock(&cookie->lock); +again_locked: + state = cookie->state; + switch (state) { + case FSCACHE_COOKIE_STATE_QUIESCENT: + /* A cookie in the QUIESCENT state is moved to the LOOKING_UP + * state by fscache_use_cookie(). + */ - wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - TASK_UNINTERRUPTIBLE); + if (atomic_read(&cookie->n_accesses) == 0 && + test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_RELINQUISHING); + wake = true; + goto again_locked; + } + break; - fscache_update_aux(cookie, aux_data); + case FSCACHE_COOKIE_STATE_LOOKING_UP: + spin_unlock(&cookie->lock); + fscache_init_access_gate(cookie); + fscache_perform_lookup(cookie); + goto again; - if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) - goto out_unlock_enable; + case FSCACHE_COOKIE_STATE_INVALIDATING: + spin_unlock(&cookie->lock); + fscache_perform_invalidation(cookie); + goto again; + + case FSCACHE_COOKIE_STATE_ACTIVE: + if (test_and_clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags)) { + spin_unlock(&cookie->lock); + fscache_prepare_to_write(cookie); + spin_lock(&cookie->lock); + } + if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_LRU_DISCARDING); + wake = true; + goto again_locked; + } + fallthrough; - /* If the cookie is being invalidated, wait for that to complete first - * so that we can reuse the flag. 
- */ - __fscache_wait_on_invalidate(cookie); + case FSCACHE_COOKIE_STATE_FAILED: + if (atomic_read(&cookie->n_accesses) != 0) + break; + if (test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_RELINQUISHING); + wake = true; + goto again_locked; + } + if (test_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_WITHDRAWING); + wake = true; + goto again_locked; + } + break; - /* Dispose of the backing objects */ - set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + case FSCACHE_COOKIE_STATE_WITHDRAWING: + if (cookie->cache_priv) { + spin_unlock(&cookie->lock); + cookie->volume->cache->ops->withdraw_cookie(cookie); + spin_lock(&cookie->lock); + } - spin_lock(&cookie->lock); - if (!hlist_empty(&cookie->backing_objects)) { - hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { - if (invalidate) - set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + switch (state) { + case FSCACHE_COOKIE_STATE_RELINQUISHING: + fscache_see_cookie(cookie, fscache_cookie_see_relinquish); + fscache_unhash_cookie(cookie); + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_DROPPED); + wake = true; + goto out; + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard); + break; + case FSCACHE_COOKIE_STATE_WITHDRAWING: + fscache_see_cookie(cookie, fscache_cookie_see_withdraw); + break; + default: + BUG(); } - } else { - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - awaken = true; - } - spin_unlock(&cookie->lock); - if (awaken) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - /* Wait for cessation of activity requiring access to the netfs (when - * n_active reaches 0). This makes sure outstanding reads and writes - * have completed. - */ - if (!atomic_dec_and_test(&cookie->n_active)) { - wait_var_event(&cookie->n_active, - !atomic_read(&cookie->n_active)); - } + clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + wake = true; + goto again_locked; - /* Make sure any pending writes are cancelled. 
*/ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) - fscache_invalidate_writes(cookie); + case FSCACHE_COOKIE_STATE_DROPPED: + break; - /* Reset the cookie state if it wasn't relinquished */ - if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { - atomic_inc(&cookie->n_active); - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + default: + WARN_ONCE(1, "Cookie %x in unexpected state %u\n", + cookie->debug_id, state); + break; } -out_unlock_enable: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +out: + spin_unlock(&cookie->lock); + if (wake) + wake_up_cookie_state(cookie); _leave(""); } -EXPORT_SYMBOL(__fscache_disable_cookie); + +static void fscache_cookie_worker(struct work_struct *work) +{ + struct fscache_cookie *cookie = container_of(work, struct fscache_cookie, work); + + fscache_see_cookie(cookie, fscache_cookie_see_work); + fscache_cookie_state_machine(cookie); + fscache_put_cookie(cookie, fscache_cookie_put_work); +} /* - * release a cookie back to the cache - * - the object will be marked as recyclable on disk if retire is true - * - all dependents of this cookie must have already been unregistered - * (indices/files/pages) + * Wait for the object to become inactive. The cookie's work item will be + * scheduled when someone transitions n_accesses to 0 - but if someone's + * already done that, schedule it anyway. */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool retire) +static void __fscache_withdraw_cookie(struct fscache_cookie *cookie) { - fscache_stat(&fscache_n_relinquishes); - if (retire) - fscache_stat(&fscache_n_relinquishes_retire); + int n_accesses; + bool unpinned; + + unpinned = test_and_clear_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags); + + /* Need to read the access count after unpinning */ + n_accesses = atomic_read(&cookie->n_accesses); + if (unpinned) + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, fscache_access_cache_unpin); + if (n_accesses == 0) + fscache_queue_cookie(cookie, fscache_cookie_get_end_access); +} - if (!cookie) { - fscache_stat(&fscache_n_relinquishes_null); - _leave(" [no cookie]"); - return; - } +static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) +{ + fscache_see_cookie(cookie, fscache_cookie_see_lru_do_one); - _enter("%x{%s,%d},%d", - cookie->debug_id, cookie->def->name, - atomic_read(&cookie->n_active), retire); + spin_lock(&cookie->lock); + if (cookie->state != FSCACHE_COOKIE_STATE_ACTIVE || + time_before(jiffies, cookie->unused_at + fscache_lru_cookie_timeout) || + atomic_read(&cookie->n_active) > 0) { + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_cookies_lru_removed); + } else { + set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_cookies_lru_expired); + _debug("lru c=%x", cookie->debug_id); + __fscache_withdraw_cookie(cookie); + } - trace_fscache_relinquish(cookie, retire); + fscache_put_cookie(cookie, fscache_cookie_put_lru); +} - /* No further netfs-accessing operations on this cookie permitted */ - if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) - BUG(); +static void fscache_cookie_lru_worker(struct work_struct *work) +{ + struct fscache_cookie *cookie; + unsigned long unused_at; - __fscache_disable_cookie(cookie, aux_data, retire); + spin_lock(&fscache_cookie_lru_lock); - /* Clear pointers back to the netfs */ - cookie->netfs_data = NULL; - 
cookie->def = NULL; - BUG_ON(!radix_tree_empty(&cookie->stores)); + while (!list_empty(&fscache_cookie_lru)) { + cookie = list_first_entry(&fscache_cookie_lru, + struct fscache_cookie, commit_link); + unused_at = cookie->unused_at + fscache_lru_cookie_timeout; + if (time_before(jiffies, unused_at)) { + timer_reduce(&fscache_cookie_lru_timer, unused_at); + break; + } - if (cookie->parent) { - ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0); - ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); - atomic_dec(&cookie->parent->n_children); + list_del_init(&cookie->commit_link); + fscache_stat_d(&fscache_n_cookies_lru); + spin_unlock(&fscache_cookie_lru_lock); + fscache_cookie_lru_do_one(cookie); + spin_lock(&fscache_cookie_lru_lock); } - /* Dispose of the netfs's link to the cookie */ - fscache_cookie_put(cookie, fscache_cookie_put_relinquish); + spin_unlock(&fscache_cookie_lru_lock); +} - _leave(""); +static void fscache_cookie_lru_timed_out(struct timer_list *timer) +{ + queue_work(fscache_wq, &fscache_cookie_lru_work); +} + +static void fscache_cookie_drop_from_lru(struct fscache_cookie *cookie) +{ + bool need_put = false; + + if (!list_empty(&cookie->commit_link)) { + spin_lock(&fscache_cookie_lru_lock); + if (!list_empty(&cookie->commit_link)) { + list_del_init(&cookie->commit_link); + fscache_stat_d(&fscache_n_cookies_lru); + fscache_stat(&fscache_n_cookies_lru_dropped); + need_put = true; + } + spin_unlock(&fscache_cookie_lru_lock); + if (need_put) + fscache_put_cookie(cookie, fscache_cookie_put_lru); + } } -EXPORT_SYMBOL(__fscache_relinquish_cookie); /* * Remove a cookie from the hash table. @@ -851,43 +908,91 @@ static void fscache_unhash_cookie(struct fscache_cookie *cookie) hlist_bl_lock(h); hlist_bl_del(&cookie->hash_link); + clear_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags); hlist_bl_unlock(h); + fscache_stat(&fscache_n_relinquishes_dropped); } +static void fscache_drop_withdraw_cookie(struct fscache_cookie *cookie) +{ + fscache_cookie_drop_from_lru(cookie); + __fscache_withdraw_cookie(cookie); +} + +/** + * fscache_withdraw_cookie - Mark a cookie for withdrawal + * @cookie: The cookie to be withdrawn. + * + * Allow the cache backend to withdraw the backing for a cookie for its own + * reasons, even if that cookie is in active use. + */ +void fscache_withdraw_cookie(struct fscache_cookie *cookie) +{ + set_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags); + fscache_drop_withdraw_cookie(cookie); +} +EXPORT_SYMBOL(fscache_withdraw_cookie); + /* - * Drop a reference to a cookie. + * Allow the netfs to release a cookie back to the cache. 
+ * - the object will be marked as recyclable on disk if retire is true */ -void fscache_cookie_put(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { - struct fscache_cookie *parent; - int ref; + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + _enter("c=%08x{%d},%d", + cookie->debug_id, atomic_read(&cookie->n_active), retire); - _enter("%x", cookie->debug_id); + if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Cookie c=%x already relinquished\n", cookie->debug_id)) + return; - do { - unsigned int cookie_debug_id = cookie->debug_id; - bool zero = __refcount_dec_and_test(&cookie->ref, &ref); + if (retire) + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + trace_fscache_relinquish(cookie, retire); - trace_fscache_cookie(cookie_debug_id, ref - 1, where); - if (!zero) - return; + ASSERTCMP(atomic_read(&cookie->n_active), ==, 0); + ASSERTCMP(atomic_read(&cookie->volume->n_cookies), >, 0); + atomic_dec(&cookie->volume->n_cookies); - parent = cookie->parent; + if (test_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags)) { + set_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags); + fscache_drop_withdraw_cookie(cookie); + } else { + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_DROPPED); fscache_unhash_cookie(cookie); - fscache_free_cookie(cookie); + } + fscache_put_cookie(cookie, fscache_cookie_put_relinquish); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); - cookie = parent; - where = fscache_cookie_put_parent; - } while (cookie); +/* + * Drop a reference to a cookie. + */ +void fscache_put_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + struct fscache_volume *volume = cookie->volume; + unsigned int cookie_debug_id = cookie->debug_id; + bool zero; + int ref; - _leave(""); + zero = __refcount_dec_and_test(&cookie->ref, &ref); + trace_fscache_cookie(cookie_debug_id, ref - 1, where); + if (zero) { + fscache_free_cookie(cookie); + fscache_put_volume(volume, fscache_volume_put_cookie); + } } +EXPORT_SYMBOL(fscache_put_cookie); /* * Get a reference to a cookie. */ -struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, +struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { int ref; @@ -896,85 +1001,73 @@ struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, trace_fscache_cookie(cookie->debug_id, ref + 1, where); return cookie; } +EXPORT_SYMBOL(fscache_get_cookie); /* - * check the consistency between the netfs inode and the backing cache - * - * NOTE: it only serves no-index type + * Ask the cache to effect invalidation of a cookie. */ -int __fscache_check_consistency(struct fscache_cookie *cookie, - const void *aux_data) +static void fscache_perform_invalidation(struct fscache_cookie *cookie) { - struct fscache_operation *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,", cookie); + if (!cookie->volume->cache->ops->invalidate_cookie(cookie)) + fscache_caching_failed(cookie); + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); +} - ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); +/* + * Invalidate an object. 
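+ * + * The cookie is marked as carrying no data to read, its auxiliary data and + * object size are updated and, if the cookie is in active use, the state + * machine is kicked to invalidate the cache object. A netfs might call this + * when it detects a third-party change on the server - a hypothetical sketch, + * assuming the usual fscache_invalidate() wrapper in linux/fscache.h: + * + *	fscache_invalidate(cookie, &new_aux, new_i_size, 0);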
+ */ +void __fscache_invalidate(struct fscache_cookie *cookie, + const void *aux_data, loff_t new_size, + unsigned int flags) +{ + bool is_caching; - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; + _enter("c=%x", cookie->debug_id); - if (hlist_empty(&cookie->backing_objects)) - return 0; + fscache_stat(&fscache_n_invalidates); - op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!op) - return -ENOMEM; + if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Trying to invalidate relinquished cookie\n")) + return; - fscache_operation_init(cookie, op, NULL, NULL, NULL); - op->flags = FSCACHE_OP_MYTHREAD | - (1 << FSCACHE_OP_WAITING) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency); + if ((flags & FSCACHE_INVAL_DIO_WRITE) && + test_and_set_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags)) + return; spin_lock(&cookie->lock); + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + fscache_update_aux(cookie, aux_data, &new_size); + cookie->inval_counter++; + trace_fscache_invalidate(cookie, new_size); - fscache_update_aux(cookie, aux_data); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto inconsistent; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) - goto inconsistent; - - op->debug_id = atomic_inc_return(&fscache_op_debug_id); + switch (cookie->state) { + case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ + default: + spin_unlock(&cookie->lock); + _leave(" [no %u]", cookie->state); + return; - __fscache_use_cookie(cookie); - if (fscache_submit_op(object, op) < 0) - goto submit_failed; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_CREATING: + spin_unlock(&cookie->lock); + _leave(" [look %x]", cookie->inval_counter); + return; - /* the work queue now carries its own ref on the object */ - spin_unlock(&cookie->lock); + case FSCACHE_COOKIE_STATE_ACTIVE: + is_caching = fscache_begin_cookie_access( + cookie, fscache_access_invalidate_cookie); + if (is_caching) + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_INVALIDATING); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); - ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); - if (ret == 0) { - /* ask the cache to honour the operation */ - ret = object->cache->ops->check_consistency(op); - fscache_op_complete(op, false); - } else if (ret == -ENOBUFS) { - ret = 0; + if (is_caching) + fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); + _leave(" [inv]"); + return; } - - fscache_put_operation(op); - _leave(" = %d", ret); - return ret; - -submit_failed: - wake_cookie = __fscache_unuse_cookie(cookie); -inconsistent: - spin_unlock(&cookie->lock); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - kfree(op); - _leave(" = -ESTALE"); - return -ESTALE; } -EXPORT_SYMBOL(__fscache_check_consistency); +EXPORT_SYMBOL(__fscache_invalidate); /* * Generate a list of extant cookies in /proc/fs/fscache/cookies @@ -983,44 +1076,27 @@ static int fscache_cookies_seq_show(struct seq_file *m, void *v) { struct fscache_cookie *cookie; unsigned int keylen = 0, auxlen = 0; - char _type[3], *type; u8 *p; if (v == &fscache_cookies) { seq_puts(m, - "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n" - "======== ======== ===== ===== === == === ================ ==========\n" + "COOKIE VOLUME REF ACT ACC S FL 
DEF \n" + "======== ======== === === === = == ================\n" ); return 0; } cookie = list_entry(v, struct fscache_cookie, proc_link); - switch (cookie->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - snprintf(_type, sizeof(_type), "%02u", - cookie->type); - type = _type; - break; - } - seq_printf(m, - "%08x %08x %5u %5u %3u %s %03lx %-16s %px", + "%08x %08x %3d %3d %3d %c %02lx", cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, + cookie->volume->debug_id, refcount_read(&cookie->ref), - atomic_read(&cookie->n_children), atomic_read(&cookie->n_active), - type, - cookie->flags, - cookie->def->name, - cookie->netfs_data); + atomic_read(&cookie->n_accesses), + fscache_cookie_states[cookie->state], + cookie->flags); keylen = cookie->key_len; auxlen = cookie->aux_len; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c deleted file mode 100644 index 0402673c680e..000000000000 --- a/fs/fscache/fsdef.c +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Filesystem index definition - * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL CACHE -#include <linux/module.h> -#include "internal.h" - -static -enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size); - -/* - * The root index is owned by FS-Cache itself. - * - * When a netfs requests caching facilities, FS-Cache will, if one doesn't - * already exist, create an entry in the root index with the key being the name - * of the netfs ("AFS" for example), and the auxiliary data holding the index - * structure version supplied by the netfs: - * - * FSDEF - * | - * +-----------+ - * | | - * NFS AFS - * [v=1] [v=1] - * - * If an entry with the appropriate name does already exist, the version is - * compared. If the version is different, the entire subtree from that entry - * will be discarded and a new entry created. - * - * The new entry will be an index, and a cookie referring to it will be passed - * to the netfs. This is then the root handle by which the netfs accesses the - * cache. It can create whatever objects it likes in that index, including - * further indices. - */ -static struct fscache_cookie_def fscache_fsdef_index_def = { - .name = ".FS-Cache", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie fscache_fsdef_index = { - .debug_id = 1, - .ref = REFCOUNT_INIT(1), - .n_active = ATOMIC_INIT(1), - .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), - .backing_objects = HLIST_HEAD_INIT, - .def = &fscache_fsdef_index_def, - .flags = 1 << FSCACHE_COOKIE_ENABLED, - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; -EXPORT_SYMBOL(fscache_fsdef_index); - -/* - * Definition of an entry in the root index. Each entry is an index, keyed to - * a specific netfs and only applicable to a particular version of the index - * structure used by that netfs. 
- */ -struct fscache_cookie_def fscache_fsdef_netfs_def = { - .name = "FSDEF.netfs", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .check_aux = fscache_fsdef_netfs_check_aux, -}; - -/* - * check that the index structure version number stored in the auxiliary data - * matches the one the netfs gave us - */ -static enum fscache_checkaux fscache_fsdef_netfs_check_aux( - void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct fscache_netfs *netfs = cookie_netfs_data; - uint32_t version; - - _enter("{%s},,%hu", netfs->name, datalen); - - if (datalen != sizeof(version)) { - _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - memcpy(&version, data, sizeof(version)); - if (version != netfs->version) { - _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c3e4804b8fcb..f121c21590dc 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -1,65 +1,69 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal definitions for FS-Cache * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -/* - * Lock order, in the order in which multiple locks should be obtained: - * - fscache_addremove_sem - * - cookie->lock - * - cookie->parent->lock - * - cache->object_list_lock - * - object->lock - * - object->parent->lock - * - cookie->stores_lock - * - fscache_thread_lock - * - */ - #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "FS-Cache: " fmt +#include <linux/slab.h> #include <linux/fscache-cache.h> #include <trace/events/fscache.h> #include <linux/sched.h> #include <linux/seq_file.h> -#define FSCACHE_MIN_THREADS 4 -#define FSCACHE_MAX_THREADS 32 - /* * cache.c */ -extern struct list_head fscache_cache_list; -extern struct rw_semaphore fscache_addremove_sem; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} -extern struct fscache_cache *fscache_select_cache_for_object( - struct fscache_cookie *); +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} /* * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; extern const struct seq_operations fscache_cookies_seq_ops; +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); 
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); -extern void fscache_free_cookie(struct fscache_cookie *); -extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, - const struct fscache_cookie_def *, - const void *, size_t, - const void *, size_t, - void *, loff_t); -extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); -extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *, - enum fscache_cookie_trace); -extern void fscache_cookie_put(struct fscache_cookie *, - enum fscache_cookie_trace); - -static inline void fscache_cookie_see(struct fscache_cookie *cookie, +static inline void fscache_see_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), @@ -67,60 +71,22 @@ static inline void fscache_cookie_see(struct fscache_cookie *cookie, } /* - * fsdef.c + * io.c */ -extern struct fscache_cookie fscache_fsdef_index; -extern struct fscache_cookie_def fscache_fsdef_netfs_def; - -/* - * main.c - */ -extern unsigned fscache_defer_lookup; -extern unsigned fscache_defer_create; -extern unsigned fscache_debug; -extern struct kobject *fscache_root; -extern struct workqueue_struct *fscache_object_wq; -extern struct workqueue_struct *fscache_op_wq; -DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); - -extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); - -static inline bool fscache_object_congested(void) +static inline void fscache_end_operation(struct netfs_cache_resources *cres) { - return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); } /* - * object.c + * main.c */ -extern void fscache_enqueue_object(struct fscache_object *); +extern unsigned fscache_debug; -/* - * operation.c - */ -extern int fscache_submit_exclusive_op(struct fscache_object *, - struct fscache_operation *); -extern int fscache_submit_op(struct fscache_object *, - struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, bool); -extern void fscache_cancel_all_ops(struct fscache_object *); -extern void fscache_abort_object(struct fscache_object *); -extern void fscache_start_operations(struct fscache_object *); -extern void fscache_operation_gc(struct work_struct *); - -/* - * page.c - */ -extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); -extern int fscache_wait_for_operation_activation(struct fscache_object *, - struct fscache_operation *, - atomic_t *, - atomic_t *); -extern void fscache_invalidate_writes(struct fscache_cookie *); -struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie, - struct address_space *mapping, - fscache_rw_complete_t end_io_func, - void *context); +extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); /* * proc.c @@ -137,125 +103,27 @@ extern void fscache_proc_cleanup(void); * stats.c */ #ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; -extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; - -extern atomic_t fscache_n_op_pend; -extern atomic_t fscache_n_op_run; -extern atomic_t fscache_n_op_enqueue; -extern atomic_t fscache_n_op_deferred_release; -extern atomic_t fscache_n_op_initialised; -extern atomic_t fscache_n_op_release; -extern atomic_t fscache_n_op_gc; -extern atomic_t 
fscache_n_op_cancelled; -extern atomic_t fscache_n_op_rejected; - -extern atomic_t fscache_n_attr_changed; -extern atomic_t fscache_n_attr_changed_ok; -extern atomic_t fscache_n_attr_changed_nobufs; -extern atomic_t fscache_n_attr_changed_nomem; -extern atomic_t fscache_n_attr_changed_calls; - -extern atomic_t fscache_n_allocs; -extern atomic_t fscache_n_allocs_ok; -extern atomic_t fscache_n_allocs_wait; -extern atomic_t fscache_n_allocs_nobufs; -extern atomic_t fscache_n_allocs_intr; -extern atomic_t fscache_n_allocs_object_dead; -extern atomic_t fscache_n_alloc_ops; -extern atomic_t fscache_n_alloc_op_waits; - -extern atomic_t fscache_n_retrievals; -extern atomic_t fscache_n_retrievals_ok; -extern atomic_t fscache_n_retrievals_wait; -extern atomic_t fscache_n_retrievals_nodata; -extern atomic_t fscache_n_retrievals_nobufs; -extern atomic_t fscache_n_retrievals_intr; -extern atomic_t fscache_n_retrievals_nomem; -extern atomic_t fscache_n_retrievals_object_dead; -extern atomic_t fscache_n_retrieval_ops; -extern atomic_t fscache_n_retrieval_op_waits; - -extern atomic_t fscache_n_stores; -extern atomic_t fscache_n_stores_ok; -extern atomic_t fscache_n_stores_again; -extern atomic_t fscache_n_stores_nobufs; -extern atomic_t fscache_n_stores_oom; -extern atomic_t fscache_n_store_ops; -extern atomic_t fscache_n_store_calls; -extern atomic_t fscache_n_store_pages; -extern atomic_t fscache_n_store_radix_deletes; -extern atomic_t fscache_n_store_pages_over_limit; - -extern atomic_t fscache_n_store_vmscan_not_storing; -extern atomic_t fscache_n_store_vmscan_gone; -extern atomic_t fscache_n_store_vmscan_busy; -extern atomic_t fscache_n_store_vmscan_cancelled; -extern atomic_t fscache_n_store_vmscan_wait; - -extern atomic_t fscache_n_marks; -extern atomic_t fscache_n_uncaches; +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_null; -extern atomic_t fscache_n_acquires_no_cache; extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_nobufs; extern atomic_t fscache_n_acquires_oom; extern atomic_t fscache_n_invalidates; -extern atomic_t fscache_n_invalidates_run; - -extern atomic_t fscache_n_updates; -extern atomic_t fscache_n_updates_null; -extern atomic_t fscache_n_updates_run; extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_null; -extern atomic_t fscache_n_relinquishes_waitcrt; extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; -extern atomic_t fscache_n_cookie_index; -extern atomic_t fscache_n_cookie_data; -extern atomic_t fscache_n_cookie_special; - -extern atomic_t fscache_n_object_alloc; -extern atomic_t fscache_n_object_no_alloc; -extern atomic_t fscache_n_object_lookups; -extern atomic_t fscache_n_object_lookups_negative; -extern atomic_t fscache_n_object_lookups_positive; -extern atomic_t fscache_n_object_lookups_timed_out; -extern atomic_t fscache_n_object_created; -extern atomic_t fscache_n_object_avail; -extern atomic_t fscache_n_object_dead; - -extern atomic_t fscache_n_checkaux_none; -extern atomic_t fscache_n_checkaux_okay; -extern atomic_t fscache_n_checkaux_update; -extern atomic_t fscache_n_checkaux_obsolete; - -extern atomic_t 
fscache_n_cop_alloc_object; -extern atomic_t fscache_n_cop_lookup_object; -extern atomic_t fscache_n_cop_lookup_complete; -extern atomic_t fscache_n_cop_grab_object; -extern atomic_t fscache_n_cop_invalidate_object; -extern atomic_t fscache_n_cop_update_object; -extern atomic_t fscache_n_cop_drop_object; -extern atomic_t fscache_n_cop_put_object; -extern atomic_t fscache_n_cop_sync_cache; -extern atomic_t fscache_n_cop_attr_changed; -extern atomic_t fscache_n_cop_read_or_alloc_page; -extern atomic_t fscache_n_cop_read_or_alloc_pages; -extern atomic_t fscache_n_cop_allocate_page; -extern atomic_t fscache_n_cop_allocate_pages; -extern atomic_t fscache_n_cop_write_page; -extern atomic_t fscache_n_cop_uncache_page; -extern atomic_t fscache_n_cop_dissociate_pages; - -extern atomic_t fscache_n_cache_no_space_reject; -extern atomic_t fscache_n_cache_stale_objects; -extern atomic_t fscache_n_cache_retired_objects; -extern atomic_t fscache_n_cache_culled_objects; +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; static inline void fscache_stat(atomic_t *stat) { @@ -278,71 +146,26 @@ int fscache_stats_show(struct seq_file *m, void *v); #endif /* - * raise an event on an object - * - if the event is not masked for that object, then the object is - * queued for attention by the thread pool. - */ -static inline void fscache_raise_event(struct fscache_object *object, - unsigned event) -{ - BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); -#if 0 - printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", - object->debug_id, object->event_mask, (1 << event)); -#endif - if (!test_and_set_bit(event, &object->events) && - test_bit(event, &object->event_mask)) - fscache_enqueue_object(object); -} - -/* - * get an extra reference to a netfs retrieval context + * volume.c */ -static inline -void *fscache_get_context(struct fscache_cookie *cookie, void *context) -{ - if (cookie->def->get_context) - cookie->def->get_context(cookie->netfs_data, context); - return context; -} +extern const struct seq_operations fscache_volumes_seq_ops; -/* - * release a reference to a netfs retrieval context - */ -static inline -void fscache_put_context(struct fscache_cookie *cookie, void *context) -{ - if (cookie->def->put_context) - cookie->def->put_context(cookie->netfs_data, context); -} +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); -/* - * Update the auxiliary data on a cookie. - */ -static inline -void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data) -{ - void *p; - - if (!aux_data) - return; - if (cookie->aux_len <= sizeof(cookie->inline_aux)) - p = cookie->inline_aux; - else - p = cookie->aux; - - if (memcmp(p, aux_data, cookie->aux_len) != 0) { - memcpy(p, aux_data, cookie->aux_len); - set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags); - } -} /*****************************************************************************/ /* * debug tracing */ #define dbgprintk(FMT, ...) \ - printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) 
dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) @@ -395,7 +218,7 @@ do { \ #define FSCACHE_DEBUG_CACHE 0 #define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OBJECT 2 #define FSCACHE_DEBUG_OPERATION 3 #define FSCACHE_POINT_ENTER 1 diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 8ecc1141802f..7a769ea57720 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -4,113 +4,323 @@ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ - -#define FSCACHE_DEBUG_LEVEL PAGE -#include <linux/module.h> -#define FSCACHE_USE_NEW_IO_API +#define FSCACHE_DEBUG_LEVEL OPERATION #include <linux/fscache-cache.h> +#include <linux/uio.h> +#include <linux/bvec.h> #include <linux/slab.h> -#include <linux/netfs.h> +#include <linux/uio.h> #include "internal.h" -/* - * Start a cache read operation. - * - we return: - * -ENOMEM - out of memory, some pages may be being read - * -ERESTARTSYS - interrupted, some pages may be being read - * -ENOBUFS - no backing object or space available in which to cache any - * pages not being read - * -ENODATA - no data available in the backing object for some or all of - * the pages - * 0 - dispatched a read on all pages +/** + * fscache_wait_for_operation - Wait for an object become accessible + * @cres: The cache resources for the operation being performed + * @want_state: The minimum state the object must be at + * + * See if the target cache object is at the specified minimum state of + * accessibility yet, and if not, wait for it. */ -int __fscache_begin_read_operation(struct netfs_read_request *rreq, - struct fscache_cookie *cookie) +bool fscache_wait_for_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state) { - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; + struct fscache_cookie *cookie = fscache_cres_cookie(cres); + enum fscache_cookie_state state; - _enter("rr=%08x", rreq->debug_id); +again: + if (!fscache_cache_is_live(cookie->volume->cache)) { + _leave(" [broken]"); + return false; + } - fscache_stat(&fscache_n_retrievals); + state = fscache_cookie_state(cookie); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; + switch (state) { + case FSCACHE_COOKIE_STATE_CREATING: + case FSCACHE_COOKIE_STATE_INVALIDATING: + if (want_state == FSCACHE_WANT_PARAMS) + goto ready; /* There can be no content */ + fallthrough; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + wait_var_event(&cookie->state, + fscache_cookie_state(cookie) != state); + goto again; - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; + case FSCACHE_COOKIE_STATE_ACTIVE: + goto ready; + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + default: + _leave(" [not live]"); + return false; } - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); +ready: + if (!cres->cache_priv2) + return cookie->volume->cache->ops->begin_operation(cres, want_state); + return true; +} +EXPORT_SYMBOL(fscache_wait_for_operation); + +/* + * Begin an I/O operation on the cache, waiting till we reach the right state. + * + * Attaches the resources required to the operation resources record. 
+ */ +static int fscache_begin_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie, + enum fscache_want_state want_state, + enum fscache_access_trace why) +{ + enum fscache_cookie_state state; + long timeo; + bool once_only = false; - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; + cres->ops = NULL; + cres->cache_priv = cookie; + cres->cache_priv2 = NULL; + cres->debug_id = cookie->debug_id; + cres->inval_counter = cookie->inval_counter; - op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL); - if (!op) - return -ENOMEM; - trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); + if (!fscache_begin_cookie_access(cookie, why)) + return -ENOBUFS; +again: spin_lock(&cookie->lock); - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); + state = fscache_cookie_state(cookie); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + switch (state) { + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + case FSCACHE_COOKIE_STATE_INVALIDATING: + goto wait_for_file_wrangling; + case FSCACHE_COOKIE_STATE_CREATING: + if (want_state == FSCACHE_WANT_PARAMS) + goto ready; /* There can be no content */ + goto wait_for_file_wrangling; + case FSCACHE_COOKIE_STATE_ACTIVE: + goto ready; + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + WARN(1, "Can't use cookie in state %u\n", cookie->state); + goto not_live; + default: + goto not_live; + } - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; +ready: spin_unlock(&cookie->lock); + if (!cookie->volume->cache->ops->begin_operation(cres, want_state)) + goto failed; + return 0; - fscache_stat(&fscache_n_retrieval_ops); +wait_for_file_wrangling: + spin_unlock(&cookie->lock); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + atomic_read(&cookie->n_accesses), + fscache_access_io_wait); + timeo = wait_var_event_timeout(&cookie->state, + fscache_cookie_state(cookie) != state, 20 * HZ); + if (timeo <= 1 && !once_only) { + pr_warn("%s: cookie state change wait timed out: cookie->state=%u state=%u", + __func__, fscache_cookie_state(cookie), state); + fscache_print_cookie(cookie, 'O'); + once_only = true; + } + goto again; - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - ret = object->cache->ops->begin_read_operation(rreq, op); - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: +not_live: spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - 
if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); +failed: + cres->cache_priv = NULL; + cres->ops = NULL; + fscache_end_cookie_access(cookie, fscache_access_io_not_live); _leave(" = -ENOBUFS"); return -ENOBUFS; } + +int __fscache_begin_read_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS, + fscache_access_io_read); +} EXPORT_SYMBOL(__fscache_begin_read_operation); + +int __fscache_begin_write_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS, + fscache_access_io_write); +} +EXPORT_SYMBOL(__fscache_begin_write_operation); + +/** + * fscache_set_page_dirty - Mark page dirty and pin a cache object for writeback + * @page: The page being dirtied + * @cookie: The cookie referring to the cache object + * + * Set the dirty flag on a page and pin an in-use cache object in memory when + * dirtying a page so that writeback can later write to it. This is intended + * to be called from the filesystem's ->set_page_dirty() method. + * + * Returns 1 if PG_dirty was set on the page, 0 otherwise. + */ +int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie) +{ + struct inode *inode = page->mapping->host; + bool need_use = false; + + _enter(""); + + if (!__set_page_dirty_nobuffers(page)) + return 0; + if (!fscache_cookie_valid(cookie)) + return 1; + + if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { + inode->i_state |= I_PINNING_FSCACHE_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return 1; +} +EXPORT_SYMBOL(fscache_set_page_dirty); + +struct fscache_write_request { + struct netfs_cache_resources cache_resources; + struct address_space *mapping; + loff_t start; + size_t len; + bool set_bits; + netfs_io_terminated_t term_func; + void *term_func_priv; +}; + +void __fscache_clear_page_bits(struct address_space *mapping, + loff_t start, size_t len) +{ + pgoff_t first = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE; + struct page *page; + + if (len) { + XA_STATE(xas, &mapping->i_pages, first); + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + end_page_fscache(page); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(__fscache_clear_page_bits); + +/* + * Deal with the completion of writing the data to the cache. 
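+ * + * Any PG_fscache bits set on the pages covered by the write are cleared, the + * netfs's termination function, if any, is run and the write request and its + * cache resources are released.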
+ */ +static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct fscache_write_request *wreq = priv; + + fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources), + wreq->mapping, wreq->start, wreq->len, + wreq->set_bits); + + if (wreq->term_func) + wreq->term_func(wreq->term_func_priv, transferred_or_error, + was_async); + fscache_end_operation(&wreq->cache_resources); + kfree(wreq); +} + +void __fscache_write_to_cache(struct fscache_cookie *cookie, + struct address_space *mapping, + loff_t start, size_t len, loff_t i_size, + netfs_io_terminated_t term_func, + void *term_func_priv, + bool cond) +{ + struct fscache_write_request *wreq; + struct netfs_cache_resources *cres; + struct iov_iter iter; + int ret = -ENOBUFS; + + if (len == 0) + goto abandon; + + _enter("%llx,%zx", start, len); + + wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); + if (!wreq) + goto abandon; + wreq->mapping = mapping; + wreq->start = start; + wreq->len = len; + wreq->set_bits = cond; + wreq->term_func = term_func; + wreq->term_func_priv = term_func_priv; + + cres = &wreq->cache_resources; + if (fscache_begin_operation(cres, cookie, FSCACHE_WANT_WRITE, + fscache_access_io_write) < 0) + goto abandon_free; + + ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + if (ret < 0) + goto abandon_end; + + /* TODO: Consider clearing page bits now for space the write isn't + * covering. This is more complicated than it appears when THPs are + * taken into account. + */ + + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + fscache_write(cres, start, &iter, fscache_wreq_done, wreq); + return; + +abandon_end: + return fscache_wreq_done(wreq, ret, false); +abandon_free: + kfree(wreq); +abandon: + fscache_clear_page_bits(cookie, mapping, start, len, cond); + if (term_func) + term_func(term_func_priv, ret, false); +} +EXPORT_SYMBOL(__fscache_write_to_cache); + +/* + * Change the size of a backing object. + */ +void __fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size) +{ + struct netfs_cache_resources cres; + + trace_fscache_resize(cookie, new_size); + if (fscache_begin_operation(&cres, cookie, FSCACHE_WANT_WRITE, + fscache_access_io_resize) == 0) { + fscache_stat(&fscache_n_resizes); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); + + /* We cannot defer a resize as we need to do it inside the + * netfs's inode lock so that we're serialised with respect to + * writes. + */ + cookie->volume->cache->ops->resize_cookie(&cres, new_size); + fscache_end_operation(&cres); + } else { + fscache_stat(&fscache_n_resizes_null); + } +} +EXPORT_SYMBOL(__fscache_resize_cookie); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4207f98e405f..dad85fd84f6f 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -1,17 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* General filesystem local caching manager * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "internal.h" @@ -19,79 +15,18 @@ MODULE_DESCRIPTION("FS Cache Manager"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); -unsigned fscache_defer_lookup = 1; -module_param_named(defer_lookup, fscache_defer_lookup, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_defer_lookup, - "Defer cookie lookup to background thread"); - -unsigned fscache_defer_create = 1; -module_param_named(defer_create, fscache_defer_create, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_defer_create, - "Defer cookie creation to background thread"); - unsigned fscache_debug; module_param_named(debug, fscache_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); -struct kobject *fscache_root; -struct workqueue_struct *fscache_object_wq; -struct workqueue_struct *fscache_op_wq; - -DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); +EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); +EXPORT_TRACEPOINT_SYMBOL(fscache_access); -/* these values serve as lower bounds, will be adjusted in fscache_init() */ -static unsigned fscache_object_max_active = 4; -static unsigned fscache_op_max_active = 2; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *fscache_sysctl_header; - -static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct workqueue_struct **wqp = table->extra1; - unsigned int *datap = table->data; - int ret; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret == 0) - workqueue_set_max_active(*wqp, *datap); - return ret; -} - -static struct ctl_table fscache_sysctls[] = { - { - .procname = "object_max_active", - .data = &fscache_object_max_active, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = fscache_max_active_sysctl, - .extra1 = &fscache_object_wq, - }, - { - .procname = "operation_max_active", - .data = &fscache_op_max_active, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = fscache_max_active_sysctl, - .extra1 = &fscache_op_wq, - }, - {} -}; - -static struct ctl_table fscache_sysctls_root[] = { - { - .procname = "fscache", - .mode = 0555, - .child = fscache_sysctls, - }, - {} -}; -#endif +struct workqueue_struct *fscache_wq; +EXPORT_SYMBOL(fscache_wq); /* * Mixing scores (in bits) for (7,20): @@ -118,15 +53,16 @@ static inline unsigned int fold_hash(unsigned long x, unsigned long y) /* * Generate a hash. This is derived from full_name_hash(), but we want to be * sure it is arch independent and that it doesn't change as bits of the - * computed hash value might appear on disk. The caller also guarantees that - * the hashed data will be a series of aligned 32-bit words. + * computed hash value might appear on disk. The caller must guarantee that + * the source data is a multiple of four bytes in size. 
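/*
 * Usage sketch for the rewritten hash below (example_key_hash is
 * hypothetical): the key is consumed as little-endian 32-bit words, so the
 * result is stable across architectures; the caller pads the key to a
 * multiple of four bytes first.
 */
static unsigned int example_key_hash(const void *key, size_t padded_keylen)
{
	return fscache_hash(0, key, padded_keylen);
}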
*/ -unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) +unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) { - unsigned int a, x = 0, y = salt; + const __le32 *p = data; + unsigned int a, x = 0, y = salt, n = len / sizeof(__le32); for (; n; n--) { - a = *data++; + a = le32_to_cpu(*p++); HASH_MIX(x, y, a); } return fold_hash(x, y); @@ -137,44 +73,16 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) */ static int __init fscache_init(void) { - unsigned int nr_cpus = num_possible_cpus(); - unsigned int cpu; - int ret; - - fscache_object_max_active = - clamp_val(nr_cpus, - fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); - - ret = -ENOMEM; - fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, - fscache_object_max_active); - if (!fscache_object_wq) - goto error_object_wq; - - fscache_op_max_active = - clamp_val(fscache_object_max_active / 2, - fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + int ret = -ENOMEM; - ret = -ENOMEM; - fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, - fscache_op_max_active); - if (!fscache_op_wq) - goto error_op_wq; - - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + fscache_wq = alloc_workqueue("fscache", WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!fscache_wq) + goto error_wq; ret = fscache_proc_init(); if (ret < 0) goto error_proc; -#ifdef CONFIG_SYSCTL - ret = -ENOMEM; - fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); - if (!fscache_sysctl_header) - goto error_sysctl; -#endif - fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, 0, NULL); @@ -184,26 +92,14 @@ static int __init fscache_init(void) goto error_cookie_jar; } - fscache_root = kobject_create_and_add("fscache", kernel_kobj); - if (!fscache_root) - goto error_kobj; - pr_notice("Loaded\n"); return 0; -error_kobj: - kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(fscache_sysctl_header); -error_sysctl: -#endif fscache_proc_cleanup(); error_proc: - destroy_workqueue(fscache_op_wq); -error_op_wq: - destroy_workqueue(fscache_object_wq); -error_object_wq: + destroy_workqueue(fscache_wq); +error_wq: return ret; } @@ -216,14 +112,9 @@ static void __exit fscache_exit(void) { _enter(""); - kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(fscache_sysctl_header); -#endif fscache_proc_cleanup(); - destroy_workqueue(fscache_op_wq); - destroy_workqueue(fscache_object_wq); + destroy_workqueue(fscache_wq); pr_notice("Unloaded\n"); } diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c deleted file mode 100644 index d6bdb7b5e723..000000000000 --- a/fs/fscache/netfs.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache netfs (client) registration - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/slab.h> -#include "internal.h" - -/* - * register a network filesystem for caching - */ -int __fscache_register_netfs(struct fscache_netfs *netfs) -{ - struct fscache_cookie *candidate, *cookie; - - _enter("{%s}", netfs->name); - - /* allocate a cookie for the primary index */ - candidate = fscache_alloc_cookie(&fscache_fsdef_index, - &fscache_fsdef_netfs_def, - netfs->name, strlen(netfs->name), - &netfs->version, sizeof(netfs->version), - netfs, 0); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - candidate->flags = 1 << FSCACHE_COOKIE_ENABLED; - - /* check the netfs type is not already present */ - cookie = fscache_hash_cookie(candidate); - if (!cookie) - goto already_registered; - if (cookie != candidate) { - trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard); - fscache_free_cookie(candidate); - } - - fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs); - atomic_inc(&cookie->parent->n_children); - - netfs->primary_index = cookie; - - pr_notice("Netfs '%s' registered for caching\n", netfs->name); - trace_fscache_netfs(netfs); - _leave(" = 0"); - return 0; - -already_registered: - fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs); - _leave(" = -EEXIST"); - return -EEXIST; -} -EXPORT_SYMBOL(__fscache_register_netfs); - -/* - * unregister a network filesystem from the cache - * - all cookies must have been released first - */ -void __fscache_unregister_netfs(struct fscache_netfs *netfs) -{ - _enter("{%s.%u}", netfs->name, netfs->version); - - fscache_relinquish_cookie(netfs->primary_index, NULL, false); - pr_notice("Netfs '%s' unregistered from caching\n", netfs->name); - - _leave(""); -} -EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object.c b/fs/fscache/object.c deleted file mode 100644 index 6a675652129b..000000000000 --- a/fs/fscache/object.c +++ /dev/null @@ -1,1125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache object state machine handler - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * See Documentation/filesystems/caching/object.rst for a description of the - * object state machine and the in-kernel representations. 
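/*
 * For context, a sketch of how the interface deleted here was consumed by
 * a network filesystem (myfs names assumed); a duplicate registration
 * yielded -EEXIST:
 */
static struct fscache_netfs myfs_cache_netfs = {
	.name		= "myfs",
	.version	= 0,
};

static int __init myfs_cache_init(void)
{
	return fscache_register_netfs(&myfs_cache_netfs);
}

static void __exit myfs_cache_exit(void)
{
	fscache_unregister_netfs(&myfs_cache_netfs);
}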
- */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/prefetch.h> -#include "internal.h" - -static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); -static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); -static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); -static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); -static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); -static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); -static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); -static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); -static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); -static const struct fscache_state *fscache_object_available(struct fscache_object *, int); -static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); -static const struct fscache_state *fscache_update_object(struct fscache_object *, int); -static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); - -#define __STATE_NAME(n) fscache_osm_##n -#define STATE(n) (&__STATE_NAME(n)) - -/* - * Define a work state. Work states are execution states. No event processing - * is performed by them. The function attached to a work state returns a - * pointer indicating the next state to which the state machine should - * transition. Returning NO_TRANSIT repeats the current state, but goes back - * to the scheduler first. - */ -#define WORK_STATE(n, sn, f) \ - const struct fscache_state __STATE_NAME(n) = { \ - .name = #n, \ - .short_name = sn, \ - .work = f \ - } - -/* - * Returns from work states. - */ -#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) - -#define NO_TRANSIT ((struct fscache_state *)NULL) - -/* - * Define a wait state. Wait states are event processing states. No execution - * is performed by them. Wait states are just tables of "if event X occurs, - * clear it and transition to state Y". The dispatcher returns to the - * scheduler if none of the events in which the wait state has an interest are - * currently pending. - */ -#define WAIT_STATE(n, sn, ...) \ - const struct fscache_state __STATE_NAME(n) = { \ - .name = #n, \ - .short_name = sn, \ - .work = NULL, \ - .transitions = { __VA_ARGS__, { 0, NULL } } \ - } - -#define TRANSIT_TO(state, emask) \ - { .events = (emask), .transit_to = STATE(state) } - -/* - * The object state machine. 
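/*
 * Roughly, a declaration such as
 *
 *	static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
 *			  TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
 *
 * (see below) expands to:
 *
 *	static const struct fscache_state fscache_osm_WAIT_FOR_PARENT = {
 *		.name		= "WAIT_FOR_PARENT",
 *		.short_name	= "?PRN",
 *		.work		= NULL,
 *		.transitions	= {
 *			{ .events	= 1 << FSCACHE_OBJECT_EV_PARENT_READY,
 *			  .transit_to	= &fscache_osm_PARENT_READY },
 *			{ 0, NULL }
 *		},
 *	};
 */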
- */ -static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); -static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); -static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); -static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); -static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); - -static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); -static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); - -static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); -static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); -static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); -static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); - -static WAIT_STATE(WAIT_FOR_INIT, "?INI", - TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); - -static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", - TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); - -static WAIT_STATE(WAIT_FOR_CMD, "?CMD", - TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), - TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), - TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); - -static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", - TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); - -/* - * Out-of-band event transition tables. These are for handling unexpected - * events, such as an I/O error. If an OOB event occurs, the state machine - * clears and disables the event and forces a transition to the nominated work - * state (a currently executing work state will complete first). - * - * In such a situation, object->state remembers the state the machine should - * have been in/gone to and returning NO_TRANSIT returns to that. - */ -static const struct fscache_transition fscache_osm_init_oob[] = { - TRANSIT_TO(ABORT_INIT, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static const struct fscache_transition fscache_osm_lookup_oob[] = { - TRANSIT_TO(LOOKUP_FAILURE, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static const struct fscache_transition fscache_osm_run_oob[] = { - TRANSIT_TO(KILL_OBJECT, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static int fscache_get_object(struct fscache_object *, - enum fscache_obj_ref_trace); -static void fscache_put_object(struct fscache_object *, - enum fscache_obj_ref_trace); -static bool fscache_enqueue_dependents(struct fscache_object *, int); -static void fscache_dequeue_object(struct fscache_object *); -static void fscache_update_aux_data(struct fscache_object *); - -/* - * we need to notify the parent when an op that we had outstanding upon it - * completes - */ -static inline void fscache_done_parent_op(struct fscache_object *object) -{ - struct fscache_object *parent = object->parent; - - _enter("OBJ%x {OBJ%x,%x}", - object->debug_id, parent->debug_id, parent->n_ops); - - spin_lock_nested(&parent->lock, 1); - parent->n_obj_ops--; - parent->n_ops--; - if (parent->n_ops == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); -} - -/* - * Object state machine dispatcher.
- */ -static void fscache_object_sm_dispatcher(struct fscache_object *object) -{ - const struct fscache_transition *t; - const struct fscache_state *state, *new_state; - unsigned long events, event_mask; - bool oob; - int event = -1; - - ASSERT(object != NULL); - - _enter("{OBJ%x,%s,%lx}", - object->debug_id, object->state->name, object->events); - - event_mask = object->event_mask; -restart: - object->event_mask = 0; /* Mask normal event handling */ - state = object->state; -restart_masked: - events = object->events; - - /* Handle any out-of-band events (typically an error) */ - if (events & object->oob_event_mask) { - _debug("{OBJ%x} oob %lx", - object->debug_id, events & object->oob_event_mask); - oob = true; - for (t = object->oob_table; t->events; t++) { - if (events & t->events) { - state = t->transit_to; - ASSERT(state->work != NULL); - event = fls(events & t->events) - 1; - __clear_bit(event, &object->oob_event_mask); - clear_bit(event, &object->events); - goto execute_work_state; - } - } - } - oob = false; - - /* Wait states are just transition tables */ - if (!state->work) { - if (events & event_mask) { - for (t = state->transitions; t->events; t++) { - if (events & t->events) { - new_state = t->transit_to; - event = fls(events & t->events) - 1; - trace_fscache_osm(object, state, - true, false, event); - clear_bit(event, &object->events); - _debug("{OBJ%x} ev %d: %s -> %s", - object->debug_id, event, - state->name, new_state->name); - object->state = state = new_state; - goto execute_work_state; - } - } - - /* The event mask didn't include all the tabled bits */ - BUG(); - } - /* Randomly woke up */ - goto unmask_events; - } - -execute_work_state: - _debug("{OBJ%x} exec %s", object->debug_id, state->name); - - trace_fscache_osm(object, state, false, oob, event); - new_state = state->work(object, event); - event = -1; - if (new_state == NO_TRANSIT) { - _debug("{OBJ%x} %s notrans", object->debug_id, state->name); - if (unlikely(state == STATE(OBJECT_DEAD))) { - _leave(" [dead]"); - return; - } - fscache_enqueue_object(object); - event_mask = object->oob_event_mask; - goto unmask_events; - } - - _debug("{OBJ%x} %s -> %s", - object->debug_id, state->name, new_state->name); - object->state = state = new_state; - - if (state->work) { - if (unlikely(state == STATE(OBJECT_DEAD))) { - _leave(" [dead]"); - return; - } - goto restart_masked; - } - - /* Transited to wait state */ - event_mask = object->oob_event_mask; - for (t = state->transitions; t->events; t++) - event_mask |= t->events; - -unmask_events: - object->event_mask = event_mask; - smp_mb(); - events = object->events; - if (events & event_mask) - goto restart; - _leave(" [msk %lx]", event_mask); -} - -/* - * execute an object - */ -static void fscache_object_work_func(struct work_struct *work) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - _enter("{OBJ%x}", object->debug_id); - - fscache_object_sm_dispatcher(object); - fscache_put_object(object, fscache_obj_put_work); -} - -/** - * fscache_object_init - Initialise a cache object description - * @object: Object description - * @cookie: Cookie object will be attached to - * @cache: Cache in which backing object will be found - * - * Initialise a cache object description to its basic values. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. 
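/*
 * Sketch of how a cache backend consumed the initialiser documented above
 * (modelled on cachefiles; the mycache_* names and the embedded
 * struct fscache_object member are assumed):
 */
static struct fscache_object *mycache_alloc_object(struct fscache_cache *cache,
						   struct fscache_cookie *cookie)
{
	struct mycache_object *obj;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return NULL;

	fscache_object_init(&obj->fscache, cookie, cache);
	return &obj->fscache;
}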
- */ -void fscache_object_init(struct fscache_object *object, - struct fscache_cookie *cookie, - struct fscache_cache *cache) -{ - const struct fscache_transition *t; - - atomic_inc(&cache->object_count); - - object->state = STATE(WAIT_FOR_INIT); - object->oob_table = fscache_osm_init_oob; - object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; - spin_lock_init(&object->lock); - INIT_LIST_HEAD(&object->cache_link); - INIT_HLIST_NODE(&object->cookie_link); - INIT_WORK(&object->work, fscache_object_work_func); - INIT_LIST_HEAD(&object->dependents); - INIT_LIST_HEAD(&object->dep_link); - INIT_LIST_HEAD(&object->pending_ops); - object->n_children = 0; - object->n_ops = object->n_in_progress = object->n_exclusive = 0; - object->events = 0; - object->store_limit = 0; - object->store_limit_l = 0; - object->cache = cache; - object->cookie = cookie; - fscache_cookie_get(cookie, fscache_cookie_get_attach_object); - object->parent = NULL; -#ifdef CONFIG_FSCACHE_OBJECT_LIST - RB_CLEAR_NODE(&object->objlist_link); -#endif - - object->oob_event_mask = 0; - for (t = object->oob_table; t->events; t++) - object->oob_event_mask |= t->events; - object->event_mask = object->oob_event_mask; - for (t = object->state->transitions; t->events; t++) - object->event_mask |= t->events; -} -EXPORT_SYMBOL(fscache_object_init); - -/* - * Mark the object as no longer being live, making sure that we synchronise - * against op submission. - */ -static inline void fscache_mark_object_dead(struct fscache_object *object) -{ - spin_lock(&object->lock); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); - spin_unlock(&object->lock); -} - -/* - * Abort object initialisation before we start it. - */ -static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_event_mask = 0; - fscache_dequeue_object(object); - return transit_to(KILL_OBJECT); -} - -/* - * initialise an object - * - check the specified object's parent to see if we can make use of it - * immediately to do a creation - * - we may need to start the process of creating a parent and we need to wait - * for the parent's lookup and creation to complete if it's not there yet - */ -static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, - int event) -{ - struct fscache_object *parent; - bool success; - - _enter("{OBJ%x},%d", object->debug_id, event); - - ASSERT(list_empty(&object->dep_link)); - - parent = object->parent; - if (!parent) { - _leave(" [no parent]"); - return transit_to(DROP_OBJECT); - } - - _debug("parent: %s of:%lx", parent->state->name, parent->flags); - - if (fscache_object_is_dying(parent)) { - _leave(" [bad parent]"); - return transit_to(DROP_OBJECT); - } - - if (fscache_object_is_available(parent)) { - _leave(" [ready]"); - return transit_to(PARENT_READY); - } - - _debug("wait"); - - spin_lock(&parent->lock); - fscache_stat(&fscache_n_cop_grab_object); - success = false; - if (fscache_object_is_live(parent) && - object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) { - list_add(&object->dep_link, &parent->dependents); - success = true; - } - fscache_stat_d(&fscache_n_cop_grab_object); - spin_unlock(&parent->lock); - if (!success) { - _leave(" [grab failed]"); - return transit_to(DROP_OBJECT); - } - - /* fscache_acquire_non_index_cookie() uses this - * to wake the chain up */ - fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); - _leave(" [wait]"); - return transit_to(WAIT_FOR_PARENT); -} - 
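/*
 * For reference, the general shape of a work-state handler (sketch;
 * example_do_some_work() is assumed): do a slice of work, then either name
 * the next state or return NO_TRANSIT to be requeued in the current one.
 */
static const struct fscache_state *example_work_state(struct fscache_object *object,
						      int event)
{
	if (!example_do_some_work(object))
		return NO_TRANSIT;		/* requeue; stay in this state */
	return transit_to(WAIT_FOR_CMD);	/* done; move on */
}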
-/* - * Once the parent object is ready, we should kick off our lookup op. - */ -static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, - int event) -{ - struct fscache_object *parent = object->parent; - - _enter("{OBJ%x},%d", object->debug_id, event); - - ASSERT(parent != NULL); - - spin_lock(&parent->lock); - parent->n_ops++; - parent->n_obj_ops++; - spin_unlock(&parent->lock); - - _leave(""); - return transit_to(LOOK_UP_OBJECT); -} - -/* - * look an object up in the cache from which it was allocated - * - we hold an "access lock" on the parent object, so the parent object cannot - * be withdrawn by either party till we've finished - */ -static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, - int event) -{ - struct fscache_cookie *cookie = object->cookie; - struct fscache_object *parent = object->parent; - int ret; - - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_table = fscache_osm_lookup_oob; - - ASSERT(parent != NULL); - ASSERTCMP(parent->n_ops, >, 0); - ASSERTCMP(parent->n_obj_ops, >, 0); - - /* make sure the parent is still available */ - ASSERT(fscache_object_is_available(parent)); - - if (fscache_object_is_dying(parent) || - test_bit(FSCACHE_IOERROR, &object->cache->flags) || - !fscache_use_cookie(object)) { - _leave(" [unavailable]"); - return transit_to(LOOKUP_FAILURE); - } - - _debug("LOOKUP \"%s\" in \"%s\"", - cookie->def->name, object->cache->tag->name); - - fscache_stat(&fscache_n_object_lookups); - fscache_stat(&fscache_n_cop_lookup_object); - ret = object->cache->ops->lookup_object(object); - fscache_stat_d(&fscache_n_cop_lookup_object); - - fscache_unuse_cookie(object); - - if (ret == -ETIMEDOUT) { - /* probably stuck behind another object, so move this one to - * the back of the queue */ - fscache_stat(&fscache_n_object_lookups_timed_out); - _leave(" [timeout]"); - return NO_TRANSIT; - } - - if (ret < 0) { - _leave(" [error]"); - return transit_to(LOOKUP_FAILURE); - } - - _leave(" [ok]"); - return transit_to(OBJECT_AVAILABLE); -} - -/** - * fscache_object_lookup_negative - Note negative cookie lookup - * @object: Object pointing to cookie to mark - * - * Note negative lookup, permitting those waiting to read data from an already - * existing backing object to continue as there's no data for them to read. - */ -void fscache_object_lookup_negative(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - - if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { - fscache_stat(&fscache_n_object_lookups_negative); - - /* Allow write requests to begin stacking up and read requests to begin - * returning ENODATA. - */ - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - } - _leave(""); -} -EXPORT_SYMBOL(fscache_object_lookup_negative); - -/** - * fscache_obtained_object - Note successful object lookup or creation - * @object: Object pointing to cookie to mark - * - * Note successful lookup and/or creation, permitting those waiting to write - * data to a backing object to continue. - * - * Note that after calling this, an object's cookie may be relinquished by the - * netfs, and so must be accessed with object lock held. 
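/*
 * Sketch of a backend lookup routine using these notifications (modelled
 * on cachefiles; the mycache_* helpers are assumed).  A miss is signalled
 * before creation so that reads can start returning ENODATA immediately:
 */
static int mycache_lookup_object(struct fscache_object *object)
{
	if (!mycache_found_on_disk(object)) {
		fscache_object_lookup_negative(object);
		if (mycache_create_on_disk(object) < 0)
			return -ENOBUFS;
	}
	fscache_obtained_object(object);
	return 0;
}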
- */ -void fscache_obtained_object(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - - /* if we were still looking up, then we must have a positive lookup - * result, in which case there may be data available */ - if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { - fscache_stat(&fscache_n_object_lookups_positive); - - /* We do (presumably) have data */ - clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - /* Allow write requests to begin stacking up and read requests - * to begin shovelling data. - */ - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - } else { - fscache_stat(&fscache_n_object_created); - } - - set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); - _leave(""); -} -EXPORT_SYMBOL(fscache_obtained_object); - -/* - * handle an object that has just become available - */ -static const struct fscache_state *fscache_object_available(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_table = fscache_osm_run_oob; - - spin_lock(&object->lock); - - fscache_done_parent_op(object); - if (object->n_in_progress == 0) { - if (object->n_ops > 0) { - ASSERTCMP(object->n_ops, >=, object->n_obj_ops); - fscache_start_operations(object); - } else { - ASSERT(list_empty(&object->pending_ops)); - } - } - spin_unlock(&object->lock); - - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - fscache_stat(&fscache_n_object_avail); - - _leave(""); - return transit_to(JUMPSTART_DEPS); -} - -/* - * Wake up this object's dependent objects now that we've become available. - */ -static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) - return NO_TRANSIT; /* Not finished; requeue */ - return transit_to(WAIT_FOR_CMD); -} - -/* - * Handle lookup or creation failure. - */ -static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, - int event) -{ - struct fscache_cookie *cookie; - - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_event_mask = 0; - - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); - - cookie = object->cookie; - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - - fscache_done_parent_op(object); - return transit_to(KILL_OBJECT); -} - -/* - * Wait for completion of all active operations on this object and the death of - * all child objects of this object. - */ -static const struct fscache_state *fscache_kill_object(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x,%d,%d},%d", - object->debug_id, object->n_ops, object->n_children, event); - - fscache_mark_object_dead(object); - object->oob_event_mask = 0; - - if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { - /* Reject any new read/write ops and abort any that are pending.
*/ - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_cancel_all_ops(object); - } - - if (list_empty(&object->dependents) && - object->n_ops == 0 && - object->n_children == 0) - return transit_to(DROP_OBJECT); - - if (object->n_in_progress == 0) { - spin_lock(&object->lock); - if (object->n_ops > 0 && object->n_in_progress == 0) - fscache_start_operations(object); - spin_unlock(&object->lock); - } - - if (!list_empty(&object->dependents)) - return transit_to(KILL_DEPENDENTS); - - return transit_to(WAIT_FOR_CLEARANCE); -} - -/* - * Kill dependent objects. - */ -static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) - return NO_TRANSIT; /* Not finished */ - return transit_to(WAIT_FOR_CLEARANCE); -} - -/* - * Drop an object's attachments - */ -static const struct fscache_state *fscache_drop_object(struct fscache_object *object, - int event) -{ - struct fscache_object *parent = object->parent; - struct fscache_cookie *cookie = object->cookie; - struct fscache_cache *cache = object->cache; - bool awaken = false; - - _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); - - ASSERT(cookie != NULL); - ASSERT(!hlist_unhashed(&object->cookie_link)); - - if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) { - _debug("final update"); - fscache_update_aux_data(object); - } - - /* Make sure the cookie no longer points here and that the netfs isn't - * waiting for us. - */ - spin_lock(&cookie->lock); - hlist_del_init(&object->cookie_link); - if (hlist_empty(&cookie->backing_objects) && - test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - awaken = true; - spin_unlock(&cookie->lock); - - if (awaken) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - - - /* Prevent a race with our last child, which has to signal EV_CLEARED - * before dropping our spinlock. - */ - spin_lock(&object->lock); - spin_unlock(&object->lock); - - /* Discard from the cache's collection of objects */ - spin_lock(&cache->object_list_lock); - list_del_init(&object->cache_link); - spin_unlock(&cache->object_list_lock); - - fscache_stat(&fscache_n_cop_drop_object); - cache->ops->drop_object(object); - fscache_stat_d(&fscache_n_cop_drop_object); - - /* The parent object wants to know when all its dependents have gone */ - if (parent) { - _debug("release parent OBJ%x {%d}", - parent->debug_id, parent->n_children); - - spin_lock(&parent->lock); - parent->n_children--; - if (parent->n_children == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); - object->parent = NULL; - } - - /* this just shifts the object release to the work processor */ - fscache_put_object(object, fscache_obj_put_drop_obj); - fscache_stat(&fscache_n_object_dead); - - _leave(""); - return transit_to(OBJECT_DEAD); -} - -/* - * get a ref on an object - */ -static int fscache_get_object(struct fscache_object *object, - enum fscache_obj_ref_trace why) -{ - int ret; - - fscache_stat(&fscache_n_cop_grab_object); - ret = object->cache->ops->grab_object(object, why) ?
0 : -EAGAIN; - fscache_stat_d(&fscache_n_cop_grab_object); - return ret; -} - -/* - * Discard a ref on an object - */ -static void fscache_put_object(struct fscache_object *object, - enum fscache_obj_ref_trace why) -{ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object, why); - fscache_stat_d(&fscache_n_cop_put_object); -} - -/** - * fscache_object_destroy - Note that a cache object is about to be destroyed - * @object: The object to be destroyed - * - * Note the imminent destruction and deallocation of a cache object record. - */ -void fscache_object_destroy(struct fscache_object *object) -{ - /* We can get rid of the cookie now */ - fscache_cookie_put(object->cookie, fscache_cookie_put_object); - object->cookie = NULL; -} -EXPORT_SYMBOL(fscache_object_destroy); - -/* - * enqueue an object for metadata-type processing - */ -void fscache_enqueue_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { - wait_queue_head_t *cong_wq = - &get_cpu_var(fscache_object_cong_wait); - - if (queue_work(fscache_object_wq, &object->work)) { - if (fscache_object_congested()) - wake_up(cong_wq); - } else - fscache_put_object(object, fscache_obj_put_queue); - - put_cpu_var(fscache_object_cong_wait); - } -} - -/** - * fscache_object_sleep_till_congested - Sleep until object wq is congested - * @timeoutp: Scheduler sleep timeout - * - * Allow an object handler to sleep until the object workqueue is congested. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * %true is returned if the object wq is congested, %false otherwise. - */ -bool fscache_object_sleep_till_congested(signed long *timeoutp) -{ - wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); - DEFINE_WAIT(wait); - - if (fscache_object_congested()) - return true; - - add_wait_queue_exclusive(cong_wq, &wait); - if (!fscache_object_congested()) - *timeoutp = schedule_timeout(*timeoutp); - finish_wait(cong_wq, &wait); - - return fscache_object_congested(); -} -EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); - -/* - * Enqueue the dependents of an object for metadata-type processing. - * - * If we don't manage to finish the list before the scheduler wants to run - * again then return false immediately. We return true if the list was - * cleared. 
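/*
 * Sketch of the netfs-side coherency callback consulted by
 * fscache_check_aux() below (myfs names assumed): compare the auxiliary
 * blob held by the cache with the inode's current change attribute.
 */
static enum fscache_checkaux myfs_check_aux(void *cookie_netfs_data,
					    const void *data, uint16_t datalen,
					    loff_t object_size)
{
	struct myfs_inode *mi = cookie_netfs_data;

	if (datalen != sizeof(mi->change_attr) ||
	    memcmp(data, &mi->change_attr, datalen) != 0)
		return FSCACHE_CHECKAUX_OBSOLETE;
	return FSCACHE_CHECKAUX_OKAY;
}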
- */ -static bool fscache_enqueue_dependents(struct fscache_object *object, int event) -{ - struct fscache_object *dep; - bool ret = true; - - _enter("{OBJ%x}", object->debug_id); - - if (list_empty(&object->dependents)) - return true; - - spin_lock(&object->lock); - - while (!list_empty(&object->dependents)) { - dep = list_entry(object->dependents.next, - struct fscache_object, dep_link); - list_del_init(&dep->dep_link); - - fscache_raise_event(dep, event); - fscache_put_object(dep, fscache_obj_put_enq_dep); - - if (!list_empty(&object->dependents) && need_resched()) { - ret = false; - break; - } - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * remove an object from whatever queue it's waiting on - */ -static void fscache_dequeue_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - if (!list_empty(&object->dep_link)) { - spin_lock(&object->parent->lock); - list_del_init(&object->dep_link); - spin_unlock(&object->parent->lock); - } - - _leave(""); -} - -/** - * fscache_check_aux - Ask the netfs whether an object on disk is still valid - * @object: The object to ask about - * @data: The auxiliary data for the object - * @datalen: The size of the auxiliary data - * @object_size: The size of the object according to the server. - * - * This function consults the netfs about the coherency state of an object. - * The caller must be holding a ref on cookie->n_active (held by - * fscache_look_up_object() on behalf of the cache backend during object lookup - * and creation). - */ -enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, uint16_t datalen, - loff_t object_size) -{ - enum fscache_checkaux result; - - if (!object->cookie->def->check_aux) { - fscache_stat(&fscache_n_checkaux_none); - return FSCACHE_CHECKAUX_OKAY; - } - - result = object->cookie->def->check_aux(object->cookie->netfs_data, - data, datalen, object_size); - switch (result) { - /* entry okay as is */ - case FSCACHE_CHECKAUX_OKAY: - fscache_stat(&fscache_n_checkaux_okay); - break; - - /* entry requires update */ - case FSCACHE_CHECKAUX_NEEDS_UPDATE: - fscache_stat(&fscache_n_checkaux_update); - break; - - /* entry requires deletion */ - case FSCACHE_CHECKAUX_OBSOLETE: - fscache_stat(&fscache_n_checkaux_obsolete); - break; - - default: - BUG(); - } - - return result; -} -EXPORT_SYMBOL(fscache_check_aux); - -/* - * Asynchronously invalidate an object. - */ -static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, - int event) -{ - struct fscache_operation *op; - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x},%d", object->debug_id, event); - - /* We're going to need the cookie. If the cookie is not available then - * retire the object instead. - */ - if (!fscache_use_cookie(object)) { - ASSERT(radix_tree_empty(&object->cookie->stores)); - set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); - _leave(" [no cookie]"); - return transit_to(KILL_OBJECT); - } - - /* Reject any new read/write ops and abort any that are pending. 
*/ - fscache_invalidate_writes(cookie); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_cancel_all_ops(object); - - /* Now we have to wait for in-progress reads and writes */ - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - goto nomem; - - fscache_operation_init(cookie, op, object->cache->ops->invalidate_object, - NULL, NULL); - op->flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_EXCLUSIVE) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate); - - spin_lock(&cookie->lock); - if (fscache_submit_exclusive_op(object, op) < 0) - goto submit_op_failed; - spin_unlock(&cookie->lock); - fscache_put_operation(op); - - /* Once we've completed the invalidation, we know there will be no data - * stored in the cache and thus we can reinstate the data-check-skip - * optimisation. - */ - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - - /* We can allow read and write requests to come in once again. They'll - * queue up behind our exclusive invalidation operation. - */ - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - _leave(" [ok]"); - return transit_to(UPDATE_OBJECT); - -nomem: - fscache_mark_object_dead(object); - fscache_unuse_cookie(object); - _leave(" [ENOMEM]"); - return transit_to(KILL_OBJECT); - -submit_op_failed: - fscache_mark_object_dead(object); - spin_unlock(&cookie->lock); - fscache_unuse_cookie(object); - kfree(op); - _leave(" [EIO]"); - return transit_to(KILL_OBJECT); -} - -static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, - int event) -{ - const struct fscache_state *s; - - fscache_stat(&fscache_n_invalidates_run); - fscache_stat(&fscache_n_cop_invalidate_object); - s = _fscache_invalidate_object(object, event); - fscache_stat_d(&fscache_n_cop_invalidate_object); - return s; -} - -/* - * Update auxiliary data. - */ -static void fscache_update_aux_data(struct fscache_object *object) -{ - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); -} - -/* - * Asynchronously update an object. - */ -static const struct fscache_state *fscache_update_object(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - fscache_update_aux_data(object); - - _leave(""); - return transit_to(WAIT_FOR_CMD); -} - -/** - * fscache_object_retrying_stale - Note retrying stale object - * @object: The object that will be retried - * - * Note that an object lookup found an on-disk object that was adjudged to be - * stale and has been deleted. The lookup will be retried. - */ -void fscache_object_retrying_stale(struct fscache_object *object) -{ - fscache_stat(&fscache_n_cache_no_space_reject); -} -EXPORT_SYMBOL(fscache_object_retrying_stale); - -/** - * fscache_object_mark_killed - Note that an object was killed - * @object: The object that was culled - * @why: The reason the object was killed. - * - * Note that an object was killed and account the reason. If the object has - * already been marked as killed by the cache, an error is logged instead.
- */ -void fscache_object_mark_killed(struct fscache_object *object, - enum fscache_why_object_killed why) -{ - if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { - pr_err("Error: Object already killed by cache [%s]\n", - object->cache->identifier); - return; - } - - switch (why) { - case FSCACHE_OBJECT_NO_SPACE: - fscache_stat(&fscache_n_cache_no_space_reject); - break; - case FSCACHE_OBJECT_IS_STALE: - fscache_stat(&fscache_n_cache_stale_objects); - break; - case FSCACHE_OBJECT_WAS_RETIRED: - fscache_stat(&fscache_n_cache_retired_objects); - break; - case FSCACHE_OBJECT_WAS_CULLED: - fscache_stat(&fscache_n_cache_culled_objects); - break; - } -} -EXPORT_SYMBOL(fscache_object_mark_killed); - -/* - * The object is dead. We can get here if an object gets queued by an event - * that would lead to its death (such as EV_KILL) when the dispatcher is - * already running (and so can be requeued) but hasn't yet cleared the event - * mask. - */ -static const struct fscache_state *fscache_object_dead(struct fscache_object *object, - int event) -{ - if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, - &object->flags)) - return NO_TRANSIT; - - WARN(true, "FS-Cache object redispatched after death"); - return NO_TRANSIT; -} diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c deleted file mode 100644 index e002cdfaf3cc..000000000000 --- a/fs/fscache/operation.c +++ /dev/null @@ -1,633 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache worker operation management routines - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * See Documentation/filesystems/caching/operations.rst - */ - -#define FSCACHE_DEBUG_LEVEL OPERATION -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include "internal.h" - -atomic_t fscache_op_debug_id; -EXPORT_SYMBOL(fscache_op_debug_id); - -static void fscache_operation_dummy_cancel(struct fscache_operation *op) -{ -} - -/** - * fscache_operation_init - Do basic initialisation of an operation - * @cookie: The cookie to operate on - * @op: The operation to initialise - * @processor: The function to perform the operation - * @cancel: A function to handle operation cancellation - * @release: The release function to assign - * - * Do basic initialisation of an operation. The caller must still set flags, - * object and processor if needed. - */ -void fscache_operation_init(struct fscache_cookie *cookie, - struct fscache_operation *op, - fscache_operation_processor_t processor, - fscache_operation_cancel_t cancel, - fscache_operation_release_t release) -{ - INIT_WORK(&op->work, fscache_op_work_func); - atomic_set(&op->usage, 1); - op->state = FSCACHE_OP_ST_INITIALISED; - op->debug_id = atomic_inc_return(&fscache_op_debug_id); - op->processor = processor; - op->cancel = cancel ?: fscache_operation_dummy_cancel; - op->release = release; - INIT_LIST_HEAD(&op->pend_link); - fscache_stat(&fscache_n_op_initialised); - trace_fscache_op(cookie, op, fscache_op_init); -} -EXPORT_SYMBOL(fscache_operation_init); - -/** - * fscache_enqueue_operation - Enqueue an operation for processing - * @op: The operation to enqueue - * - * Enqueue an operation for processing by the FS-Cache thread pool. - * - * This will get its own ref on the object. 
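/*
 * Sketch of the old operation lifecycle around the helpers in this file
 * (example_* names assumed): allocate, initialise, submit under the cookie
 * lock, then drop the submitter's reference.
 */
static void example_processor(struct fscache_operation *op)
{
	/* ...do the deferred work, then... */
	fscache_op_complete(op, false);
}

static int example_submit(struct fscache_cookie *cookie,
			  struct fscache_object *object)
{
	struct fscache_operation *op;
	int ret;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	fscache_operation_init(cookie, op, example_processor, NULL, NULL);
	op->flags = FSCACHE_OP_ASYNC;

	spin_lock(&cookie->lock);
	ret = fscache_submit_op(object, op);
	spin_unlock(&cookie->lock);

	fscache_put_operation(op);
	return ret;
}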
- */ -void fscache_enqueue_operation(struct fscache_operation *op) -{ - struct fscache_cookie *cookie = op->object->cookie; - - _enter("{OBJ%x OP%x,%u}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - - ASSERT(list_empty(&op->pend_link)); - ASSERT(op->processor != NULL); - ASSERT(fscache_object_is_available(op->object)); - ASSERTCMP(atomic_read(&op->usage), >, 0); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS, - op->state, ==, FSCACHE_OP_ST_CANCELLED); - - fscache_stat(&fscache_n_op_enqueue); - switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_ASYNC: - trace_fscache_op(cookie, op, fscache_op_enqueue_async); - _debug("queue async"); - atomic_inc(&op->usage); - if (!queue_work(fscache_op_wq, &op->work)) - fscache_put_operation(op); - break; - case FSCACHE_OP_MYTHREAD: - trace_fscache_op(cookie, op, fscache_op_enqueue_mythread); - _debug("queue for caller's attention"); - break; - default: - pr_err("Unexpected op type %lx", op->flags); - BUG(); - break; - } -} -EXPORT_SYMBOL(fscache_enqueue_operation); - -/* - * start an op running - */ -static void fscache_run_op(struct fscache_object *object, - struct fscache_operation *op) -{ - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); - - op->state = FSCACHE_OP_ST_IN_PROGRESS; - object->n_in_progress++; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - if (op->processor) - fscache_enqueue_operation(op); - else - trace_fscache_op(object->cookie, op, fscache_op_run); - fscache_stat(&fscache_n_op_run); -} - -/* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* - * submit an exclusive operation for an object - * - other ops are excluded from running simultaneously with this one - * - this gets any extra refs it needs on an op - */ -int fscache_submit_exclusive_op(struct fscache_object *object, - struct fscache_operation *op) -{ - const struct fscache_state *ostate; - unsigned long flags; - int ret; - - _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); - - trace_fscache_op(object->cookie, op, fscache_op_submit_ex); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - ASSERTCMP(object->n_ops, >=, object->n_in_progress); - ASSERTCMP(object->n_ops, >=, object->n_exclusive); - ASSERT(list_empty(&op->pend_link)); - - ostate = object->state; - smp_rmb(); - - op->state = FSCACHE_OP_ST_PENDING; - flags = READ_ONCE(object->flags); - if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { - fscache_stat(&fscache_n_op_rejected); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - 
} else if (unlikely(fscache_cache_is_broken(object))) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -EIO; - } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { - op->object = object; - object->n_ops++; - object->n_exclusive++; /* reads and writes must wait */ - - if (object->n_in_progress > 0) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - } else if (!list_empty(&object->pending_ops)) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - fscache_start_operations(object); - } else { - ASSERTCMP(object->n_in_progress, ==, 0); - fscache_run_op(object, op); - } - - /* need to issue a new write op after this */ - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { - op->object = object; - object->n_ops++; - object->n_exclusive++; /* reads and writes must wait */ - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { - fscache_report_unexpected_submission(object, op, ostate); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * submit an operation for an object - * - objects may be submitted only in the following states: - * - during object creation (write ops may be submitted) - * - whilst the object is active - * - after an I/O error incurred in one of the two above states (op rejected) - * - this gets any extra refs it needs on an op - */ -int fscache_submit_op(struct fscache_object *object, - struct fscache_operation *op) -{ - const struct fscache_state *ostate; - unsigned long flags; - int ret; - - _enter("{OBJ%x OP%x},{%u}", - object->debug_id, op->debug_id, atomic_read(&op->usage)); - - trace_fscache_op(object->cookie, op, fscache_op_submit); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - ASSERTCMP(object->n_ops, >=, object->n_in_progress); - ASSERTCMP(object->n_ops, >=, object->n_exclusive); - ASSERT(list_empty(&op->pend_link)); - - ostate = object->state; - smp_rmb(); - - op->state = FSCACHE_OP_ST_PENDING; - flags = READ_ONCE(object->flags); - if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { - fscache_stat(&fscache_n_op_rejected); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else if (unlikely(fscache_cache_is_broken(object))) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -EIO; - } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { - op->object = object; - object->n_ops++; - - if (object->n_exclusive > 0) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - } else if (!list_empty(&object->pending_ops)) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - fscache_start_operations(object); - } else { - ASSERTCMP(object->n_exclusive, ==, 0); - fscache_run_op(object, op); - } - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { - op->object = object; - object->n_ops++; - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - 
fscache_stat(&fscache_n_op_pend); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { - fscache_report_unexpected_submission(object, op, ostate); - ASSERT(!fscache_object_is_active(object)); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * queue an object for withdrawal on error, aborting all following asynchronous - * operations - */ -void fscache_abort_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); -} - -/* - * Jump start the operation processing on an object. The caller must hold - * object->lock. - */ -void fscache_start_operations(struct fscache_object *object) -{ - struct fscache_operation *op; - bool stop = false; - - while (!list_empty(&object->pending_ops) && !stop) { - op = list_entry(object->pending_ops.next, - struct fscache_operation, pend_link); - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { - if (object->n_in_progress > 0) - break; - stop = true; - } - list_del_init(&op->pend_link); - fscache_run_op(object, op); - - /* the pending queue was holding a ref on the object */ - fscache_put_operation(op); - } - - ASSERTCMP(object->n_in_progress, <=, object->n_ops); - - _debug("woke %d ops on OBJ%x", - object->n_in_progress, object->debug_id); -} - -/* - * cancel an operation that's pending on an object - */ -int fscache_cancel_op(struct fscache_operation *op, - bool cancel_in_progress_op) -{ - struct fscache_object *object = op->object; - bool put = false; - int ret; - - _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); - - trace_fscache_op(object->cookie, op, fscache_op_cancel); - - ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); - ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - - ret = -EBUSY; - if (op->state == FSCACHE_OP_ST_PENDING) { - ASSERT(!list_empty(&op->pend_link)); - list_del_init(&op->pend_link); - put = true; - - fscache_stat(&fscache_n_op_cancelled); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - ret = 0; - } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { - ASSERTCMP(object->n_in_progress, >, 0); - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); - - fscache_stat(&fscache_n_op_cancelled); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - ret = 0; - } - - if (put) - fscache_put_operation(op); - spin_unlock(&object->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * Cancel all pending operations on an object - */ -void fscache_cancel_all_ops(struct fscache_object *object) -{ - struct fscache_operation *op; - - _enter("OBJ%x", object->debug_id); - - spin_lock(&object->lock); - - while (!list_empty(&object->pending_ops)) { - op = list_entry(object->pending_ops.next, - struct fscache_operation, pend_link); - fscache_stat(&fscache_n_op_cancelled); - 
list_del_init(&op->pend_link); - - trace_fscache_op(object->cookie, op, fscache_op_cancel_all); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); - cond_resched_lock(&object->lock); - } - - spin_unlock(&object->lock); - _leave(""); -} - -/* - * Record the completion or cancellation of an in-progress operation. - */ -void fscache_op_complete(struct fscache_operation *op, bool cancelled) -{ - struct fscache_object *object = op->object; - - _enter("OBJ%x", object->debug_id); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); - ASSERTCMP(object->n_in_progress, >, 0); - ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), - object->n_exclusive, >, 0); - ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), - object->n_in_progress, ==, 1); - - spin_lock(&object->lock); - - if (!cancelled) { - trace_fscache_op(object->cookie, op, fscache_op_completed); - op->state = FSCACHE_OP_ST_COMPLETE; - } else { - op->cancel(op); - trace_fscache_op(object->cookie, op, fscache_op_cancelled); - op->state = FSCACHE_OP_ST_CANCELLED; - } - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); - - spin_unlock(&object->lock); - _leave(""); -} -EXPORT_SYMBOL(fscache_op_complete); - -/* - * release an operation - * - queues pending ops if this is the last in-progress op - */ -void fscache_put_operation(struct fscache_operation *op) -{ - struct fscache_object *object; - struct fscache_cache *cache; - - _enter("{OBJ%x OP%x,%d}", - op->object ? op->object->debug_id : 0, - op->debug_id, atomic_read(&op->usage)); - - ASSERTCMP(atomic_read(&op->usage), >, 0); - - if (!atomic_dec_and_test(&op->usage)) - return; - - trace_fscache_op(op->object ? op->object->cookie : NULL, op, fscache_op_put); - - _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && - op->state != FSCACHE_OP_ST_COMPLETE, - op->state, ==, FSCACHE_OP_ST_CANCELLED); - - fscache_stat(&fscache_n_op_release); - - if (op->release) { - op->release(op); - op->release = NULL; - } - op->state = FSCACHE_OP_ST_DEAD; - - object = op->object; - if (likely(object)) { - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... 
we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); - - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; - } - - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - } - - kfree(op); - _leave(" [done]"); -} -EXPORT_SYMBOL(fscache_put_operation); - -/* - * garbage collect operations that have had their release deferred - */ -void fscache_operation_gc(struct work_struct *work) -{ - struct fscache_operation *op; - struct fscache_object *object; - struct fscache_cache *cache = - container_of(work, struct fscache_cache, op_gc); - int count = 0; - - _enter(""); - - do { - spin_lock(&cache->op_gc_list_lock); - if (list_empty(&cache->op_gc_list)) { - spin_unlock(&cache->op_gc_list_lock); - break; - } - - op = list_entry(cache->op_gc_list.next, - struct fscache_operation, pend_link); - list_del(&op->pend_link); - spin_unlock(&cache->op_gc_list_lock); - - object = op->object; - trace_fscache_op(object->cookie, op, fscache_op_gc); - - spin_lock(&object->lock); - - _debug("GC DEFERRED REL OBJ%x OP%x", - object->debug_id, op->debug_id); - fscache_stat(&fscache_n_op_gc); - - ASSERTCMP(atomic_read(&op->usage), ==, 0); - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD); - - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); - - } while (count++ < 20); - - if (!list_empty(&cache->op_gc_list)) - schedule_work(&cache->op_gc); - - _leave(""); -} - -/* - * execute an operation using fs_op_wq to provide processing context - - * the caller holds a ref to this object, so we don't need to hold one - */ -void fscache_op_work_func(struct work_struct *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, work); - - _enter("{OBJ%x OP%x,%d}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - - trace_fscache_op(op->object->cookie, op, fscache_op_work); - - ASSERT(op->processor != NULL); - op->processor(op); - fscache_put_operation(op); - - _leave(""); -} diff --git a/fs/fscache/page.c b/fs/fscache/page.c deleted file mode 100644 index 27df94ef0e0b..000000000000 --- a/fs/fscache/page.c +++ /dev/null @@ -1,1242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Cache page management and data I/O routines - * - * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. 
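
The deferred-release path above is a general trylock-or-defer pattern: fscache_put_operation() may be reached with object->lock already held further up the stack, so if the lock cannot be taken immediately the final cleanup is parked on cache->op_gc_list for fscache_operation_gc() to finish later. A small pthreads sketch of the same pattern, with stand-in names for the kernel structures:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;
static int gc_queued;			/* stands in for cache->op_gc_list */

static void release_op(const char *who)
{
	if (pthread_mutex_trylock(&object_lock) != 0) {
		/* contended: defer, as counted by fscache_n_op_deferred_release */
		pthread_mutex_lock(&gc_lock);
		gc_queued++;
		pthread_mutex_unlock(&gc_lock);
		printf("%s: deferred to GC list\n", who);
		return;
	}
	printf("%s: released inline\n", who);
	pthread_mutex_unlock(&object_lock);
}

int main(void)
{
	release_op("first");			/* lock free: inline */
	pthread_mutex_lock(&object_lock);	/* simulate a caller holding it */
	release_op("second");			/* contended: deferred */
	pthread_mutex_unlock(&object_lock);
	printf("%d op(s) await GC\n", gc_queued);
	return 0;
}
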
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL PAGE -#include <linux/module.h> -#include <linux/fscache-cache.h> -#include <linux/buffer_head.h> -#include <linux/pagevec.h> -#include <linux/slab.h> -#include "internal.h" - -/* - * check to see if a page is being written to the cache - */ -bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) -{ - void *val; - - rcu_read_lock(); - val = radix_tree_lookup(&cookie->stores, page->index); - rcu_read_unlock(); - trace_fscache_check_page(cookie, page, val, 0); - - return val != NULL; -} -EXPORT_SYMBOL(__fscache_check_page_write); - -/* - * wait for a page to finish being written to the cache - */ -void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) -{ - wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); - - trace_fscache_page(cookie, page, fscache_page_write_wait); - - wait_event(*wq, !__fscache_check_page_write(cookie, page)); -} -EXPORT_SYMBOL(__fscache_wait_on_page_write); - -/* - * wait for a page to finish being written to the cache. Put a timeout here - * since we might be called recursively via parent fs. - */ -static -bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) -{ - wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); - - return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), - HZ); -} - -/* - * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged - */ -bool __fscache_maybe_release_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - struct page *xpage; - void *val; - - _enter("%p,%p,%x", cookie, page, gfp); - - trace_fscache_page(cookie, page, fscache_page_maybe_release); - -try_again: - rcu_read_lock(); - val = radix_tree_lookup(&cookie->stores, page->index); - if (!val) { - rcu_read_unlock(); - fscache_stat(&fscache_n_store_vmscan_not_storing); - __fscache_uncache_page(cookie, page); - return true; - } - - /* see if the page is actually undergoing storage - if so we can't get - * rid of it till the cache has finished with it */ - if (radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG)) { - rcu_read_unlock(); - goto page_busy; - } - - /* the page is pending storage, so we attempt to cancel the store and - * discard the store request so that the page can be reclaimed */ - spin_lock(&cookie->stores_lock); - rcu_read_unlock(); - - if (radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG)) { - /* the page started to undergo storage whilst we were looking, - * so now we can only wait or return */ - spin_unlock(&cookie->stores_lock); - goto page_busy; - } - - xpage = radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - spin_unlock(&cookie->stores_lock); - - if (xpage) { - fscache_stat(&fscache_n_store_vmscan_cancelled); - fscache_stat(&fscache_n_store_radix_deletes); - ASSERTCMP(xpage, ==, page); - } else { - fscache_stat(&fscache_n_store_vmscan_gone); - } - - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - if (xpage) - put_page(xpage); - __fscache_uncache_page(cookie, page); - return true; - -page_busy: - /* We will wait here if we're allowed to, but that could deadlock the - * allocator as the work threads writing to the cache may all end up - * sleeping on memory allocation, so we may need to impose a timeout - * too. 
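
The comment above is the crux of __fscache_maybe_release_page(): vmscan may only be made to sleep when the gfp mask permits both direct reclaim and FS re-entry, and even then the wait is bounded, after which direct reclaim is stripped so the retry cannot sleep a second time. A hedged sketch of that gating, with illustrative flag values rather than the kernel's real gfp bits:

#include <stdbool.h>
#include <stdio.h>

#define GFP_DIRECT_RECLAIM 0x1		/* illustrative values, not the kernel's */
#define GFP_FS             0x2

static bool page_busy(int attempt)
{
	return attempt == 0;		/* pretend the write completes once */
}

static bool maybe_release(unsigned gfp)
{
	int attempt = 0;

	while (page_busy(attempt)) {
		if (!(gfp & GFP_DIRECT_RECLAIM) || !(gfp & GFP_FS))
			return false;	/* mustn't sleep here: tell vmscan "busy" */
		/* a bounded wait would go here (HZ timeout in the original) */
		gfp &= ~GFP_DIRECT_RECLAIM;	/* never sleep twice */
		attempt++;
	}
	return true;
}

int main(void)
{
	printf("atomic ctx: %s\n", maybe_release(0) ? "released" : "busy");
	printf("sleepable : %s\n",
	       maybe_release(GFP_DIRECT_RECLAIM | GFP_FS) ? "released" : "busy");
	return 0;
}
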
*/ - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { - fscache_stat(&fscache_n_store_vmscan_busy); - return false; - } - - fscache_stat(&fscache_n_store_vmscan_wait); - if (!release_page_wait_timeout(cookie, page)) - _debug("fscache writeout timeout page: %p{%lx}", - page, page->index); - - gfp &= ~__GFP_DIRECT_RECLAIM; - goto try_again; -} -EXPORT_SYMBOL(__fscache_maybe_release_page); - -/* - * note that a page has finished being written to the cache - */ -static void fscache_end_page_write(struct fscache_object *object, - struct page *page) -{ - struct fscache_cookie *cookie; - struct page *xpage = NULL, *val; - - spin_lock(&object->lock); - cookie = object->cookie; - if (cookie) { - /* delete the page from the tree if it is now no longer - * pending */ - spin_lock(&cookie->stores_lock); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_clear_store); - if (!radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG)) { - fscache_stat(&fscache_n_store_radix_deletes); - xpage = radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - trace_fscache_page(cookie, page, fscache_page_write_end); - - val = radix_tree_lookup(&cookie->stores, page->index); - trace_fscache_check_page(cookie, page, val, 1); - } else { - trace_fscache_page(cookie, page, fscache_page_write_end_pend); - } - spin_unlock(&cookie->stores_lock); - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - } else { - trace_fscache_page(cookie, page, fscache_page_write_end_noc); - } - spin_unlock(&object->lock); - if (xpage) - put_page(xpage); -} - -/* - * actually apply the changed attributes to a cache object - */ -static void fscache_attr_changed_op(struct fscache_operation *op) -{ - struct fscache_object *object = op->object; - int ret; - - _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); - - fscache_stat(&fscache_n_attr_changed_calls); - - if (fscache_object_is_active(object)) { - fscache_stat(&fscache_n_cop_attr_changed); - ret = object->cache->ops->attr_changed(object); - fscache_stat_d(&fscache_n_cop_attr_changed); - if (ret < 0) - fscache_abort_object(object); - fscache_op_complete(op, ret < 0); - } else { - fscache_op_complete(op, true); - } - - _leave(""); -} - -/* - * notification that the attributes on an object have changed - */ -int __fscache_attr_changed(struct fscache_cookie *cookie) -{ - struct fscache_operation *op; - struct fscache_object *object; - bool wake_cookie = false; - - _enter("%p", cookie); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - - fscache_stat(&fscache_n_attr_changed); - - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - fscache_stat(&fscache_n_attr_changed_nomem); - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed); - op->flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_EXCLUSIVE) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - __fscache_use_cookie(cookie); - if (fscache_submit_exclusive_op(object, op) < 0) - goto nobufs_dec; - spin_unlock(&cookie->lock); - fscache_stat(&fscache_n_attr_changed_ok); - 
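
fscache_end_page_write() above is one end of a two-tag handshake on cookie->stores: a queued page carries the PENDING tag, the writer flips it to STORING for the duration of the write, and the page only leaves the tree if PENDING was not set again in the meantime. A userspace model of the handshake, using plain arrays in place of the radix tree and its tags:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 4
static bool present[NPAGES], pending[NPAGES], storing[NPAGES];

static void queue_store(int ix)		/* __fscache_write_page queues it */
{
	present[ix] = pending[ix] = true;
}

static void begin_store(int ix)		/* the writer picks a page */
{
	storing[ix] = true;
	pending[ix] = false;
}

static void end_store(int ix)		/* fscache_end_page_write */
{
	storing[ix] = false;
	if (!pending[ix]) {		/* not re-queued meanwhile */
		present[ix] = false;
		printf("page %d dropped from tree\n", ix);
	}
}

int main(void)
{
	queue_store(2);
	begin_store(2);
	queue_store(2);			/* dirtied again mid-write */
	end_store(2);			/* stays in tree: still pending */
	printf("page 2 %s\n", present[2] ? "still queued" : "gone");
	return 0;
}
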
fscache_put_operation(op); - _leave(" = 0"); - return 0; - -nobufs_dec: - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs: - spin_unlock(&cookie->lock); - fscache_put_operation(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_stat(&fscache_n_attr_changed_nobufs); - _leave(" = %d", -ENOBUFS); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_attr_changed); - -/* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* - * release a retrieval op reference - */ -static void fscache_release_retrieval_op(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - _enter("{OP%x}", op->op.debug_id); - - ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, - atomic_read(&op->n_pages), ==, 0); - - if (op->context) - fscache_put_context(op->cookie, op->context); - - _leave(""); -} - -/* - * allocate a retrieval op - */ -struct fscache_retrieval *fscache_alloc_retrieval( - struct fscache_cookie *cookie, - struct address_space *mapping, - fscache_rw_complete_t end_io_func, - void *context) -{ - struct fscache_retrieval *op; - - /* allocate a retrieval operation and attempt to submit it */ - op = kzalloc(sizeof(*op), GFP_NOIO); - if (!op) { - fscache_stat(&fscache_n_retrievals_nomem); - return NULL; - } - - fscache_operation_init(cookie, &op->op, NULL, - fscache_do_cancel_retrieval, - fscache_release_retrieval_op); - op->op.flags = FSCACHE_OP_MYTHREAD | - (1UL << FSCACHE_OP_WAITING) | - (1UL << FSCACHE_OP_UNUSE_COOKIE); - op->cookie = cookie; - op->mapping = mapping; - op->end_io_func = end_io_func; - op->context = context; - INIT_LIST_HEAD(&op->to_do); - - /* Pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure. 
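
fscache_alloc_retrieval() above wires an operation up with its own cancel and release hooks and pins the caller's read context so a fallback netfs read remains possible if the cache read fails. The following sketch shows that shape only; the types and helpers here are invented for illustration, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct context { int refs; };

struct retrieval {
	struct context *ctx;
	void (*cancel)(struct retrieval *);
	void (*release)(struct retrieval *);
};

static void retrieval_cancel(struct retrieval *op)
{
	printf("cancel: zero the outstanding page count\n");
}

static void retrieval_release(struct retrieval *op)
{
	if (op->ctx && --op->ctx->refs == 0)
		printf("release: context unpinned\n");
}

static struct retrieval *alloc_retrieval(struct context *ctx)
{
	struct retrieval *op = calloc(1, sizeof(*op));

	if (!op)
		return NULL;
	op->cancel = retrieval_cancel;
	op->release = retrieval_release;
	op->ctx = ctx;
	if (ctx)
		ctx->refs++;	/* pin in case we must fall back to the netfs */
	return op;
}

int main(void)
{
	struct context ctx = { .refs = 0 };	/* only the op pins it here */
	struct retrieval *op = alloc_retrieval(&ctx);

	op->cancel(op);
	op->release(op);
	free(op);
	return 0;
}
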
- */ - if (context) - fscache_get_context(op->cookie, context); - return op; -} - -/* - * wait for a deferred lookup to complete - */ -int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) -{ - _enter(""); - - if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { - _leave(" = 0 [imm]"); - return 0; - } - - fscache_stat(&fscache_n_retrievals_wait); - - if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_INTERRUPTIBLE) != 0) { - fscache_stat(&fscache_n_retrievals_intr); - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; - } - - ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); - - smp_rmb(); - _leave(" = 0 [dly]"); - return 0; -} - -/* - * wait for an object to become active (or dead) - */ -int fscache_wait_for_operation_activation(struct fscache_object *object, - struct fscache_operation *op, - atomic_t *stat_op_waits, - atomic_t *stat_object_dead) -{ - int ret; - - if (!test_bit(FSCACHE_OP_WAITING, &op->flags)) - goto check_if_dead; - - _debug(">>> WT"); - if (stat_op_waits) - fscache_stat(stat_op_waits); - if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_INTERRUPTIBLE) != 0) { - trace_fscache_op(object->cookie, op, fscache_op_signal); - ret = fscache_cancel_op(op, false); - if (ret == 0) - return -ERESTARTSYS; - - /* it's been removed from the pending queue by another party, - * so we should get to run shortly */ - wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - -check_if_dead: - if (op->state == FSCACHE_OP_ST_CANCELLED) { - if (stat_object_dead) - fscache_stat(stat_object_dead); - _leave(" = -ENOBUFS [cancelled]"); - return -ENOBUFS; - } - if (unlikely(fscache_object_is_dying(object) || - fscache_cache_is_broken(object))) { - enum fscache_operation_state state = op->state; - trace_fscache_op(object->cookie, op, fscache_op_signal); - fscache_cancel_op(op, true); - if (stat_object_dead) - fscache_stat(stat_object_dead); - _leave(" = -ENOBUFS [obj dead %d]", state); - return -ENOBUFS; - } - return 0; -} - -/* - * read a page from the cache or allocate a block in which to store it - * - we return: - * -ENOMEM - out of memory, nothing done - * -ERESTARTSYS - interrupted - * -ENOBUFS - no backing object available in which to cache the block - * -ENODATA - no data available in the backing object for this block - * 0 - dispatched a read - it'll call end_io_func() when finished - */ -int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, - struct page *page, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%p,,,", cookie, page); - - fscache_stat(&fscache_n_retrievals); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, page->mapping, - end_io_func, context); - if (!op) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - atomic_set(&op->n_pages, 1); - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - 
struct fscache_object, cookie_link); - - ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); - - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); - - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_retrieval_ops); - - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { - fscache_stat(&fscache_n_cop_allocate_page); - ret = object->cache->ops->allocate_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_allocate_page); - if (ret == 0) - ret = -ENODATA; - } else { - fscache_stat(&fscache_n_cop_read_or_alloc_page); - ret = object->cache->ops->read_or_alloc_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_read_or_alloc_page); - } - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_put_retrieval(op); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_read_or_alloc_page); - -/* - * read a list of page from the cache or allocate a block in which to store - * them - * - we return: - * -ENOMEM - out of memory, some pages may be being read - * -ERESTARTSYS - interrupted, some pages may be being read - * -ENOBUFS - no backing object or space available in which to cache any - * pages not being read - * -ENODATA - no data available in the backing object for some or all of - * the pages - * 0 - dispatched a read on all pages - * - * end_io_func() will be called for each page read from the cache as it is - * finishes being read - * - * any pages for which a read is dispatched will be removed from pages and - * nr_pages - */ -int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,,%d,,,", cookie, *nr_pages); - - fscache_stat(&fscache_n_retrievals); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(*nr_pages, >, 0); - ASSERT(!list_empty(pages)); - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); - if (!op) - return -ENOMEM; - 
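
The error ladder at the end of __fscache_read_or_alloc_page() fixes the return-code contract spelled out in the comments: 0 means a read was dispatched and end_io_func() will run later, while each failure code maps to exactly one statistics counter. A compact sketch of that mapping, with shortened stand-in counter names:

#include <errno.h>
#include <stdio.h>

static int n_nomem, n_intr, n_nodata, n_nobufs, n_ok;

static void account_retrieval(int ret)
{
	if (ret == -ENOMEM)
		n_nomem++;
	else if (ret == -ERESTARTSYS)
		n_intr++;
	else if (ret == -ENODATA)
		n_nodata++;
	else if (ret < 0)
		n_nobufs++;		/* any other error counts as "no buffers" */
	else
		n_ok++;
}

int main(void)
{
	int results[] = { 0, -ENODATA, -ENOBUFS, -ENOMEM, -ERESTARTSYS, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(results) / sizeof(results[0]); i++)
		account_retrieval(results[i]);
	printf("ok=%d nodata=%d nobufs=%d nomem=%d intr=%d\n",
	       n_ok, n_nodata, n_nobufs, n_nomem, n_intr);
	return 0;
}
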
atomic_set(&op->n_pages, *nr_pages); - trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); - - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_retrieval_ops); - - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { - fscache_stat(&fscache_n_cop_allocate_pages); - ret = object->cache->ops->allocate_pages( - op, pages, nr_pages, gfp); - fscache_stat_d(&fscache_n_cop_allocate_pages); - } else { - fscache_stat(&fscache_n_cop_read_or_alloc_pages); - ret = object->cache->ops->read_or_alloc_pages( - op, pages, nr_pages, gfp); - fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); - } - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_read_or_alloc_pages); - -/* - * allocate a block in the cache on which to store a page - * - we return: - * -ENOMEM - out of memory, nothing done - * -ERESTARTSYS - interrupted - * -ENOBUFS - no backing object available in which to cache the block - * 0 - block allocated - */ -int __fscache_alloc_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%p,,,", cookie, page); - - fscache_stat(&fscache_n_allocs); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); - if (!op) - return -ENOMEM; - atomic_set(&op->n_pages, 1); - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - 
__fscache_use_cookie(cookie); - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_alloc_ops); - - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - fscache_stat(&fscache_n_cop_allocate_page); - ret = object->cache->ops->allocate_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_allocate_page); - -error: - if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_allocs_intr); - else if (ret < 0) - fscache_stat(&fscache_n_allocs_nobufs); - else - fscache_stat(&fscache_n_allocs_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_allocs_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_alloc_page); - -/* - * Unmark pages allocate in the readahead code path (via: - * fscache_readpages_or_alloc) after delegating to the base filesystem - */ -void __fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages) -{ - struct page *page; - - list_for_each_entry(page, pages, lru) { - if (PageFsCache(page)) - __fscache_uncache_page(cookie, page); - } -} -EXPORT_SYMBOL(__fscache_readpages_cancel); - -/* - * release a write op reference - */ -static void fscache_release_write_op(struct fscache_operation *_op) -{ - _enter("{OP%x}", _op->debug_id); -} - -/* - * perform the background storage of a page into the cache - */ -static void fscache_write_op(struct fscache_operation *_op) -{ - struct fscache_storage *op = - container_of(_op, struct fscache_storage, op); - struct fscache_object *object = op->op.object; - struct fscache_cookie *cookie; - struct page *page; - unsigned n; - void *results[1]; - int ret; - - _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); - -again: - spin_lock(&object->lock); - cookie = object->cookie; - - if (!fscache_object_is_active(object)) { - /* If we get here, then the on-disk cache object likely no - * longer exists, so we should just cancel this write - * operation. - */ - spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); - _leave(" [inactive]"); - return; - } - - if (!cookie) { - /* If we get here, then the cookie belonging to the object was - * detached, probably by the cookie being withdrawn due to - * memory pressure, which means that the pages we might write - * to the cache from no longer exist - therefore, we can just - * cancel this write operation. 
- */ - spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); - _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", - _op->flags, _op->state, object->state->short_name, - object->flags); - return; - } - - spin_lock(&cookie->stores_lock); - - fscache_stat(&fscache_n_store_calls); - - /* find a page to store */ - results[0] = NULL; - page = NULL; - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit); - if (n != 1) - goto superseded; - page = results[0]; - _debug("gang %d [%lx]", n, page->index); - - radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_pend2store); - - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - - if (page->index >= op->store_limit) - goto discard_page; - - fscache_stat(&fscache_n_store_pages); - fscache_stat(&fscache_n_cop_write_page); - ret = object->cache->ops->write_page(op, page); - fscache_stat_d(&fscache_n_cop_write_page); - trace_fscache_wrote_page(cookie, page, &op->op, ret); - fscache_end_page_write(object, page); - if (ret < 0) { - fscache_abort_object(object); - fscache_op_complete(&op->op, true); - } else { - fscache_enqueue_operation(&op->op); - } - - _leave(""); - return; - -discard_page: - fscache_stat(&fscache_n_store_pages_over_limit); - trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS); - fscache_end_page_write(object, page); - goto again; - -superseded: - /* this writer is going away and there aren't any more things to - * write */ - _debug("cease"); - spin_unlock(&cookie->stores_lock); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); - _leave(""); -} - -/* - * Clear the pages pending writing for invalidation - */ -void fscache_invalidate_writes(struct fscache_cookie *cookie) -{ - struct page *page; - void *results[16]; - int n, i; - - _enter(""); - - for (;;) { - spin_lock(&cookie->stores_lock); - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG); - if (n == 0) { - spin_unlock(&cookie->stores_lock); - break; - } - - for (i = n - 1; i >= 0; i--) { - page = results[i]; - radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - trace_fscache_page(cookie, page, fscache_page_inval); - } - - spin_unlock(&cookie->stores_lock); - - for (i = n - 1; i >= 0; i--) - put_page(results[i]); - } - - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - - _leave(""); -} - -/* - * request a page be stored in the cache - * - returns: - * -ENOMEM - out of memory, nothing done - * -ENOBUFS - no backing object available in which to cache the page - * 0 - dispatched a write - it'll call end_io_func() when finished - * - * if the cookie still has a backing object at this point, that object can be - * in one of a few states with respect to storage processing: - * - * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is - * set) - * - * (a) no writes yet - * - * (b) writes deferred till post-creation (mark page for writing and - * return immediately) - * - * (2) negative lookup, object created, initial fill being made from netfs - * - * (a) fill point not yet reached this page (mark page for writing and - * return) - * - * (b) fill 
point passed this page (queue op to store this page) - * - * (3) object extant (queue op to store this page) - * - * any other state is invalid - */ -int __fscache_write_page(struct fscache_cookie *cookie, - struct page *page, - loff_t object_size, - gfp_t gfp) -{ - struct fscache_storage *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%x,", cookie, (u32) page->flags); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERT(PageFsCache(page)); - - fscache_stat(&fscache_n_stores); - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!op) - goto nomem; - - fscache_operation_init(cookie, &op->op, fscache_write_op, NULL, - fscache_release_write_op); - op->op.flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_WAITING) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - - ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); - if (ret < 0) - goto nomem_free; - - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one); - - ret = -ENOBUFS; - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) - goto nobufs; - - trace_fscache_page(cookie, page, fscache_page_write); - - /* add the page to the pending-storage radix tree on the backing - * object */ - spin_lock(&object->lock); - - if (object->store_limit_l != object_size) - fscache_set_store_limit(object, object_size); - - spin_lock(&cookie->stores_lock); - - _debug("store limit %llx", (unsigned long long) object->store_limit); - - ret = radix_tree_insert(&cookie->stores, page->index, page); - if (ret < 0) { - if (ret == -EEXIST) - goto already_queued; - _debug("insert failed %d", ret); - goto nobufs_unlock_obj; - } - - trace_fscache_page(cookie, page, fscache_page_radix_insert); - radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_set_pend); - get_page(page); - - /* we only want one writer at a time, but we do need to queue new - * writers after exclusive ops */ - if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) - goto already_pending; - - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - - op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); - op->store_limit = object->store_limit; - - __fscache_use_cookie(cookie); - if (fscache_submit_op(object, &op->op) < 0) - goto submit_failed; - - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_stat(&fscache_n_store_ops); - fscache_stat(&fscache_n_stores_ok); - - /* the work queue now carries its own ref on the object */ - fscache_put_operation(&op->op); - _leave(" = 0"); - return 0; - -already_queued: - fscache_stat(&fscache_n_stores_again); -already_pending: - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_put_operation(&op->op); - fscache_stat(&fscache_n_stores_ok); - _leave(" = 0"); - return 0; - -submit_failed: - spin_lock(&cookie->stores_lock); - radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - spin_unlock(&cookie->stores_lock); - wake_cookie = __fscache_unuse_cookie(cookie); - 
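
__fscache_write_page() below queues every page under the PENDING tag but submits at most one background writer, gated by the object's PENDING_WRITE bit; fscache_write_op() then drains the whole queue and clears the bit on the "superseded" path. A simplified single-threaded model of that gate, with plain variables standing in for the radix tree and the bit ops:

#include <stdbool.h>
#include <stdio.h>

static int queued;
static bool pending_write;		/* FSCACHE_OBJECT_PENDING_WRITE */

static void write_page(int index)
{
	queued++;			/* radix insert + PENDING tag */
	if (pending_write) {
		printf("page %d queued behind existing writer\n", index);
		return;			/* the "already_pending" path */
	}
	pending_write = true;
	printf("page %d queued, writer op submitted\n", index);
}

static void writer_drains(void)
{
	printf("writer stored %d page(s), ceasing\n", queued);
	queued = 0;
	pending_write = false;		/* the "superseded" path */
}

int main(void)
{
	write_page(10);
	write_page(11);
	write_page(12);
	writer_drains();
	return 0;
}
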
put_page(page); - ret = -ENOBUFS; - goto nobufs; - -nobufs_unlock_obj: - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); -nobufs: - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_put_operation(&op->op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_stat(&fscache_n_stores_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; - -nomem_free: - fscache_put_operation(&op->op); -nomem: - fscache_stat(&fscache_n_stores_oom); - _leave(" = -ENOMEM"); - return -ENOMEM; -} -EXPORT_SYMBOL(__fscache_write_page); - -/* - * remove a page from the cache - */ -void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) -{ - struct fscache_object *object; - - _enter(",%p", page); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - fscache_stat(&fscache_n_uncaches); - - /* cache withdrawal may beat us to it */ - if (!PageFsCache(page)) - goto done; - - trace_fscache_page(cookie, page, fscache_page_uncache); - - /* get the object */ - spin_lock(&cookie->lock); - - if (hlist_empty(&cookie->backing_objects)) { - ClearPageFsCache(page); - goto done_unlock; - } - - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - /* there might now be stuff on disk we could read */ - clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - - /* only invoke the cache backend if we managed to mark the page - * uncached here; this deals with synchronisation vs withdrawal */ - if (TestClearPageFsCache(page) && - object->cache->ops->uncache_page) { - /* the cache backend releases the cookie lock */ - fscache_stat(&fscache_n_cop_uncache_page); - object->cache->ops->uncache_page(object, page); - fscache_stat_d(&fscache_n_cop_uncache_page); - goto done; - } - -done_unlock: - spin_unlock(&cookie->lock); -done: - _leave(""); -} -EXPORT_SYMBOL(__fscache_uncache_page); - -/** - * fscache_mark_page_cached - Mark a page as being cached - * @op: The retrieval op pages are being marked for - * @page: The page to be marked - * - * Mark a netfs page as being cached. After this is called, the netfs - * must call fscache_uncache_page() to remove the mark. - */ -void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) -{ - struct fscache_cookie *cookie = op->op.object->cookie; - -#ifdef CONFIG_FSCACHE_STATS - atomic_inc(&fscache_n_marks); -#endif - - trace_fscache_page(cookie, page, fscache_page_cached); - - _debug("- mark %p{%lx}", page, page->index); - if (TestSetPageFsCache(page)) { - static bool once_only; - if (!once_only) { - once_only = true; - pr_warn("Cookie type %s marked page %lx multiple times\n", - cookie->def->name, page->index); - } - } - - if (cookie->def->mark_page_cached) - cookie->def->mark_page_cached(cookie->netfs_data, - op->mapping, page); -} -EXPORT_SYMBOL(fscache_mark_page_cached); - -/** - * fscache_mark_pages_cached - Mark pages as being cached - * @op: The retrieval op pages are being marked for - * @pagevec: The pages to be marked - * - * Mark a bunch of netfs pages as being cached. After this is called, - * the netfs must call fscache_uncache_page() to remove the mark. 
- */ -void fscache_mark_pages_cached(struct fscache_retrieval *op, - struct pagevec *pagevec) -{ - unsigned long loop; - - for (loop = 0; loop < pagevec->nr; loop++) - fscache_mark_page_cached(op, pagevec->pages[loop]); - - pagevec_reinit(pagevec); -} -EXPORT_SYMBOL(fscache_mark_pages_cached); - -/* - * Uncache all the pages in an inode that are marked PG_fscache, assuming them - * to be associated with the given cookie. - */ -void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, - struct inode *inode) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - pgoff_t next; - int i; - - _enter("%p,%p", cookie, inode); - - if (!mapping || mapping->nrpages == 0) { - _leave(" [no pages]"); - return; - } - - pagevec_init(&pvec); - next = 0; - do { - if (!pagevec_lookup(&pvec, mapping, &next)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (PageFsCache(page)) { - __fscache_wait_on_page_write(cookie, page); - __fscache_uncache_page(cookie, page); - } - } - pagevec_release(&pvec); - cond_resched(); - } while (next); - - _leave(""); -} -EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 061df8f61ffc..dc3b0e9c8cce 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache statistics viewing interface * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#define FSCACHE_DEBUG_LEVEL OPERATION +#define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -16,42 +16,32 @@ */ int __init fscache_proc_init(void) { - _enter(""); - if (!proc_mkdir("fs/fscache", NULL)) goto error_dir; + if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + &fscache_caches_seq_ops)) + goto error; + + if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + &fscache_volumes_seq_ops)) + goto error; + if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) - goto error_cookies; + goto error; #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error_stats; + fscache_stats_show)) + goto error; #endif -#ifdef CONFIG_FSCACHE_OBJECT_LIST - if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, - &fscache_objlist_proc_ops)) - goto error_objects; -#endif - - _leave(" = 0"); return 0; -#ifdef CONFIG_FSCACHE_OBJECT_LIST -error_objects: -#endif -#ifdef CONFIG_FSCACHE_STATS - remove_proc_entry("fs/fscache/stats", NULL); -error_stats: -#endif - remove_proc_entry("fs/fscache/cookies", NULL); -error_cookies: +error: remove_proc_entry("fs/fscache", NULL); error_dir: - _leave(" = -ENOMEM"); return -ENOMEM; } @@ -60,12 +50,5 @@ error_dir: */ void fscache_proc_cleanup(void) { -#ifdef CONFIG_FSCACHE_OBJECT_LIST - remove_proc_entry("fs/fscache/objects", NULL); -#endif -#ifdef CONFIG_FSCACHE_STATS - remove_proc_entry("fs/fscache/stats", NULL); -#endif - remove_proc_entry("fs/fscache/cookies", NULL); - remove_proc_entry("fs/fscache", NULL); + remove_proc_subtree("fs/fscache", NULL); } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index a7c3ed89a3e0..fc94e5e79f1c 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache statistics * - * Copyright (C) 2007 
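
The proc.c hunk above shows the clean-up that proc_create_seq() plus remove_proc_subtree() buy: once every file lives under fs/fscache, a single error label that tears down the subtree replaces the old per-entry unwind labels. A generic sketch of that unwind shape; create_entry() and remove_tree() are illustrative stand-ins for the procfs calls:

#include <stdbool.h>
#include <stdio.h>

static bool create_entry(const char *name, bool ok)
{
	printf("%s %s\n", ok ? "created" : "failed", name);
	return ok;
}

static void remove_tree(const char *root)
{
	printf("removed subtree %s\n", root);
}

static int proc_init(void)
{
	if (!create_entry("fs/fscache", true))
		return -1;
	if (!create_entry("fs/fscache/caches", true))
		goto error;
	if (!create_entry("fs/fscache/volumes", false))	/* simulate failure */
		goto error;
	return 0;

error:
	remove_tree("fs/fscache");	/* one label tears down everything */
	return -1;
}

int main(void)
{
	return proc_init() ? 1 : 0;
}

The same shape is why fscache_proc_cleanup() collapses to a single remove_proc_subtree() call.
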
Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#define FSCACHE_DEBUG_LEVEL THREAD -#include <linux/module.h> +#define FSCACHE_DEBUG_LEVEL CACHE #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" @@ -14,122 +13,41 @@ /* * operation counters */ -atomic_t fscache_n_op_pend; -atomic_t fscache_n_op_run; -atomic_t fscache_n_op_enqueue; -atomic_t fscache_n_op_deferred_release; -atomic_t fscache_n_op_initialised; -atomic_t fscache_n_op_release; -atomic_t fscache_n_op_gc; -atomic_t fscache_n_op_cancelled; -atomic_t fscache_n_op_rejected; - -atomic_t fscache_n_attr_changed; -atomic_t fscache_n_attr_changed_ok; -atomic_t fscache_n_attr_changed_nobufs; -atomic_t fscache_n_attr_changed_nomem; -atomic_t fscache_n_attr_changed_calls; - -atomic_t fscache_n_allocs; -atomic_t fscache_n_allocs_ok; -atomic_t fscache_n_allocs_wait; -atomic_t fscache_n_allocs_nobufs; -atomic_t fscache_n_allocs_intr; -atomic_t fscache_n_allocs_object_dead; -atomic_t fscache_n_alloc_ops; -atomic_t fscache_n_alloc_op_waits; - -atomic_t fscache_n_retrievals; -atomic_t fscache_n_retrievals_ok; -atomic_t fscache_n_retrievals_wait; -atomic_t fscache_n_retrievals_nodata; -atomic_t fscache_n_retrievals_nobufs; -atomic_t fscache_n_retrievals_intr; -atomic_t fscache_n_retrievals_nomem; -atomic_t fscache_n_retrievals_object_dead; -atomic_t fscache_n_retrieval_ops; -atomic_t fscache_n_retrieval_op_waits; - -atomic_t fscache_n_stores; -atomic_t fscache_n_stores_ok; -atomic_t fscache_n_stores_again; -atomic_t fscache_n_stores_nobufs; -atomic_t fscache_n_stores_oom; -atomic_t fscache_n_store_ops; -atomic_t fscache_n_store_calls; -atomic_t fscache_n_store_pages; -atomic_t fscache_n_store_radix_deletes; -atomic_t fscache_n_store_pages_over_limit; - -atomic_t fscache_n_store_vmscan_not_storing; -atomic_t fscache_n_store_vmscan_gone; -atomic_t fscache_n_store_vmscan_busy; -atomic_t fscache_n_store_vmscan_cancelled; -atomic_t fscache_n_store_vmscan_wait; - -atomic_t fscache_n_marks; -atomic_t fscache_n_uncaches; +atomic_t fscache_n_volumes; +atomic_t fscache_n_volumes_collision; +atomic_t fscache_n_volumes_nomem; +atomic_t fscache_n_cookies; +atomic_t fscache_n_cookies_lru; +atomic_t fscache_n_cookies_lru_expired; +atomic_t fscache_n_cookies_lru_removed; +atomic_t fscache_n_cookies_lru_dropped; atomic_t fscache_n_acquires; -atomic_t fscache_n_acquires_null; -atomic_t fscache_n_acquires_no_cache; atomic_t fscache_n_acquires_ok; -atomic_t fscache_n_acquires_nobufs; atomic_t fscache_n_acquires_oom; atomic_t fscache_n_invalidates; -atomic_t fscache_n_invalidates_run; atomic_t fscache_n_updates; -atomic_t fscache_n_updates_null; -atomic_t fscache_n_updates_run; +EXPORT_SYMBOL(fscache_n_updates); atomic_t fscache_n_relinquishes; -atomic_t fscache_n_relinquishes_null; -atomic_t fscache_n_relinquishes_waitcrt; atomic_t fscache_n_relinquishes_retire; - -atomic_t fscache_n_cookie_index; -atomic_t fscache_n_cookie_data; -atomic_t fscache_n_cookie_special; - -atomic_t fscache_n_object_alloc; -atomic_t fscache_n_object_no_alloc; -atomic_t fscache_n_object_lookups; -atomic_t fscache_n_object_lookups_negative; -atomic_t fscache_n_object_lookups_positive; -atomic_t fscache_n_object_lookups_timed_out; -atomic_t fscache_n_object_created; -atomic_t fscache_n_object_avail; -atomic_t fscache_n_object_dead; - -atomic_t fscache_n_checkaux_none; -atomic_t fscache_n_checkaux_okay; -atomic_t fscache_n_checkaux_update; -atomic_t 
fscache_n_checkaux_obsolete; - -atomic_t fscache_n_cop_alloc_object; -atomic_t fscache_n_cop_lookup_object; -atomic_t fscache_n_cop_lookup_complete; -atomic_t fscache_n_cop_grab_object; -atomic_t fscache_n_cop_invalidate_object; -atomic_t fscache_n_cop_update_object; -atomic_t fscache_n_cop_drop_object; -atomic_t fscache_n_cop_put_object; -atomic_t fscache_n_cop_sync_cache; -atomic_t fscache_n_cop_attr_changed; -atomic_t fscache_n_cop_read_or_alloc_page; -atomic_t fscache_n_cop_read_or_alloc_pages; -atomic_t fscache_n_cop_allocate_page; -atomic_t fscache_n_cop_allocate_pages; -atomic_t fscache_n_cop_write_page; -atomic_t fscache_n_cop_uncache_page; -atomic_t fscache_n_cop_dissociate_pages; - -atomic_t fscache_n_cache_no_space_reject; -atomic_t fscache_n_cache_stale_objects; -atomic_t fscache_n_cache_retired_objects; -atomic_t fscache_n_cache_culled_objects; +atomic_t fscache_n_relinquishes_dropped; + +atomic_t fscache_n_resizes; +atomic_t fscache_n_resizes_null; + +atomic_t fscache_n_read; +EXPORT_SYMBOL(fscache_n_read); +atomic_t fscache_n_write; +EXPORT_SYMBOL(fscache_n_write); +atomic_t fscache_n_no_write_space; +EXPORT_SYMBOL(fscache_n_no_write_space); +atomic_t fscache_n_no_create_space; +EXPORT_SYMBOL(fscache_n_no_create_space); +atomic_t fscache_n_culled; +EXPORT_SYMBOL(fscache_n_culled); /* * display the general statistics @@ -137,147 +55,48 @@ atomic_t fscache_n_cache_culled_objects; int fscache_stats_show(struct seq_file *m, void *v) { seq_puts(m, "FS-Cache statistics\n"); - - seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", - atomic_read(&fscache_n_cookie_index), - atomic_read(&fscache_n_cookie_data), - atomic_read(&fscache_n_cookie_special)); - - seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", - atomic_read(&fscache_n_object_alloc), - atomic_read(&fscache_n_object_no_alloc), - atomic_read(&fscache_n_object_avail), - atomic_read(&fscache_n_object_dead)); - seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", - atomic_read(&fscache_n_checkaux_none), - atomic_read(&fscache_n_checkaux_okay), - atomic_read(&fscache_n_checkaux_update), - atomic_read(&fscache_n_checkaux_obsolete)); - - seq_printf(m, "Pages : mrk=%u unc=%u\n", - atomic_read(&fscache_n_marks), - atomic_read(&fscache_n_uncaches)); - - seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" - " oom=%u\n", + seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", + atomic_read(&fscache_n_cookies), + atomic_read(&fscache_n_volumes), + atomic_read(&fscache_n_volumes_collision), + atomic_read(&fscache_n_volumes_nomem) + ); + + seq_printf(m, "Acquire: n=%u ok=%u oom=%u\n", atomic_read(&fscache_n_acquires), - atomic_read(&fscache_n_acquires_null), - atomic_read(&fscache_n_acquires_no_cache), atomic_read(&fscache_n_acquires_ok), - atomic_read(&fscache_n_acquires_nobufs), atomic_read(&fscache_n_acquires_oom)); - seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", - atomic_read(&fscache_n_object_lookups), - atomic_read(&fscache_n_object_lookups_negative), - atomic_read(&fscache_n_object_lookups_positive), - atomic_read(&fscache_n_object_created), - atomic_read(&fscache_n_object_lookups_timed_out)); + seq_printf(m, "LRU : n=%u exp=%u rmv=%u drp=%u at=%ld\n", + atomic_read(&fscache_n_cookies_lru), + atomic_read(&fscache_n_cookies_lru_expired), + atomic_read(&fscache_n_cookies_lru_removed), + atomic_read(&fscache_n_cookies_lru_dropped), + timer_pending(&fscache_cookie_lru_timer) ? 
+ fscache_cookie_lru_timer.expires - jiffies : 0); - seq_printf(m, "Invals : n=%u run=%u\n", - atomic_read(&fscache_n_invalidates), - atomic_read(&fscache_n_invalidates_run)); + seq_printf(m, "Invals : n=%u\n", + atomic_read(&fscache_n_invalidates)); - seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + seq_printf(m, "Updates: n=%u rsz=%u rsn=%u\n", atomic_read(&fscache_n_updates), - atomic_read(&fscache_n_updates_null), - atomic_read(&fscache_n_updates_run)); + atomic_read(&fscache_n_resizes), + atomic_read(&fscache_n_resizes_null)); - seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", + seq_printf(m, "Relinqs: n=%u rtr=%u drop=%u\n", atomic_read(&fscache_n_relinquishes), - atomic_read(&fscache_n_relinquishes_null), - atomic_read(&fscache_n_relinquishes_waitcrt), - atomic_read(&fscache_n_relinquishes_retire)); - - seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", - atomic_read(&fscache_n_attr_changed), - atomic_read(&fscache_n_attr_changed_ok), - atomic_read(&fscache_n_attr_changed_nobufs), - atomic_read(&fscache_n_attr_changed_nomem), - atomic_read(&fscache_n_attr_changed_calls)); - - seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", - atomic_read(&fscache_n_allocs), - atomic_read(&fscache_n_allocs_ok), - atomic_read(&fscache_n_allocs_wait), - atomic_read(&fscache_n_allocs_nobufs), - atomic_read(&fscache_n_allocs_intr)); - seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", - atomic_read(&fscache_n_alloc_ops), - atomic_read(&fscache_n_alloc_op_waits), - atomic_read(&fscache_n_allocs_object_dead)); - - seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" - " int=%u oom=%u\n", - atomic_read(&fscache_n_retrievals), - atomic_read(&fscache_n_retrievals_ok), - atomic_read(&fscache_n_retrievals_wait), - atomic_read(&fscache_n_retrievals_nodata), - atomic_read(&fscache_n_retrievals_nobufs), - atomic_read(&fscache_n_retrievals_intr), - atomic_read(&fscache_n_retrievals_nomem)); - seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", - atomic_read(&fscache_n_retrieval_ops), - atomic_read(&fscache_n_retrieval_op_waits), - atomic_read(&fscache_n_retrievals_object_dead)); - - seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", - atomic_read(&fscache_n_stores), - atomic_read(&fscache_n_stores_ok), - atomic_read(&fscache_n_stores_again), - atomic_read(&fscache_n_stores_nobufs), - atomic_read(&fscache_n_stores_oom)); - seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", - atomic_read(&fscache_n_store_ops), - atomic_read(&fscache_n_store_calls), - atomic_read(&fscache_n_store_pages), - atomic_read(&fscache_n_store_radix_deletes), - atomic_read(&fscache_n_store_pages_over_limit)); + atomic_read(&fscache_n_relinquishes_retire), + atomic_read(&fscache_n_relinquishes_dropped)); - seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n", - atomic_read(&fscache_n_store_vmscan_not_storing), - atomic_read(&fscache_n_store_vmscan_gone), - atomic_read(&fscache_n_store_vmscan_busy), - atomic_read(&fscache_n_store_vmscan_cancelled), - atomic_read(&fscache_n_store_vmscan_wait)); + seq_printf(m, "NoSpace: nwr=%u ncr=%u cull=%u\n", + atomic_read(&fscache_n_no_write_space), + atomic_read(&fscache_n_no_create_space), + atomic_read(&fscache_n_culled)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", - atomic_read(&fscache_n_op_pend), - atomic_read(&fscache_n_op_run), - atomic_read(&fscache_n_op_enqueue), - atomic_read(&fscache_n_op_cancelled), - atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", - 
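
The rewritten stats file reads as above: bare atomic counters bumped at each event and rendered in a single pass, with the old per-state operation counters gone. A userspace analogue, using C11 atomics in place of atomic_t and printf() in place of seq_printf(); the counter set is a shortened stand-in:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint n_read, n_write, n_culled;

static void cache_read(void)  { atomic_fetch_add(&n_read, 1); }
static void cache_write(void) { atomic_fetch_add(&n_write, 1); }
static void cache_cull(void)  { atomic_fetch_add(&n_culled, 1); }

static void stats_show(void)
{
	printf("IO      : rd=%u wr=%u\n",
	       atomic_load(&n_read), atomic_load(&n_write));
	printf("NoSpace : cull=%u\n", atomic_load(&n_culled));
}

int main(void)
{
	cache_read();
	cache_read();
	cache_write();
	cache_cull();
	stats_show();
	return 0;
}
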
atomic_read(&fscache_n_op_initialised), - atomic_read(&fscache_n_op_deferred_release), - atomic_read(&fscache_n_op_release), - atomic_read(&fscache_n_op_gc)); + seq_printf(m, "IO : rd=%u wr=%u\n", + atomic_read(&fscache_n_read), + atomic_read(&fscache_n_write)); - seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", - atomic_read(&fscache_n_cop_alloc_object), - atomic_read(&fscache_n_cop_lookup_object), - atomic_read(&fscache_n_cop_lookup_complete), - atomic_read(&fscache_n_cop_grab_object)); - seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n", - atomic_read(&fscache_n_cop_invalidate_object), - atomic_read(&fscache_n_cop_update_object), - atomic_read(&fscache_n_cop_drop_object), - atomic_read(&fscache_n_cop_put_object), - atomic_read(&fscache_n_cop_attr_changed), - atomic_read(&fscache_n_cop_sync_cache)); - seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", - atomic_read(&fscache_n_cop_read_or_alloc_page), - atomic_read(&fscache_n_cop_read_or_alloc_pages), - atomic_read(&fscache_n_cop_allocate_page), - atomic_read(&fscache_n_cop_allocate_pages), - atomic_read(&fscache_n_cop_write_page), - atomic_read(&fscache_n_cop_uncache_page), - atomic_read(&fscache_n_cop_dissociate_pages)); - seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", - atomic_read(&fscache_n_cache_no_space_reject), - atomic_read(&fscache_n_cache_stale_objects), - atomic_read(&fscache_n_cache_retired_objects), - atomic_read(&fscache_n_cache_culled_objects)); netfs_stats_show(m); return 0; } diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c new file mode 100644 index 000000000000..a57c6cbee858 --- /dev/null +++ b/fs/fscache/volume.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Volume-level cache cookie handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/export.h> +#include <linux/slab.h> +#include "internal.h" + +#define fscache_volume_hash_shift 10 +static struct hlist_bl_head fscache_volume_hash[1 << fscache_volume_hash_shift]; +static atomic_t fscache_volume_debug_id; +static LIST_HEAD(fscache_volumes); + +static void fscache_create_volume_work(struct work_struct *work); + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref; + + __refcount_inc(&volume->ref, &ref); + trace_fscache_volume(volume->debug_id, ref + 1, where); + return volume; +} + +static void fscache_see_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref = refcount_read(&volume->ref); + + trace_fscache_volume(volume->debug_id, ref, where); +} + +/* + * Pin the cache behind a volume so that we can access it. + */ +static void __fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + n_accesses = atomic_inc_return(&volume->n_accesses); + smp_mb__after_atomic(); + trace_fscache_access_volume(volume->debug_id, cookie ? 
cookie->debug_id : 0, + refcount_read(&volume->ref), + n_accesses, why); +} + +/** + * fscache_begin_volume_access - Pin a cache so a volume can be accessed + * @volume: The volume cookie + * @cookie: A datafile cookie for a tracing reference (or NULL) + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing a volume and returns true if successful. This works as follows: + * + * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), + * then we return false to indicate access was not permitted. + * + * (2) If the cache tests as live, then we increment the volume's n_accesses + * count and then recheck the cache liveness, ending the access if it + * ceased to be live. + * + * (3) When we end the access, we decrement the volume's n_accesses and wake + * up the any waiters if it reaches 0. + * + * (4) Whilst the cache is caching, the volume's n_accesses is kept + * artificially incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline, the state is changed to prevent new + * accesses, the volume's n_accesses is decremented and we wait for it to + * become 0. + * + * The datafile @cookie and the @why indicator are merely provided for tracing + * purposes. + */ +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + if (!fscache_cache_is_live(volume->cache)) + return false; + __fscache_begin_volume_access(volume, cookie, why); + if (!fscache_cache_is_live(volume->cache)) { + fscache_end_volume_access(volume, cookie, fscache_access_unlive); + return false; + } + return true; +} + +/** + * fscache_end_volume_access - Unpin a cache at the end of an access. + * @volume: The volume cookie + * @cookie: A datafile cookie for a tracing reference (or NULL) + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache volume after we've accessed it. The datafile @cookie and the + * @why indicator are merely provided for tracing purposes. + */ +void fscache_end_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&volume->n_accesses); + trace_fscache_access_volume(volume->debug_id, cookie ? 
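
The kernel-doc above describes an increment-then-recheck handshake: the access count is bumped, liveness is re-tested so that a concurrent withdrawal either observes the count or the accessor observes the state change, and the pin is undone if the cache died in between. A minimal userspace model with C11 atomics standing in for the kernel primitives (the barriers implied by the smp_mb helpers are taken as given by seq_cst atomics here):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool cache_live = true;
static atomic_int n_accesses;

static void end_access(void)
{
	if (atomic_fetch_sub(&n_accesses, 1) == 1)
		printf("last access gone, waking withdrawer\n");
}

static bool begin_access(void)
{
	if (!atomic_load(&cache_live))
		return false;
	atomic_fetch_add(&n_accesses, 1);	/* pin */
	if (!atomic_load(&cache_live)) {	/* recheck after the pin */
		end_access();
		return false;
	}
	return true;
}

int main(void)
{
	if (begin_access()) {
		printf("accessing volume with %d pin(s)\n",
		       atomic_load(&n_accesses));
		end_access();
	}
	atomic_store(&cache_live, false);
	printf("after withdrawal: %s\n",
	       begin_access() ? "pinned" : "refused");
	return 0;
}
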
cookie->debug_id : 0, + refcount_read(&volume->ref), + n_accesses, why); + if (n_accesses == 0) + wake_up_var(&volume->n_accesses); +} +EXPORT_SYMBOL(fscache_end_volume_access); + +static bool fscache_volume_same(const struct fscache_volume *a, + const struct fscache_volume *b) +{ + size_t klen; + + if (a->key_hash != b->key_hash || + a->cache != b->cache || + a->key[0] != b->key[0]) + return false; + + klen = round_up(a->key[0] + 1, sizeof(__le32)); + return memcmp(a->key, b->key, klen) == 0; +} + +static bool fscache_is_acquire_pending(struct fscache_volume *volume) +{ + return test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &volume->flags); +} + +static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, + unsigned int collidee_debug_id) +{ + wait_var_event_timeout(&candidate->flags, + fscache_is_acquire_pending(candidate), 20 * HZ); + if (!fscache_is_acquire_pending(candidate)) { + pr_notice("Potential volume collision new=%08x old=%08x", + candidate->debug_id, collidee_debug_id); + fscache_stat(&fscache_n_volumes_collision); + wait_var_event(&candidate->flags, fscache_is_acquire_pending(candidate)); + } +} + +/* + * Attempt to insert the new volume into the hash. If there's a collision, we + * wait for the old volume to complete if it's being relinquished and an error + * otherwise. + */ +static bool fscache_hash_volume(struct fscache_volume *candidate) +{ + struct fscache_volume *cursor; + struct hlist_bl_head *h; + struct hlist_bl_node *p; + unsigned int bucket, collidee_debug_id = 0; + + bucket = candidate->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1); + h = &fscache_volume_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_volume_same(candidate, cursor)) { + if (!test_bit(FSCACHE_VOLUME_RELINQUISHED, &cursor->flags)) + goto collision; + fscache_see_volume(cursor, fscache_volume_get_hash_collision); + set_bit(FSCACHE_VOLUME_COLLIDED_WITH, &cursor->flags); + set_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags); + collidee_debug_id = cursor->debug_id; + break; + } + } + + hlist_bl_add_head(&candidate->hash_link, h); + hlist_bl_unlock(h); + + if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags)) + fscache_wait_on_volume_collision(candidate, collidee_debug_id); + return true; + +collision: + fscache_see_volume(cursor, fscache_volume_collision); + hlist_bl_unlock(h); + return false; +} + +/* + * Allocate and initialise a volume representation cookie. + */ +static struct fscache_volume *fscache_alloc_volume(const char *volume_key, + const char *cache_name, + const void *coherency_data, + size_t coherency_len) +{ + struct fscache_volume *volume; + struct fscache_cache *cache; + size_t klen, hlen; + char *key; + + if (!coherency_data) + coherency_len = 0; + + cache = fscache_lookup_cache(cache_name, false); + if (IS_ERR(cache)) + return NULL; + + volume = kzalloc(struct_size(volume, coherency, coherency_len), + GFP_KERNEL); + if (!volume) + goto err_cache; + + volume->cache = cache; + volume->coherency_len = coherency_len; + if (coherency_data) + memcpy(volume->coherency, coherency_data, coherency_len); + INIT_LIST_HEAD(&volume->proc_link); + INIT_WORK(&volume->work, fscache_create_volume_work); + refcount_set(&volume->ref, 1); + spin_lock_init(&volume->lock); + + /* Stick the length on the front of the key and pad it out to make + * hashing easier. 
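
The key built just below has a fixed layout: a length byte on the front, the volume key bytes, then zero padding out to a 4-byte multiple, presumably so the hash can walk the buffer in whole 32-bit words; the result, masked by the table size, selects one of the 1024 buckets declared at the top of the file. A sketch of that layout, with an arbitrary stand-in for fscache_hash():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HASH_SHIFT 10
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static uint32_t hash_buf(const uint8_t *p, size_t n)
{
	uint32_t h = 0;

	while (n--)
		h = h * 31 + *p++;	/* placeholder, not fscache_hash() */
	return h;
}

int main(void)
{
	const char *volume_key = "example,volume,key";	/* arbitrary */
	size_t klen = strlen(volume_key);
	size_t hlen = ROUND_UP(1 + klen + 1, sizeof(uint32_t));
	uint8_t *key = calloc(1, hlen);

	if (!key)
		return 1;
	key[0] = (uint8_t)klen;			/* length stuck on the front */
	memcpy(key + 1, volume_key, klen);	/* padding stays zeroed */

	printf("klen=%zu padded=%zu bucket=%u\n", klen, hlen,
	       (unsigned)(hash_buf(key, hlen) & ((1u << HASH_SHIFT) - 1)));
	free(key);
	return 0;
}
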
+ */ + klen = strlen(volume_key); + hlen = round_up(1 + klen + 1, sizeof(__le32)); + key = kzalloc(hlen, GFP_KERNEL); + if (!key) + goto err_vol; + key[0] = klen; + memcpy(key + 1, volume_key, klen); + + volume->key = key; + volume->key_hash = fscache_hash(0, key, hlen); + + volume->debug_id = atomic_inc_return(&fscache_volume_debug_id); + down_write(&fscache_addremove_sem); + atomic_inc(&cache->n_volumes); + list_add_tail(&volume->proc_link, &fscache_volumes); + fscache_see_volume(volume, fscache_volume_new_acquire); + fscache_stat(&fscache_n_volumes); + up_write(&fscache_addremove_sem); + _leave(" = v=%x", volume->debug_id); + return volume; + +err_vol: + kfree(volume); +err_cache: + fscache_put_cache(cache, fscache_cache_put_alloc_volume); + fscache_stat(&fscache_n_volumes_nomem); + return NULL; +} + +/* + * Create a volume's representation on disk. Have a volume ref and a cache + * access we have to release. + */ +static void fscache_create_volume_work(struct work_struct *work) +{ + const struct fscache_cache_ops *ops; + struct fscache_volume *volume = + container_of(work, struct fscache_volume, work); + + fscache_see_volume(volume, fscache_volume_see_create_work); + + ops = volume->cache->ops; + if (ops->acquire_volume) + ops->acquire_volume(volume); + fscache_end_cache_access(volume->cache, + fscache_access_acquire_volume_end); + + clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); + wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + fscache_put_volume(volume, fscache_volume_put_create_work); +} + +/* + * Dispatch a worker thread to create a volume's representation on disk. + */ +void fscache_create_volume(struct fscache_volume *volume, bool wait) +{ + if (test_and_set_bit(FSCACHE_VOLUME_CREATING, &volume->flags)) + goto maybe_wait; + if (volume->cache_priv) + goto no_wait; /* We raced */ + if (!fscache_begin_cache_access(volume->cache, + fscache_access_acquire_volume)) + goto no_wait; + + fscache_get_volume(volume, fscache_volume_get_create_work); + if (!schedule_work(&volume->work)) + fscache_put_volume(volume, fscache_volume_put_create_work); + +maybe_wait: + if (wait) { + fscache_see_volume(volume, fscache_volume_wait_create_work); + wait_on_bit(&volume->flags, FSCACHE_VOLUME_CREATING, + TASK_UNINTERRUPTIBLE); + } + return; +no_wait: + clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); + wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); +} + +/* + * Acquire a volume representation cookie and link it to a (proposed) cache. 
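
The hashing helpers above reduce to a small decision rule: a live duplicate key is a hard collision, while a relinquished duplicate admits the newcomer but marks it pending until the old entry drops out of the hash. A compilable, single-threaded C model of that rule (struct and enum names invented for the sketch; the kernel version additionally holds the bucket lock and records the collidee's debug ID):

#include <string.h>

enum hash_result { HASH_INSERTED, HASH_WAIT_FOR_OLD, HASH_COLLISION };

struct entry {
	struct entry *next;
	int relinquished;	/* old owner is on its way out */
	int acquire_pending;	/* newcomer must wait for the old entry */
	char key[32];
};

/*
 * A live duplicate is a hard collision; a relinquished duplicate lets the
 * newcomer in, flagged as pending until the old entry is unhashed.
 */
static enum hash_result bucket_insert(struct entry **bucket, struct entry *cand)
{
	struct entry *cur;

	for (cur = *bucket; cur; cur = cur->next) {
		if (strcmp(cur->key, cand->key) != 0)
			continue;
		if (!cur->relinquished)
			return HASH_COLLISION;	/* caller reports -EBUSY */
		cand->acquire_pending = 1;
		break;
	}
	cand->next = *bucket;			/* add at the bucket head */
	*bucket = cand;
	return cand->acquire_pending ? HASH_WAIT_FOR_OLD : HASH_INSERTED;
}
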
+ */ +struct fscache_volume *__fscache_acquire_volume(const char *volume_key, + const char *cache_name, + const void *coherency_data, + size_t coherency_len) +{ + struct fscache_volume *volume; + + volume = fscache_alloc_volume(volume_key, cache_name, + coherency_data, coherency_len); + if (!volume) + return ERR_PTR(-ENOMEM); + + if (!fscache_hash_volume(volume)) { + fscache_put_volume(volume, fscache_volume_put_hash_collision); + return ERR_PTR(-EBUSY); + } + + fscache_create_volume(volume, false); + return volume; +} +EXPORT_SYMBOL(__fscache_acquire_volume); + +static void fscache_wake_pending_volume(struct fscache_volume *volume, + struct hlist_bl_head *h) +{ + struct fscache_volume *cursor; + struct hlist_bl_node *p; + + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_volume_same(cursor, volume)) { + fscache_see_volume(cursor, fscache_volume_see_hash_wake); + clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); + wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + return; + } + } +} + +/* + * Remove a volume cookie from the hash table. + */ +static void fscache_unhash_volume(struct fscache_volume *volume) +{ + struct hlist_bl_head *h; + unsigned int bucket; + + bucket = volume->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1); + h = &fscache_volume_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_del(&volume->hash_link); + if (test_bit(FSCACHE_VOLUME_COLLIDED_WITH, &volume->flags)) + fscache_wake_pending_volume(volume, h); + hlist_bl_unlock(h); +} + +/* + * Drop a cache's volume attachments. + */ +static void fscache_free_volume(struct fscache_volume *volume) +{ + struct fscache_cache *cache = volume->cache; + + if (volume->cache_priv) { + __fscache_begin_volume_access(volume, NULL, + fscache_access_relinquish_volume); + if (volume->cache_priv) + cache->ops->free_volume(volume); + fscache_end_volume_access(volume, NULL, + fscache_access_relinquish_volume_end); + } + + down_write(&fscache_addremove_sem); + list_del_init(&volume->proc_link); + atomic_dec(&volume->cache->n_volumes); + up_write(&fscache_addremove_sem); + + if (!hlist_bl_unhashed(&volume->hash_link)) + fscache_unhash_volume(volume); + + trace_fscache_volume(volume->debug_id, 0, fscache_volume_free); + kfree(volume->key); + kfree(volume); + fscache_stat_d(&fscache_n_volumes); + fscache_put_cache(cache, fscache_cache_put_volume); +} + +/* + * Drop a reference to a volume cookie. + */ +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + if (volume) { + unsigned int debug_id = volume->debug_id; + bool zero; + int ref; + + zero = __refcount_dec_and_test(&volume->ref, &ref); + trace_fscache_volume(debug_id, ref - 1, where); + if (zero) + fscache_free_volume(volume); + } +} + +/* + * Relinquish a volume representation cookie. + */ +void __fscache_relinquish_volume(struct fscache_volume *volume, + const void *coherency_data, + bool invalidate) +{ + if (WARN_ON(test_and_set_bit(FSCACHE_VOLUME_RELINQUISHED, &volume->flags))) + return; + + if (invalidate) { + set_bit(FSCACHE_VOLUME_INVALIDATE, &volume->flags); + } else if (coherency_data) { + memcpy(volume->coherency, coherency_data, volume->coherency_len); + } + + fscache_put_volume(volume, fscache_volume_put_relinquish); +} +EXPORT_SYMBOL(__fscache_relinquish_volume); + +/** + * fscache_withdraw_volume - Withdraw a volume from being cached + * @volume: Volume cookie + * + * Withdraw a cache volume from service, waiting for all accesses to complete + * before returning. 
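
Before the implementation below, it may help to see the whole counting protocol from steps (1)-(5) in one place. This is a userspace analogue, not kernel code: C11 atomics and a condition variable stand in for wake_up_var()/wait_var_event(), and the initial count of 1 is the artificial pin that suppresses wakeups while the cache is live.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int n_accesses = 1;	/* 1 = artificial "cache is live" pin */
static atomic_bool cache_live = true;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reached_zero = PTHREAD_COND_INITIALIZER;

static void end_access(void)
{
	/* Last one out signals anyone waiting in withdraw(). */
	if (atomic_fetch_sub(&n_accesses, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&reached_zero);	/* wake_up_var() analogue */
		pthread_mutex_unlock(&lock);
	}
}

static bool begin_access(void)
{
	if (!atomic_load(&cache_live))
		return false;			/* step (1): not live */
	atomic_fetch_add(&n_accesses, 1);	/* step (2): pin */
	if (!atomic_load(&cache_live)) {	/* step (2): recheck liveness */
		end_access();
		return false;
	}
	return true;
}

static void withdraw(void)
{
	atomic_store(&cache_live, false);	/* step (5): bar new accesses */
	end_access();				/* drop the artificial pin */
	pthread_mutex_lock(&lock);
	while (atomic_load(&n_accesses) != 0)	/* wait_var_event() analogue */
		pthread_cond_wait(&reached_zero, &lock);
	pthread_mutex_unlock(&lock);
}
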
+ */ +void fscache_withdraw_volume(struct fscache_volume *volume) +{ + int n_accesses; + + _debug("withdraw V=%x", volume->debug_id); + + /* Allow wakeups on dec-to-0 */ + n_accesses = atomic_dec_return(&volume->n_accesses); + trace_fscache_access_volume(volume->debug_id, 0, + refcount_read(&volume->ref), + n_accesses, fscache_access_cache_unpin); + + wait_var_event(&volume->n_accesses, + atomic_read(&volume->n_accesses) == 0); +} +EXPORT_SYMBOL(fscache_withdraw_volume); + +#ifdef CONFIG_PROC_FS +/* + * Generate a list of volumes in /proc/fs/fscache/volumes + */ +static int fscache_volumes_seq_show(struct seq_file *m, void *v) +{ + struct fscache_volume *volume; + + if (v == &fscache_volumes) { + seq_puts(m, + "VOLUME REF nCOOK ACC FL CACHE KEY\n" + "======== ===== ===== === == =============== ================\n"); + return 0; + } + + volume = list_entry(v, struct fscache_volume, proc_link); + seq_printf(m, + "%08x %5d %5d %3d %02lx %-15.15s %s\n", + volume->debug_id, + refcount_read(&volume->ref), + atomic_read(&volume->n_cookies), + atomic_read(&volume->n_accesses), + volume->flags, + volume->cache->name ?: "-", + volume->key + 1); + return 0; +} + +static void *fscache_volumes_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_addremove_sem) +{ + down_read(&fscache_addremove_sem); + return seq_list_start_head(&fscache_volumes, *_pos); +} + +static void *fscache_volumes_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_volumes, _pos); +} + +static void fscache_volumes_seq_stop(struct seq_file *m, void *v) + __releases(&fscache_addremove_sem) +{ + up_read(&fscache_addremove_sem); +} + +const struct seq_operations fscache_volumes_seq_ops = { + .start = fscache_volumes_seq_start, + .next = fscache_volumes_seq_next, + .stop = fscache_volumes_seq_stop, + .show = fscache_volumes_seq_show, +}; +#endif /* CONFIG_PROC_FS */ diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 40ce9a1c12e5..038ed0b9aaa5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -45,7 +45,7 @@ config FUSE_DAX select INTERVAL_TREE depends on VIRTIO_FS depends on FS_DAX - depends on DAX_DRIVER + depends on DAX help This allows bypassing guest page cache and allows mapping host page cache directly in guest address space. diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 713818d74de6..182b24a14804 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1279,11 +1279,14 @@ out_err: return ret; } -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) { struct fuse_conn_dax *fcd; int err; + fc->dax_mode = dax_mode; + if (!dax_dev) return 0; @@ -1327,17 +1330,46 @@ static const struct address_space_operations fuse_dax_file_aops = { .invalidatepage = noop_invalidatepage, }; -void fuse_dax_inode_init(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + + if (dax_mode == FUSE_DAX_NEVER) + return false; + /* + * fc->dax may be NULL in 'inode' mode when the filesystem device doesn't + * support DAX, in which case it silently falls back to 'never' mode.
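
The helper being defined here boils down to a four-row decision table: never wins outright, a missing DAX window forces a quiet fallback, always wins next, and the inode modes defer to the per-file attribute. A standalone C rendering of that logic (enum, flag, and struct names are stand-ins invented for the sketch):

#include <stdbool.h>

enum dax_mode { DAX_INODE_DEFAULT, DAX_ALWAYS, DAX_NEVER, DAX_INODE_USER };

#define ATTR_DAX 0x1		/* stand-in for FUSE_ATTR_DAX */

struct conn {
	enum dax_mode mode;	/* mount-time dax= choice */
	bool has_dax_dev;	/* fc->dax != NULL in the real code */
	bool inode_dax;		/* server advertised per-inode DAX support */
};

static bool should_enable_dax(const struct conn *fc, unsigned int attr_flags)
{
	if (fc->mode == DAX_NEVER)
		return false;
	if (!fc->has_dax_dev)	/* no DAX window: silent 'never' fallback */
		return false;
	if (fc->mode == DAX_ALWAYS)
		return true;
	/* inode modes: honour the per-file attribute if the server supports it */
	return fc->inode_dax && (attr_flags & ATTR_DAX);
}
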
*/ if (!fc->dax) + return false; + + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return fc->inode_dax && (flags & FUSE_ATTR_DAX); +} + +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) +{ + if (!fuse_should_enable_dax(inode, flags)) return; inode->i_flags |= S_DAX; inode->i_data.a_ops = &fuse_dax_file_aops; } +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 79f7eda49e06..cd54a529460d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -847,17 +847,17 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) replace_page_cache_page(oldpage, newpage); + get_page(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(newpage); + /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf); - get_page(newpage); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); - err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags)) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0654bfedcbb0..656e921f3506 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -17,6 +17,9 @@ #include <linux/xattr.h> #include <linux/iversion.h> #include <linux/posix_acl.h> +#include <linux/security.h> +#include <linux/types.h> +#include <linux/kernel.h> static void fuse_advise_use_readdirplus(struct inode *dir) { @@ -456,6 +459,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return ERR_PTR(err); } +static int get_security_context(struct dentry *entry, umode_t mode, + void **security_ctx, u32 *security_ctxlen) +{ + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; + void *ctx = NULL, *ptr; + u32 ctxlen, total_len = sizeof(*header); + int err, nr_ctx = 0; + const char *name; + size_t namelen; + + err = security_dentry_init_security(entry, mode, &entry->d_name, + &name, &ctx, &ctxlen); + if (err) { + if (err != -EOPNOTSUPP) + goto out_err; + /* No LSM supports this security hook.
Ignore error */ + ctxlen = 0; + ctx = NULL; + } + + if (ctxlen) { + nr_ctx = 1; + namelen = strlen(name) + 1; + err = -EIO; + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + goto out_err; + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + } + + err = -ENOMEM; + header = ptr = kzalloc(total_len, GFP_KERNEL); + if (!ptr) + goto out_err; + + header->nr_secctx = nr_ctx; + header->size = total_len; + ptr += sizeof(*header); + if (nr_ctx) { + fctx = ptr; + fctx->size = ctxlen; + ptr += sizeof(*fctx); + + strcpy(ptr, name); + ptr += namelen; + + memcpy(ptr, ctx, ctxlen); + } + *security_ctxlen = total_len; + *security_ctx = header; + err = 0; +out_err: + kfree(ctx); + return err; +} + /* * Atomic create+open operation * @@ -476,6 +535,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + void *security_ctx = NULL; + u32 security_ctxlen; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -517,7 +578,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; + + if (fm->fc->init_security) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + args.in_numargs = 3; + args.in_args[2].size = security_ctxlen; + args.in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, &args); + kfree(security_ctx); if (err) goto out_free_ff; @@ -620,6 +694,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; + void *security_ctx = NULL; + u32 security_ctxlen; if (fuse_is_bad(dir)) return -EIO; @@ -633,7 +709,22 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; + + if (fm->fc->init_security && args->opcode != FUSE_LINK) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + BUG_ON(args->in_numargs != 2); + + args->in_numargs = 3; + args->in_args[2].size = security_ctxlen; + args->in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, args); + kfree(security_ctx); if (err) goto out_put_forget_req; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d6c5f6361f7..829094451774 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2910,7 +2910,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); @@ -3169,7 +3169,7 @@ static const struct address_space_operations fuse_file_aops = { .write_end = fuse_write_end, }; -void fuse_init_file_inode(struct inode *inode) +void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -3183,5 +3183,5 @@ void fuse_init_file_inode(struct inode *inode) fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_inode_init(inode); + fuse_dax_inode_init(inode, flags); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 198637b41e19..e8e59fbdefeb 100644 --- a/fs/fuse/fuse_i.h +++ 
b/fs/fuse/fuse_i.h @@ -480,6 +480,18 @@ struct fuse_dev { struct list_head entry; }; +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + struct fuse_fs_context { int fd; struct file *file; @@ -497,7 +509,7 @@ struct fuse_fs_context { bool no_control:1; bool no_force_umount:1; bool legacy_opts_show:1; - bool dax:1; + enum fuse_dax_mode dax_mode; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -765,6 +777,12 @@ struct fuse_conn { /* Propagate syncfs() to server */ unsigned int sync_fs:1; + /* Initialize security xattrs when creating a new inode */ + unsigned int init_security:1; + + /* Does the filesystem support per inode DAX? */ + unsigned int inode_dax:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -802,6 +820,9 @@ struct fuse_conn { struct list_head devices; #ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + /* Dax specific conn data, non-NULL if DAX is enabled */ struct fuse_conn_dax *dax; #endif @@ -1007,7 +1028,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, /** * Initialize file operations on a regular file */ -void fuse_init_file_inode(struct inode *inode); +void fuse_init_file_inode(struct inode *inode, unsigned int flags); /** * Initialize inode operations on regular files and special files @@ -1269,11 +1290,13 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); -void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8b89e3ba7df3..ee846ce371d8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -301,6 +301,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) @@ -313,7 +316,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) @@ -767,8 +770,12 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",blksize=%lu", sb->s_blocksize); } #ifdef CONFIG_FUSE_DAX - if (fc->dax) - seq_puts(m, 
",dax"); + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); #endif return 0; @@ -1109,73 +1116,80 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { + u64 flags = arg->flags | (u64) arg->flags2 << 32; + ra_pages = arg->max_readahead / PAGE_SIZE; - if (arg->flags & FUSE_ASYNC_READ) + if (flags & FUSE_ASYNC_READ) fc->async_read = 1; - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { - if (!(arg->flags & FUSE_FLOCK_LOCKS)) + if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) + if (flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ - if (arg->flags & FUSE_EXPORT_SUPPORT) + if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } - if (arg->flags & FUSE_BIG_WRITES) + if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; - if (arg->flags & FUSE_DONT_MASK) + if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; - if (arg->flags & FUSE_AUTO_INVAL_DATA) + if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA) + else if (flags & FUSE_EXPLICIT_INVAL_DATA) fc->explicit_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) { + if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) + if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } - if (arg->flags & FUSE_ASYNC_DIO) + if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; - if (arg->flags & FUSE_WRITEBACK_CACHE) + if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; - if (arg->flags & FUSE_PARALLEL_DIROPS) + if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; - if (arg->flags & FUSE_HANDLE_KILLPRIV) + if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fm->sb->s_time_gran = arg->time_gran; - if ((arg->flags & FUSE_POSIX_ACL)) { + if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; fm->sb->s_xattr = fuse_acl_xattr_handlers; } - if (arg->flags & FUSE_CACHE_SYMLINKS) + if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; - if (arg->flags & FUSE_ABORT_ERROR) + if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } - if (IS_ENABLED(CONFIG_FUSE_DAX) && - arg->flags & FUSE_MAP_ALIGNMENT && - !fuse_dax_check_alignment(fc, arg->map_alignment)) { - ok = false; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; } - if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } - if (arg->flags & FUSE_SETXATTR_EXT) + if (flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1203,13 +1217,14 @@ static void process_init_reply(struct fuse_mount 
*fm, struct fuse_args *args, void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; + u64 flags; ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - ia->in.flags |= + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -1219,13 +1234,19 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) - ia->in.flags |= FUSE_MAP_ALIGNMENT; + flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; #endif if (fm->fc->auto_submounts) - ia->in.flags |= FUSE_SUBMOUNTS; + flags |= FUSE_SUBMOUNTS; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; @@ -1514,7 +1535,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; if (IS_ENABLED(CONFIG_FUSE_DAX)) { - err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); if (err) goto err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4cfa4bc1f579..9d737904d07c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -88,12 +88,21 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + enum { OPT_DAX, + OPT_DAX_ENUM, }; static const struct fs_parameter_spec virtio_fs_parameters[] = { fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), {} }; @@ -110,7 +119,10 @@ static int virtio_fs_parse_param(struct fs_context *fsc, switch (opt) { case OPT_DAX: - ctx->dax = 1; + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; break; default: return -EINVAL; @@ -753,20 +765,6 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; } -static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, - pgoff_t pgoff, void *addr, - size_t bytes, struct iov_iter *i) -{ - return copy_from_iter(addr, bytes, i); -} - -static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, - pgoff_t pgoff, void *addr, - size_t bytes, struct iov_iter *i) -{ - return copy_to_iter(addr, bytes, i); -} - static int virtio_fs_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { @@ -783,8 +781,6 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, static const struct dax_operations virtio_fs_dax_ops = { .direct_access = virtio_fs_direct_access, - .copy_from_iter = virtio_fs_copy_from_iter, - .copy_to_iter = virtio_fs_copy_to_iter, .zero_page_range = virtio_fs_zero_page_range, }; @@ -850,7 +846,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); if (IS_ERR(fs->dax_dev)) return PTR_ERR(fs->dax_dev); @@ -895,7 +891,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) return 0; out_vqs: - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); kfree(fs->vqs); @@ -927,7 +923,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); vdev->priv = NULL; @@ -1326,8 +1322,8 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; - if (ctx->dax) { - if (!fs->dax_dev) { + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { err = -EINVAL; pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7235d539e969..d67108489148 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -940,7 +940,7 @@ do_alloc: else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else - iomap->length = size - pos; + iomap->length = size - iomap->offset; } else if (flags & IOMAP_WRITE) { u64 alloc_size; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index adafaaf7d24d..3e718cfc19a7 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -773,8 +773,8 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, size_t *prev_count, size_t *window_size) { - char __user *p = i->iov[0].iov_base + i->iov_offset; size_t count = iov_iter_count(i); + char __user *p; int pages = 1; if (likely(!count)) @@ -787,14 +787,14 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (*prev_count != count || !*window_size) { int pages, nr_dirtied; - pages = min_t(int, BIO_MAX_VECS, - DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE)); + pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE)); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 1); pages = min(pages, nr_dirtied); } *prev_count = count; + p = i->iov[0].iov_base + i->iov_offset; *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); return true; } @@ -1013,6 +1013,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_sbd *sdp = 
GFS2_SB(inode); struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; + size_t orig_count = iov_iter_count(from); size_t read = 0; ssize_t ret; @@ -1057,6 +1058,7 @@ retry_under_glock: if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); + from->count = orig_count - read; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -1064,6 +1066,7 @@ retry_under_glock: leftover = fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { + from->count = min(from->count, window_size - leftover); if (!gfs2_holder_queued(gh)) { if (read) goto out_uninit; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 19f38aee1b61..b7ab8430333c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -411,14 +411,14 @@ static void do_error(struct gfs2_glock *gl, const int ret) static void demote_incompat_holders(struct gfs2_glock *gl, struct gfs2_holder *new_gh) { - struct gfs2_holder *gh; + struct gfs2_holder *gh, *tmp; /* * Demote incompatible holders before we make ourselves eligible. * (This holder may or may not allow auto-demoting, but we don't want * to demote the new holder before it's even granted.) */ - list_for_each_entry(gh, &gl->gl_holders, gh_list) { + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { /* * Since holders are at the front of the list, we stop when we * find the first non-holder. @@ -477,7 +477,7 @@ find_first_strong_holder(struct gfs2_glock *gl) /* * gfs2_instantiate - Call the glops instantiate function - * @gl: The glock + * @gh: The glock holder * * Returns: 0 if instantiate was successful, 2 if type specific operation is * underway, or error. @@ -496,7 +496,7 @@ again: * Since we unlock the lockref lock, we set a flag to indicate * instantiate is in progress. */ - if (test_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { + if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, TASK_UNINTERRUPTIBLE); /* @@ -509,14 +509,10 @@ again: goto again; } - set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - ret = glops->go_instantiate(gh); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); - clear_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - smp_mb__after_atomic(); - wake_up_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG); + clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); return ret; } @@ -1249,7 +1245,7 @@ out: } /** - * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * __gfs2_holder_init - initialize a struct gfs2_holder in the default way * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags @@ -1861,7 +1857,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { - struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, }; unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; @@ -1894,8 +1889,13 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) * keep the glock until the last strong holder is done with it. */ if (!find_first_strong_holder(gl)) { - if (state == LM_ST_UNLOCKED) - mock_gh.gh_state = LM_ST_EXCLUSIVE; + struct gfs2_holder mock_gh = { + .gh_gl = gl, + .gh_state = (state == LM_ST_UNLOCKED) ? 
+ LM_ST_EXCLUSIVE : state, + .gh_iflags = BIT(HIF_HOLDER) + }; + demote_incompat_holders(gl, &mock_gh); } handle_callback(gl, state, delay, true); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 650ad77c4d0b..392800f082a6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -228,7 +228,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); truncate_inode_pages_range(mapping, start, end); - set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); } static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, @@ -764,6 +763,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_dump = inode_go_dump, .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, .go_subclass = 1, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6424b903e885..89905f4f29bb 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -40,37 +40,6 @@ static const struct inode_operations gfs2_file_iops; static const struct inode_operations gfs2_dir_iops; static const struct inode_operations gfs2_symlink_iops; -static int iget_test(struct inode *inode, void *opaque) -{ - u64 no_addr = *(u64 *)opaque; - - return GFS2_I(inode)->i_no_addr == no_addr; -} - -static int iget_set(struct inode *inode, void *opaque) -{ - u64 no_addr = *(u64 *)opaque; - - GFS2_I(inode)->i_no_addr = no_addr; - inode->i_ino = no_addr; - return 0; -} - -static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) -{ - struct inode *inode; - -repeat: - inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr); - if (!inode) - return inode; - if (is_bad_inode(inode)) { - iput(inode); - goto repeat; - } - return inode; -} - /** * gfs2_set_iop - Sets inode operations * @inode: The inode with correct i_mode filled in @@ -104,6 +73,22 @@ static void gfs2_set_iop(struct inode *inode) } } +static int iget_test(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + return GFS2_I(inode)->i_no_addr == no_addr; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + GFS2_I(inode)->i_no_addr = no_addr; + inode->i_ino = no_addr; + return 0; +} + /** * gfs2_inode_lookup - Lookup an inode * @sb: The super block @@ -132,12 +117,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl = NULL; struct gfs2_holder i_gh; int error; gfs2_holder_mark_uninitialized(&i_gh); - inode = gfs2_iget(sb, no_addr); + inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr); if (!inode) return ERR_PTR(-ENOMEM); @@ -145,22 +129,16 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_glock *io_gl; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) goto fail; - flush_delayed_work(&ip->i_gl->gl_work); - - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail; - if (blktype != GFS2_BLKST_UNLINKED) - gfs2_cancel_delete_work(io_gl); if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { /* * The GL_SKIP flag indicates to skip reading the inode - * block. We read the inode with gfs2_inode_refresh + * block. We read the inode when instantiating it * after possibly checking the block type. 
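
The gfs2_instantiate() rework earlier in this series (test_and_set_bit() paired with clear_and_wake_up_bit()) is an instance of a common once-only-initialisation pattern: one contender claims an IN_PROG bit, the rest sleep until it clears, then re-check whether the work is still needed. A userspace approximation with C11 atomics, using a yield loop where the kernel would use wait_on_bit() (names and the trivial work function are invented for the sketch):

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_bool in_prog;		/* GLF_INSTANTIATE_IN_PROG stand-in */
static atomic_bool needed = true;	/* GLF_INSTANTIATE_NEEDED stand-in */

static int do_instantiate(void)
{
	return 0;	/* placeholder for the expensive, once-only work */
}

static int instantiate(void)
{
	int ret;
again:
	if (!atomic_load(&needed))
		return 0;			/* someone already did it */

	/* test_and_set_bit(): exactly one caller proceeds */
	if (atomic_exchange(&in_prog, true)) {
		while (atomic_load(&in_prog))
			sched_yield();		/* wait_on_bit() stand-in */
		goto again;			/* winner may have failed; re-check */
	}

	ret = do_instantiate();
	if (!ret)
		atomic_store(&needed, false);
	atomic_store(&in_prog, false);		/* clear_and_wake_up_bit() stand-in */
	return ret;
}
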
*/ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, @@ -181,24 +159,31 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } - glock_set_object(ip->i_gl, ip); set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) goto fail; - glock_set_object(ip->i_iopen_gh.gh_gl, ip); + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); gfs2_glock_put(io_gl); - io_gl = NULL; + if (unlikely(error)) + goto fail; /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; + glock_set_object(ip->i_gl, ip); + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_instantiate(&i_gh); - if (error) + if (error) { + glock_clear_object(ip->i_gl, ip); goto fail; + } } else { ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); @@ -206,31 +191,23 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_set_iop(inode); + unlock_new_inode(inode); } if (no_formal_ino && ip->i_no_formal_ino && no_formal_ino != ip->i_no_formal_ino) { - error = -ESTALE; - if (inode->i_state & I_NEW) - goto fail; iput(inode); - return ERR_PTR(error); + return ERR_PTR(-ESTALE); } - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - return inode; fail: - if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); + if (gfs2_holder_initialized(&ip->i_iopen_gh)) gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } - if (io_gl) - gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); iget_failed(inode); @@ -730,18 +707,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - flush_delayed_work(&ip->i_gl->gl_work); error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_free_inode; gfs2_cancel_delete_work(io_gl); + error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); + BUG_ON(error); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_gunlock2; - glock_set_object(ip->i_gl, ip); error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock2; @@ -757,9 +735,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; + glock_set_object(ip->i_gl, ip); glock_set_object(io_gl, ip); gfs2_set_iop(inode); - insert_inode_hash(inode); free_vfs_inode = 0; /* After this point, the inode is no longer considered free. 
Any failures need to undo @@ -801,17 +779,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_glock_dq_uninit(ghs + 1); gfs2_glock_put(io_gl); gfs2_qa_put(dip); + unlock_new_inode(inode); return error; fail_gunlock3: + glock_clear_object(ip->i_gl, ip); glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: - glock_clear_object(io_gl, ip); gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { - glock_clear_object(ip->i_gl, ip); if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } @@ -829,7 +807,10 @@ fail_gunlock: mark_inode_dirty(inode); set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); - iput(inode); + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); } if (gfs2_holder_initialized(ghs + 1)) gfs2_glock_dq_uninit(ghs + 1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b121371508a..64c67090f503 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1244,11 +1244,9 @@ static enum dinode_demise evict_should_delete(struct inode *inode, if (ret) return SHOULD_NOT_DELETE_DINODE; - if (test_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags)) { - ret = gfs2_instantiate(gh); - if (ret) - return SHOULD_NOT_DELETE_DINODE; - } + ret = gfs2_instantiate(gh); + if (ret) + return SHOULD_NOT_DELETE_DINODE; /* * The inode may have been recreated in the meantime. @@ -1402,13 +1400,6 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - if (ip->i_gl) { - glock_clear_object(ip->i_gl, ip); - wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); - gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put_eventually(ip->i_gl); - ip->i_gl = NULL; - } if (gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; @@ -1421,6 +1412,13 @@ out: gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_put_eventually(gl); } + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); + ip->i_gl = NULL; + } } static struct inode *gfs2_alloc_inode(struct super_block *sb) diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c0a34d9ddee4..a6002b2d146d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -767,8 +767,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) wait_for_completion(&sdp->sd_kobj_unregister); } -static int gfs2_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int gfs2_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d5c9d886cd9f..ef481c3d9019 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -924,6 +924,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_op = &hostfs_sbops; sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + err = super_setup_bdi(sb); + if (err) + goto out; /* NULL is printed as '(null)' by printf(): avoid that. 
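
One conversion worth a standalone illustration before the hugetlbfs hunk below: interval trees take an inclusive last index, while the VFS passes half-open [start, end) ranges, so callers must pass end - 1 and map the end == 0 "rest of file" sentinel explicitly. A tiny self-contained example:

#include <limits.h>
#include <stdio.h>

/*
 * Convert an exclusive [start, end) range to the inclusive "last" index an
 * interval tree expects; end == 0 means "to the end of the file", as in
 * hugetlb_vmdelete_list().
 */
static unsigned long range_last(unsigned long end)
{
	return end ? end - 1 : ULONG_MAX;
}

int main(void)
{
	/* [4, 8) covers pages 4..7, so the inclusive last index is 7. */
	printf("%lu\n", range_last(8));		/* 7 */
	printf("%lu\n", range_last(0));		/* ULONG_MAX: unbounded */
	return 0;
}
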
*/ if (req_root == NULL) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 49d2e686be74..a7c6c7498be0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) struct vm_area_struct *vma; /* - * end == 0 indicates that the entire range after - * start should be unmapped. + * end == 0 indicates that the entire range after start should be + * unmapped. Note, end is exclusive, whereas the interval tree takes + * an inclusive "last". */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_offset; unsigned long v_end; diff --git a/fs/inode.c b/fs/inode.c index 3eba0940ffcf..980e7b7a5460 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -180,8 +180,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; - if (sb->s_type->fs_flags & FS_THP_SUPPORT) - __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -528,6 +526,55 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); +void dump_mapping(const struct address_space *mapping) +{ + struct inode *host; + const struct address_space_operations *a_ops; + struct hlist_node *dentry_first; + struct dentry *dentry_ptr; + struct dentry dentry; + unsigned long ino; + + /* + * If mapping is an invalid pointer, we don't want to crash + * accessing it, so probe everything depending on it carefully. + */ + if (get_kernel_nofault(host, &mapping->host) || + get_kernel_nofault(a_ops, &mapping->a_ops)) { + pr_warn("invalid mapping:%px\n", mapping); + return; + } + + if (!host) { + pr_warn("aops:%ps\n", a_ops); + return; + } + + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { + pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); + return; + } + + if (!dentry_first) { + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + return; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (get_kernel_nofault(dentry, dentry_ptr)) { + pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + a_ops, ino, dentry_ptr); + return; + } + + /* + * if dentry is corrupted, the %pd handler may still crash, + * but it's unlikely that we reach here with a corrupt mapping + */ + pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); +} + void clear_inode(struct inode *inode) { /* diff --git a/fs/internal.h b/fs/internal.h index 7979ff8d168c..8590c973c2f4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -37,7 +37,7 @@ static inline int emergency_thaw_bdev(struct super_block *sb) /* * buffer.c */ -int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, +int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* diff --git a/fs/io-wq.c b/fs/io-wq.c index 88202de519f6..a7763127f884 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -142,6 +142,7 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); +static void io_wq_cancel_tw_create(struct io_wq *wq); static bool io_worker_get(struct io_worker *worker) { @@ -357,12 +358,22 @@ static bool io_queue_worker_create(struct 
io_worker *worker, test_and_set_bit_lock(0, &worker->create_state)) goto fail_release; + atomic_inc(&wq->worker_refs); init_task_work(&worker->create_work, func); worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { - clear_bit_unlock(0, &worker->create_state); + /* + * EXIT may have been set after checking it above, check after + * adding the task_work and remove any creation item if it is + * now set. wq exit does that too, but we can have added this + * work item after we canceled in io_wq_exit_workers(). + */ + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) + io_wq_cancel_tw_create(wq); + io_worker_ref_put(wq); return true; } + io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); @@ -384,7 +395,9 @@ static void io_wqe_dec_running(struct io_worker *worker) if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); + raw_spin_unlock(&wqe->lock); io_queue_worker_create(worker, acct, create_worker_cb); + raw_spin_lock(&wqe->lock); } } @@ -657,7 +670,7 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -675,7 +688,7 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -694,7 +707,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, struct task_struct *tsk) { - tsk->pf_io_worker = worker; + tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; @@ -714,6 +727,13 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data) static inline bool io_should_retry_thread(long err) { + /* + * Prevent perpetual task_work retry, if the task (or its group) is + * exiting. + */ + if (fatal_signal_pending(current)) + return false; + switch (err) { case -EAGAIN: case -ERESTARTSYS: @@ -1191,13 +1211,9 @@ void io_wq_exit_start(struct io_wq *wq) set_bit(IO_WQ_BIT_EXIT, &wq->state); } -static void io_wq_exit_workers(struct io_wq *wq) +static void io_wq_cancel_tw_create(struct io_wq *wq) { struct callback_head *cb; - int node; - - if (!wq->task) - return; while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; @@ -1205,6 +1221,16 @@ static void io_wq_exit_workers(struct io_wq *wq) worker = container_of(cb, struct io_worker, create_work); io_worker_cancel_cb(worker); } +} + +static void io_wq_exit_workers(struct io_wq *wq) +{ + int node; + + if (!wq->task) + return; + + io_wq_cancel_tw_create(wq); rcu_read_lock(); for_each_node(node) { diff --git a/fs/io-wq.h b/fs/io-wq.h index 41bf37674a49..dbecd27656c7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -52,6 +52,28 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, list->last = node; } +/** + * wq_list_merge - merge the second list to the first one. + * @list0: the first list + * @list1: the second list + * Return the first node after mergence. 
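
wq_list_merge(), documented just above, is an O(1) splice of two singly linked lists with cached tails, leaving both inputs reinitialised. The same operation as standalone C (structure names shortened here; the semantics, including returning the head of the combined list, match the kernel helper):

#include <stddef.h>

struct node {
	struct node *next;
};

struct list {
	struct node *first;
	struct node *last;
};

static void list_init(struct list *l)
{
	l->first = l->last = NULL;
}

/* Splice l1 onto the tail of l0 in O(1); both inputs end up empty. */
static struct node *list_merge(struct list *l0, struct list *l1)
{
	struct node *ret;

	if (!l0->first) {
		ret = l1->first;		/* first list empty: result is l1 */
	} else {
		ret = l0->first;
		l0->last->next = l1->first;	/* hook l1 behind l0's tail */
	}
	list_init(l0);
	list_init(l1);
	return ret;
}
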
+ */ +static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, + struct io_wq_work_list *list1) +{ + struct io_wq_work_node *ret; + + if (!list0->first) { + ret = list1->first; + } else { + ret = list0->first; + list0->last->next = list1->first; + } + INIT_WQ_LIST(list0); + INIT_WQ_LIST(list1); + return ret; +} + static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -200,6 +222,6 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { return in_task() && (current->flags & PF_IO_WORKER) && - current->pf_io_worker; + current->worker_private; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index b07196b4511c..de9c9de90655 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,7 @@ #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> @@ -108,7 +108,8 @@ #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ + IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ @@ -320,6 +321,7 @@ struct io_submit_state { bool plug_started; bool need_plug; + bool flush_cqes; unsigned short submit_nr; struct blk_plug plug; }; @@ -337,6 +339,7 @@ struct io_ring_ctx { unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; + unsigned int drain_disabled: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -471,6 +474,7 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; + struct io_wq_work_list prior_task_list; struct callback_head task_work; bool task_running; }; @@ -483,8 +487,6 @@ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; - bool canceled; struct wait_queue_entry wait; }; @@ -721,6 +723,7 @@ enum { REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, /* first byte is taken by user flags, shift it to not overlap */ REQ_F_FAIL_BIT = 8, @@ -737,6 +740,7 @@ enum { REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -758,6 +762,8 @@ enum { REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), @@ -791,6 +797,8 @@ enum { REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), }; struct async_poll { @@ -882,6 +890,7 @@ struct io_kiocb { const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + atomic_t poll_refs; }; struct io_tctx_node { @@ -1108,8 +1117,8 @@ static void 
io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags); +static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); + static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1264,6 +1273,26 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } +static unsigned int __io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf = req->kbuf; + unsigned int cflags; + + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(kbuf); + req->kbuf = NULL; + return cflags; +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbuf(req); +} + static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) { bool got = percpu_ref_tryget(ref); @@ -1278,6 +1307,7 @@ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *req; @@ -1293,6 +1323,44 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. 
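
A little further down, req_set_fail() trades REQ_F_CQE_SKIP for REQ_F_SKIP_LINK_CQES: a request that fails must post its own CQE even if the user asked to skip it on success, while the linked requests it takes down stay silent. The flag dance, modelled in a few lines of standalone C (bit values invented for the sketch):

#include <stdio.h>

#define F_CQE_SKIP		(1u << 0)	/* user asked: no CQE on success */
#define F_SKIP_LINK_CQES	(1u << 1)	/* suppress CQEs for failed links */
#define F_FAIL			(1u << 2)

/* Mirror of req_set_fail(): the failing request posts its CQE, but the
 * skip request is transferred to the links it will cancel. */
static unsigned int req_set_fail(unsigned int flags)
{
	flags |= F_FAIL;
	if (flags & F_CQE_SKIP) {
		flags &= ~F_CQE_SKIP;
		flags |= F_SKIP_LINK_CQES;
	}
	return flags;
}

int main(void)
{
	unsigned int flags = req_set_fail(F_CQE_SKIP);

	printf("posts own CQE: %s\n", (flags & F_CQE_SKIP) ? "no" : "yes");
	printf("links stay silent: %s\n",
	       (flags & F_SKIP_LINK_CQES) ? "yes" : "no");
	return 0;
}
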
+ */ +static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) +{ + bool matched; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; @@ -1301,6 +1369,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } } static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -1502,10 +1574,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); @@ -1514,8 +1586,11 @@ static void io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_submit_state *state = &req->ctx->submit_state; + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + if (!(req->flags & REQ_F_CQE_SKIP)) + ctx->submit_state.flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } @@ -1560,7 +1635,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req->ctx, req->user_data, status, 0); + io_fill_cqe_req(req, status, 0); io_put_req_deferred(req); } } @@ -1791,6 +1866,18 @@ static inline void io_get_task_refs(int nr) io_task_refs_refill(tctx); } +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +{ + struct io_uring_task *tctx = task->io_uring; + unsigned int refs = tctx->cached_refs; + + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -1819,8 +1906,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1841,20 +1928,26 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data return io_cqring_event_overflow(ctx, user_data, res, cflags); } -/* not as hot to bloat with inlining */ -static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - return __io_cqring_fill_event(ctx, user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(req->ctx, req->user_data, res, cflags); } -static void 
io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) +{ + ctx->cq_extra++; + return __io_fill_cqe(ctx, user_data, res, cflags); +} + +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); - __io_cqring_fill_event(ctx, req->user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -1874,6 +1967,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post(req, res, cflags); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2062,8 +2164,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); return true; } @@ -2075,6 +2177,7 @@ static void io_fail_links(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; req->link = NULL; while (link) { @@ -2087,7 +2190,10 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, res, 0); + if (!ignore_cqes) { + link->flags &= ~REQ_F_CQE_SKIP; + io_fill_cqe_req(link, res, 0); + } io_put_req_deferred(link); link = nxt; } @@ -2104,8 +2210,8 @@ static bool io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); posted = true; } @@ -2132,7 +2238,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) - io_commit_cqring(req->ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2169,51 +2275,108 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } +static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +static void handle_prev_tw_list(struct io_wq_work_node *node, + struct io_ring_ctx **ctx, bool *uring_locked) +{ + if (*ctx && !*uring_locked) + spin_lock(&(*ctx)->completion_lock); + + do { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != *ctx) { + if (unlikely(!*uring_locked && *ctx)) + ctx_commit_and_unlock(*ctx); + + ctx_flush_and_put(*ctx, uring_locked); + *ctx = req->ctx; + /* if not contended, grab and improve batching */ + 
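+ /*
+  * With ->uring_lock taken the request can run its normal tw
+  * handler and be batched through the locked completion path;
+  * otherwise we fall back to posting the CQE directly under
+  * ->completion_lock via __io_req_complete_post() below.
+  */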
*uring_locked = mutex_trylock(&(*ctx)->uring_lock); + percpu_ref_get(&(*ctx)->refs); + if (unlikely(!*uring_locked)) + spin_lock(&(*ctx)->completion_lock); + } + if (likely(*uring_locked)) + req->io_task_work.func(req, uring_locked); + else + __io_req_complete_post(req, req->result, io_put_kbuf(req)); + node = next; + } while (node); + + if (unlikely(!*uring_locked)) + ctx_commit_and_unlock(*ctx); +} + +static void handle_tw_list(struct io_wq_work_node *node, + struct io_ring_ctx **ctx, bool *locked) +{ + do { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != *ctx) { + ctx_flush_and_put(*ctx, locked); + *ctx = req->ctx; + /* if not contended, grab and improve batching */ + *locked = mutex_trylock(&(*ctx)->uring_lock); + percpu_ref_get(&(*ctx)->refs); + } + req->io_task_work.func(req, locked); + node = next; + } while (node); +} + static void tctx_task_work(struct callback_head *cb) { - bool locked = false; + bool uring_locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); while (1) { - struct io_wq_work_node *node; + struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && locked) + if (!tctx->task_list.first && + !tctx->prior_task_list.first && uring_locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); - node = tctx->task_list.first; + node1 = tctx->prior_task_list.first; + node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - if (!node) + INIT_WQ_LIST(&tctx->prior_task_list); + if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); - if (!node) + if (!node2 && !node1) break; - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &locked); - ctx = req->ctx; - /* if not contended, grab and improve batching */ - locked = mutex_trylock(&ctx->uring_lock); - percpu_ref_get(&ctx->refs); - } - req->io_task_work.func(req, &locked); - node = next; - } while (node); + if (node1) + handle_prev_tw_list(node1, &ctx, &uring_locked); + if (node2) + handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); } - ctx_flush_and_put(ctx, &locked); + ctx_flush_and_put(ctx, &uring_locked); + + /* relaxed read is enough as only the task itself sets ->in_idle */ + if (unlikely(atomic_read(&tctx->in_idle))) + io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req) +static void io_req_task_work_add(struct io_kiocb *req, bool priority) { struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; @@ -2225,7 +2388,10 @@ static void io_req_task_work_add(struct io_kiocb *req) WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + if (priority) + wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); + else + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2250,8 +2416,7 @@ static void io_req_task_work_add(struct io_kiocb *req) spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); + node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2288,19 +2453,19 @@ 
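/*
 * Note on ordering: prior_task_list always drains ahead of task_list.
 * tctx_task_work() above handles node1 (the priority list) first, and
 * the fallback path merges it in front via wq_list_merge().
 */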
static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->result = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static void io_req_task_queue_reissue(struct io_kiocb *req) { req->io_task_work.func = io_queue_async_work; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static inline void io_queue_next(struct io_kiocb *req) @@ -2364,17 +2529,22 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - spin_lock(&ctx->completion_lock); - wq_list_for_each(node, prev, &state->compl_reqs) { - struct io_kiocb *req = container_of(node, struct io_kiocb, + if (state->flush_cqes) { + spin_lock(&ctx->completion_lock); + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - __io_cqring_fill_event(ctx, req->user_data, req->result, - req->cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, req->result, + req->cflags); + } + + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + state->flush_cqes = false; } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); @@ -2405,7 +2575,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } } @@ -2424,24 +2594,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) -{ - unsigned int cflags; - - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; - req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(kbuf); - return cflags; -} - -static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) - return 0; - return io_put_kbuf(req, req->kbuf); -} - static inline bool io_run_task_work(void) { if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { @@ -2504,8 +2656,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) /* order with io_complete_rw_iopoll(), e.g. 
->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; - __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); + if (unlikely(req->flags & REQ_F_CQE_SKIP)) + continue; + + __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); nr_events++; } @@ -2679,9 +2833,9 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void io_req_task_complete(struct io_kiocb *req, bool *locked) +static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_rw_kbuf(req); + unsigned int cflags = io_put_kbuf(req); int res = req->result; if (*locked) { @@ -2692,12 +2846,12 @@ static void io_req_task_complete(struct io_kiocb *req, bool *locked) } } -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, +static void __io_complete_rw(struct io_kiocb *req, long res, unsigned int issue_flags) { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2708,7 +2862,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; req->result = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); + io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -2852,9 +3006,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = file->f_pos; + if (kiocb->ki_pos == -1) { + if (!(file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = file->f_pos; + } else { + kiocb->ki_pos = 0; + } } kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); @@ -2922,10 +3080,9 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, +static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ @@ -2937,28 +3094,21 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, } if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = kiocb->ki_pos; - if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, 0, issue_flags); + req->file->f_pos = req->rw.kiocb.ki_pos; + if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) + __io_complete_rw(req, ret, issue_flags); else - io_rw_done(kiocb, ret); + io_rw_done(&req->rw.kiocb, ret); if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { - unsigned int cflags = io_put_rw_kbuf(req); - struct io_ring_ctx *ctx = req->ctx; - req_set_fail(req); - if (issue_flags & IO_URING_F_UNLOCKED) { - mutex_lock(&ctx->uring_lock); - __io_req_complete(req, issue_flags, ret, cflags); - mutex_unlock(&ctx->uring_lock); - } else { - __io_req_complete(req, issue_flags, ret, cflags); - } + req->result = ret; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req, false); } } } @@ -3186,10 
+3336,12 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, size_t sqe_len; ssize_t ret; - BUILD_BUG_ON(ERR_PTR(0) != NULL); - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) - return ERR_PTR(io_import_fixed(req, rw, iter)); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { + ret = io_import_fixed(req, rw, iter); + if (ret) + return ERR_PTR(ret); + return NULL; + } /* buffer index only valid with fixed read/write, or buffer select */ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) @@ -3207,15 +3359,18 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, } ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); - return ERR_PTR(ret); + if (ret) + return ERR_PTR(ret); + return NULL; } iovec = s->fast_iov; if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, iovec, issue_flags); - if (!ret) - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); - return ERR_PTR(ret); + if (ret) + return ERR_PTR(ret); + iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + return NULL; } ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, @@ -3586,7 +3741,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: - kiocb_done(kiocb, ret, issue_flags); + kiocb_done(req, ret, issue_flags); out_free: /* it's faster to check here then delegate to kfree */ if (iovec) @@ -3683,7 +3838,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: - kiocb_done(kiocb, ret2, issue_flags); + kiocb_done(req, ret2, issue_flags); } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); @@ -4327,6 +4482,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, kfree(nxt); if (++i == nbufs) return i; + cond_resched(); } i++; kfree(buf); @@ -4795,17 +4951,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < min_ret) - req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4841,13 +4998,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - - if (ret < min_ret) + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail(req); + } __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4947,11 +5104,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); } -static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) -{ - return io_put_kbuf(req, req->kbuf); -} - static int 
io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -4989,8 +5141,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct io_buffer *kbuf; unsigned flags; - int min_ret = 0; - int ret, cflags = 0; + int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); @@ -5024,20 +5175,21 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_recv_kbuf(req); /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); return 0; } @@ -5050,8 +5202,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct iovec iov; unsigned flags; - int min_ret = 0; - int ret, cflags = 0; + int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); @@ -5083,16 +5234,18 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&msg.msg_iter); ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; out_free: - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_recv_kbuf(req); - if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail(req); - __io_req_complete(req, issue_flags, ret, cflags); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } + + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); return 0; } @@ -5259,52 +5412,23 @@ struct io_poll_table { int error; }; -static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, - __poll_t mask, io_req_tw_func_t func) -{ - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; +#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK ((1u << 20)-1) - trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); - - list_del_init(&poll->wait.entry); - - req->result = mask; - req->io_task_work.func = func; - - /* - * If this fails, then the task is exiting. When a task exits, the - * work gets canceled, so just cancel this request as well instead - * of executing it. We can't safely execute it anyway, as we may not - * have the needed state needed for it anyway. - */ - io_req_task_work_add(req); - return 1; +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. 
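+ * E.g. when a wakeup races with cancellation, only the side that
+ * bumps the refcount from zero queues task_work; the loser just
+ * leaves a reference behind for the owner to notice and retry on.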
It's disallowed to modify requests while not + * owning it, that prevents from races for enqueueing task_work's and b/w + * arming poll and wakeups. + */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } -static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) - __acquires(&req->ctx->completion_lock) +static void io_poll_mark_cancelled(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - WRITE_ONCE(poll->canceled, true); - - if (!req->result && !READ_ONCE(poll->canceled)) { - struct poll_table_struct pt = { ._key = poll->events }; - - req->result = vfs_poll(req->file, &pt) & poll->events; - } - - spin_lock(&ctx->completion_lock); - if (!req->result && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); - return true; - } - - return false; + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) @@ -5322,133 +5446,199 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) return &req->apoll->poll; } -static void io_poll_remove_double(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) +static void io_poll_req_insert(struct io_kiocb *req) { - struct io_poll_iocb *poll = io_poll_get_double(req); + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; - lockdep_assert_held(&req->ctx->completion_lock); + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} - if (poll && poll->head) { - struct wait_queue_head *head = poll->head; +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - if (poll->wait.private) - req_ref_put(req); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } +static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +{ + struct wait_queue_head *head = poll->head; + + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); } -static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) +static void io_poll_remove_entries(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll_double = io_poll_get_double(req); + + if (poll->head) + io_poll_remove_entry(poll); + if (poll_double && poll_double->head) + io_poll_remove_entry(poll_double); +} + +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action require, which is + * either spurious wakeup or multishot CQE is served. 0 when it's done with + * the request, then the mask is stored in req->result. 
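+ * (In short: <0 is an error, >0 means nothing further to do, and 0
+ * means complete the request using the mask in req->result.)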
+ */ +static int io_poll_check_events(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags = IORING_CQE_F_MORE; - int error; + struct io_poll_iocb *poll = io_poll_get_single(req); + int v; - if (READ_ONCE(req->poll.canceled)) { - error = -ECANCELED; - req->poll.events |= EPOLLONESHOT; - } else { - error = mangle_poll(mask); - } - if (req->poll.events & EPOLLONESHOT) - flags = 0; - if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { - req->poll.events |= EPOLLONESHOT; - flags = 0; - } - if (flags & IORING_CQE_F_MORE) - ctx->cq_extra++; + /* req->task == current here, checking PF_EXITING is safe */ + if (unlikely(req->task->flags & PF_EXITING)) + io_poll_mark_cancelled(req); - return !(flags & IORING_CQE_F_MORE); + do { + v = atomic_read(&req->poll_refs); + + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; + + if (!req->result) { + struct poll_table_struct pt = { ._key = poll->events }; + + req->result = vfs_poll(req->file, &pt) & poll->events; + } + + /* multishot, just fill an CQE and proceed */ + if (req->result && !(poll->events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->result & poll->events); + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->user_data, mask, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (unlikely(!filled)) + return -ECANCELED; + io_cqring_ev_posted(ctx); + } else if (req->result) { + return 0; + } + + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it. + */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + + return 1; } static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; + int ret; - if (io_poll_rewait(req, &req->poll)) { - spin_unlock(&ctx->completion_lock); + ret = io_poll_check_events(req); + if (ret > 0) + return; + + if (!ret) { + req->result = mangle_poll(req->result & req->poll.events); } else { - bool done; + req->result = ret; + req_set_fail(req); + } - if (req->poll.done) { - spin_unlock(&ctx->completion_lock); - return; - } - done = __io_poll_complete(req, req->result); - if (done) { - io_poll_remove_double(req); - hash_del(&req->hash_node); - req->poll.done = true; - } else { - req->result = 0; - add_wait_queue(req->poll.head, &req->poll.wait); - } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + __io_req_complete_post(req, req->result, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} - if (done) { - nxt = io_put_req_find_next(req); - if (nxt) - io_req_task_submit(nxt, locked); - } - } +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req); + if (ret > 0) + return; + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask) +{ + req->result = mask; + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + 
else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + io_req_task_work_add(req, false); } -static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, - int sync, void *key) +static inline void io_poll_execute(struct io_kiocb *req, int res) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); __poll_t mask = key_to_poll(key); - unsigned long flags; - /* for instances that support it check for an event match first: */ + /* for instances that support it check for an event match first */ if (mask && !(mask & poll->events)) return 0; - if (!(poll->events & EPOLLONESHOT)) - return poll->wait.func(&poll->wait, mode, sync, key); - - list_del_init(&wait->entry); - - if (poll->head) { - bool done; - spin_lock_irqsave(&poll->head->lock, flags); - done = list_empty(&poll->wait.entry); - if (!done) + if (io_poll_get_ownership(req)) { + /* optional, saves extra locking for removal in tw handler */ + if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); - /* make sure double remove sees this as being gone */ - wait->private = NULL; - spin_unlock_irqrestore(&poll->head->lock, flags); - if (!done) { - /* use wait func handler, so it matches the rq type */ - poll->wait.func(&poll->wait, mode, sync, key); + poll->head = NULL; } + __io_poll_execute(req, mask); } - req_ref_put(req); return 1; } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; - poll->done = false; - poll->canceled = false; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll_iocb **poll_ptr) @@ -5461,10 +5651,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * if this happens. */ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *poll_one = poll; + struct io_poll_iocb *first = poll; /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) + if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { @@ -5473,21 +5663,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } - /* - * Can't handle multishot for double wait for now, turn it - * into one-shot mode. 
- */ - if (!(poll_one->events & EPOLLONESHOT)) - poll_one->events |= EPOLLONESHOT; + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); - req_ref_get(req); - poll->wait.private = req; - + io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; @@ -5495,6 +5677,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->nr_entries++; poll->head = head; + poll->wait.private = req; if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -5502,70 +5685,24 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, add_wait_queue(head, &poll->wait); } -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); + __io_queue_proc(&pt->req->poll, pt, head, + (struct io_poll_iocb **) &pt->req->async_data); } -static void io_async_task_func(struct io_kiocb *req, bool *locked) +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask) { - struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - - trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); - - if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_remove_double(req); - apoll->poll.done = true; - spin_unlock(&ctx->completion_lock); - - if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, -ECANCELED); -} - -static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->apoll->poll; - - trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, - key_to_poll(key)); - - return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - -static __poll_t __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, - struct io_poll_table *ipt, __poll_t mask, - wait_queue_func_t wake_func) - __acquires(&ctx->completion_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool cancel = false; + int v; INIT_HLIST_NODE(&req->hash_node); - io_init_poll_iocb(poll, mask, wake_func); + io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; poll->wait.private = req; @@ -5574,31 +5711,54 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, ipt->error = 0; ipt->nr_entries = 0; + /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). 
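+ * poll_refs starts at 1, so a wakeup arriving while vfs_poll() runs
+ * cannot take ownership; it only leaves an extra reference that the
+ * atomic_dec_return() below observes, at which point the event is
+ * re-queued to task_work on the waker's behalf.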
+ */ + atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (unlikely(!ipt->nr_entries) && !ipt->error) - ipt->error = -EINVAL; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + } spin_lock(&ctx->completion_lock); - if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) - io_poll_remove_double(req); - if (likely(poll->head)) { - spin_lock_irq(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt->error) - cancel = true; - ipt->error = 0; - mask = 0; - } - if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock_irq(&poll->head->lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) + poll->events |= EPOLLONESHOT; + __io_poll_execute(req, mask); + return 0; } - return mask; + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } enum { @@ -5613,7 +5773,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; @@ -5638,11 +5799,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_set_refcount(req); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, - io_async_wake); - spin_unlock(&ctx->completion_lock); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; @@ -5651,43 +5809,6 @@ static int io_arm_poll_handler(struct io_kiocb *req) return IO_APOLL_OK; } -static bool __io_poll_remove_one(struct io_kiocb *req, - struct io_poll_iocb *poll, bool do_cancel) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete = false; - - if (!poll->head) - return false; - spin_lock_irq(&poll->head->lock); - if (do_cancel) - WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); - do_complete = true; - } - spin_unlock_irq(&poll->head->lock); - hash_del(&req->hash_node); - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - io_poll_remove_double(req); - do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - - if (do_complete) { - io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); - io_commit_cqring(req->ctx); - req_set_fail(req); - io_put_req_deferred(req); - } - return do_complete; -} - /* * Returns true if we found and killed one or more poll requests */ @@ -5696,7 +5817,8 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, { struct hlist_node *tmp; struct io_kiocb *req; - int posted = 0, i; + bool found = false; + int i; spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { @@ -5704,16 +5826,14 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task(req, tsk, cancel_all)) - posted += io_poll_remove_one(req); + if (io_match_task_safe(req, tsk, cancel_all)) { + io_poll_cancel_req(req); + found = true; + } } } spin_unlock(&ctx->completion_lock); - - if (posted) - io_cqring_ev_posted(ctx); - - return posted != 0; + return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, @@ -5734,19 +5854,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, return NULL; } +static bool io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, bool poll_only) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req; + struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); - req = io_poll_find(ctx, sqe_addr, poll_only); if (!req) return -ENOENT; - if (io_poll_remove_one(req)) - return 0; - - return -EALREADY; + io_poll_cancel_req(req); + return 0; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -5796,23 +5923,6 @@ static int io_poll_update_prep(struct io_kiocb *req, return 0; } -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->poll; - - return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); -} - static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; @@ -5825,6 +5935,8 @@ static int 
io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); @@ -5834,100 +5946,60 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - __poll_t mask; - bool done; + int ret; ipt.pt._qproc = io_poll_queue_proc; - mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, - io_poll_wake); - - if (mask) { /* no async, we'd stolen it */ - ipt.error = 0; - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - } - spin_unlock(&ctx->completion_lock); - - if (mask) { - io_cqring_ev_posted(ctx); - if (done) - io_put_req(req); - } - return ipt.error; + ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = ret ?: ipt.error; + if (ret) + __io_req_complete(req, issue_flags, ret, 0); + return 0; } static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; - bool completing; - int ret; + int ret2, ret = 0; + bool locked; spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); - if (!preq) { - ret = -ENOENT; - goto err; - } - - if (!req->poll_update.update_events && !req->poll_update.update_user_data) { - completing = true; - ret = io_poll_remove_one(preq) ? 0 : -EALREADY; - goto err; - } - - /* - * Don't allow racy completion with singleshot, as we cannot safely - * update those. For multishot, if we're racing with completion, just - * let completion re-add it. - */ - completing = !__io_poll_remove_one(preq, &preq->poll, false); - if (completing && (preq->poll.events & EPOLLONESHOT)) { - ret = -EALREADY; - goto err; - } - /* we now have a detached poll request. reissue. */ - ret = 0; -err: - if (ret < 0) { + if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); - req_set_fail(req); - io_req_complete(req, ret); - return 0; - } - /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; + ret = preq ? 
-EALREADY : -ENOENT; + goto out; } - if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; spin_unlock(&ctx->completion_lock); - /* complete update request, we're done with it */ - io_req_complete(req, ret); - - if (!completing) { - ret = io_poll_add(preq, issue_flags); - if (ret < 0) { - req_set_fail(preq); - io_req_complete(preq, ret); + if (req->poll_update.update_events || req->poll_update.update_user_data) { + /* only mask one event flags, keep behavior flags */ + if (req->poll_update.update_events) { + preq->poll.events &= ~0xffff; + preq->poll.events |= req->poll_update.events & 0xffff; + preq->poll.events |= IO_POLL_UNMASK; } - } - return 0; -} + if (req->poll_update.update_user_data) + preq->user_data = req->poll_update.new_user_data; -static void io_req_task_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_timeout_data *data = req->async_data; + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2) + goto out; + } - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(preq); + preq->result = -ECANCELED; + locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &locked); +out: + if (ret < 0) req_set_fail(req); - io_req_complete_post(req, -ETIME, 0); + /* complete update request, we're done with it */ + __io_req_complete(req, issue_flags, ret, 0); + return 0; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5944,8 +6016,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); - req->io_task_work.func = io_req_task_timeout; - io_req_task_work_add(req); + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); + + req->result = -ETIME; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req, false); return HRTIMER_NORESTART; } @@ -5982,7 +6058,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req); req_set_fail(req); - io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); + io_fill_cqe_req(req, -ECANCELED, 0); io_put_req_deferred(req); return 0; } @@ -6071,6 +6147,8 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; + if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + return -EINVAL; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; @@ -6156,6 +6234,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) + return -EINVAL; + data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -6497,12 +6578,15 @@ static __cold void io_drain_req(struct io_kiocb *req) u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. 
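 * The check now runs with ->completion_lock held so that
 * req_need_defer() and the defer-list emptiness test cannot race
 * with a concurrently completing drain.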
*/ + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { + spin_unlock(&ctx->completion_lock); queue: ctx->drain_active = false; io_req_task_queue(req); return; } + spin_unlock(&ctx->completion_lock); ret = io_req_prep_async(req); if (ret) { @@ -6533,10 +6617,8 @@ fail: static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - kfree(req->kbuf); - req->kbuf = NULL; - } + if (req->flags & REQ_F_BUFFER_SELECTED) + io_put_kbuf(req); if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { @@ -6880,10 +6962,11 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; - int ret; + int ret = -ENOENT; if (prev) { - ret = io_try_cancel_userdata(req, prev->user_data); + if (!(req->task->flags & PF_EXITING)) + ret = io_try_cancel_userdata(req, prev->user_data); io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { @@ -6917,7 +7000,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); + io_req_task_work_add(req, false); return HRTIMER_NORESTART; } @@ -7052,10 +7135,10 @@ static void io_init_req_drain(struct io_kiocb *req) * If we need to drain a request in the middle of a link, drain * the head request and the next request/link after the current * link. Considering sequential execution of links, - * IOSQE_IO_DRAIN will be maintained for every request of our + * REQ_F_IO_DRAIN will be maintained for every request of our * link. */ - head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; ctx->drain_next = true; } } @@ -7088,8 +7171,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[opcode].buffer_select) return -EOPNOTSUPP; - if (sqe_flags & IOSQE_IO_DRAIN) + if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) + ctx->drain_disabled = true; + if (sqe_flags & IOSQE_IO_DRAIN) { + if (ctx->drain_disabled) + return -EOPNOTSUPP; io_init_req_drain(req); + } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) @@ -7101,7 +7189,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { ctx->drain_next = false; ctx->drain_active = true; - req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -8215,8 +8303,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, prsrc->tag, 0, 0); - ctx->cq_extra++; + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -8628,6 +8715,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); + INIT_WQ_LIST(&tctx->prior_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -9255,10 +9343,8 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer *buf; unsigned long index; - xa_for_each(&ctx->io_buffers, index, buf) { + 
xa_for_each(&ctx->io_buffers, index, buf) __io_remove_buffers(ctx, buf, index, -1U); - cond_resched(); - } } static void io_req_caches_free(struct io_ring_ctx *ctx) @@ -9562,19 +9648,8 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - bool ret; - - if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - struct io_ring_ctx *ctx = req->ctx; - /* protect against races with linked timeouts */ - spin_lock(&ctx->completion_lock); - ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock(&ctx->completion_lock); - } else { - ret = io_match_task(req, cancel->task, cancel->all); - } - return ret; + return io_match_task_safe(req, cancel->task, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, @@ -9586,7 +9661,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -9764,7 +9839,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) } if (wq) { /* - * Must be after io_uring_del_task_file() (removes nodes under + * Must be after io_uring_del_tctx_node() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ io_wq_put_and_exit(wq); @@ -9779,21 +9854,9 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - unsigned int refs = tctx->cached_refs; - - if (refs) { - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); - } -} - /* * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. */ static __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) @@ -9835,8 +9898,10 @@ static __cold void io_uring_cancel_generic(bool cancel_all, cancel_all); } - prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); + io_run_task_work(); io_uring_drop_tctx_refs(current); + /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did @@ -9846,10 +9911,14 @@ static __cold void io_uring_cancel_generic(bool cancel_all, schedule(); finish_wait(&tctx->wait, &wait); } while (1); - atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); if (cancel_all) { + /* + * We shouldn't run task_works after cancel, so just leave + * ->in_idle set for normal exit. + */ + atomic_dec(&tctx->in_idle); /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } @@ -10127,7 +10196,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, * and sq_tail and cq_head are changed by userspace. But it's ok since * we usually use these info when it is stuck. 
*/ - seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); + seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); @@ -10436,7 +10505,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/fs/ioctl.c b/fs/ioctl.c index 504e69578112..1ed097e94af2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range __user, info[count]); + size = offsetof(struct file_dedupe_range, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 4143a3ff89db..fc070184b7fa 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - buffered-io.o \ + iter.o +iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ fiemap.o \ - iter.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1753c26c8e76..c938bbad075e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,8 +22,8 @@ #include "../internal.h" /* - * Structure allocated for each page or THP when block size < page size - * to track sub-page uptodate status and I/O completions. + * Structure allocated for each folio when block size < folio size + * to track sub-folio uptodate status and I/O completions. */ struct iomap_page { atomic_t read_bytes_pending; @@ -32,27 +32,20 @@ struct iomap_page { unsigned long uptodate[]; }; -static inline struct iomap_page *to_iomap_page(struct page *page) +static inline struct iomap_page *to_iomap_page(struct folio *folio) { - /* - * per-block data is stored in the head page. Callers should - * not be dealing with tail pages, and if they are, they can - * call thp_head() first. 
- */ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - - if (page_has_private(page)) - return (struct iomap_page *)page_private(page); + if (folio_test_private(folio)) + return folio_get_private(folio); return NULL; } static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct page *page) +iomap_page_create(struct inode *inode, struct folio *folio) { - struct iomap_page *iop = to_iomap_page(page); - unsigned int nr_blocks = i_blocks_per_page(inode, page); + struct iomap_page *iop = to_iomap_page(folio); + unsigned int nr_blocks = i_blocks_per_folio(inode, folio); if (iop || nr_blocks <= 1) return iop; @@ -60,40 +53,40 @@ iomap_page_create(struct inode *inode, struct page *page) iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) bitmap_fill(iop->uptodate, nr_blocks); - attach_page_private(page, iop); + folio_attach_private(folio, iop); return iop; } -static void -iomap_page_release(struct page *page) +static void iomap_page_release(struct folio *folio) { - struct iomap_page *iop = detach_page_private(page); - unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); + struct iomap_page *iop = folio_detach_private(folio); + struct inode *inode = folio->mapping->host; + unsigned int nr_blocks = i_blocks_per_folio(inode, folio); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != - PageUptodate(page)); + folio_test_uptodate(folio)); kfree(iop); } /* - * Calculate the range inside the page that we actually need to read. + * Calculate the range inside the folio that we actually need to read. */ -static void -iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, - loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) +static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, + loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { + struct iomap_page *iop = to_iomap_page(folio); loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); - unsigned poff = offset_in_page(*pos); - unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); + size_t poff = offset_in_folio(folio, *pos); + size_t plen = min_t(loff_t, folio_size(folio) - poff, length); unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; @@ -131,7 +124,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, * page cache for blocks that are entirely outside of i_size. 
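 * For instance, with 1k blocks and i_size at 5k, a folio covering
 * 4k-8k only needs its first block read; the blocks past EOF are
 * trimmed off plen just below.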
*/ if (orig_pos <= isize && orig_pos + length > isize) { - unsigned end = offset_in_page(isize - 1) >> block_bits; + unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) plen -= (last - end) * block_size; @@ -141,80 +134,87 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, *lenp = plen; } -static void -iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) +static void iomap_iop_set_range_uptodate(struct folio *folio, + struct iomap_page *iop, size_t off, size_t len) { - struct iomap_page *iop = to_iomap_page(page); - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; unsigned long flags; spin_lock_irqsave(&iop->uptodate_lock, flags); bitmap_set(iop->uptodate, first, last - first + 1); - if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) - SetPageUptodate(page); + if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) + folio_mark_uptodate(folio); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } -static void -iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) +static void iomap_set_range_uptodate(struct folio *folio, + struct iomap_page *iop, size_t off, size_t len) { - if (PageError(page)) + if (folio_test_error(folio)) return; - if (page_has_private(page)) - iomap_iop_set_range_uptodate(page, off, len); + if (iop) + iomap_iop_set_range_uptodate(folio, iop, off, len); else - SetPageUptodate(page); + folio_mark_uptodate(folio); } -static void -iomap_read_page_end_io(struct bio_vec *bvec, int error) +static void iomap_finish_folio_read(struct folio *folio, size_t offset, + size_t len, int error) { - struct page *page = bvec->bv_page; - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = to_iomap_page(folio); if (unlikely(error)) { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(folio); + folio_set_error(folio); } else { - iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); + iomap_set_range_uptodate(folio, iop, offset, len); } - if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) - unlock_page(page); + if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) + folio_unlock(folio); } -static void -iomap_read_end_io(struct bio *bio) +static void iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) - iomap_read_page_end_io(bvec, error); + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); bio_put(bio); } struct iomap_readpage_ctx { - struct page *cur_page; - bool cur_page_in_bio; + struct folio *cur_folio; + bool cur_folio_in_bio; struct bio *bio; struct readahead_control *rac; }; -static loff_t iomap_read_inline_data(const struct iomap_iter *iter, - struct page *page) +/** + * iomap_read_inline_data - copy inline data into the page cache + * @iter: iteration structure + * @folio: folio to copy to + * + * Copy the inline data in @iter into @folio and zero out the rest of the folio. + * Only a single IOMAP_INLINE extent is allowed at the end of each file. + * Returns zero for success to complete the read, or the usual negative errno. 
+ */ +static int iomap_read_inline_data(const struct iomap_iter *iter, + struct folio *folio) { + struct iomap_page *iop; const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); + size_t offset = offset_in_folio(folio, iomap->offset); void *addr; - if (PageUptodate(page)) - return PAGE_SIZE - poff; + if (folio_test_uptodate(folio)) + return 0; if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) return -EIO; @@ -223,15 +223,17 @@ static loff_t iomap_read_inline_data(const struct iomap_iter *iter, return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; - if (poff > 0) - iomap_page_create(iter->inode, page); + if (offset > 0) + iop = iomap_page_create(iter->inode, folio); + else + iop = to_iomap_page(folio); - addr = kmap_local_page(page) + poff; + addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); - iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); - return PAGE_SIZE - poff; + iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); + return 0; } static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, @@ -250,36 +252,36 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; - struct page *page = ctx->cur_page; + struct folio *folio = ctx->cur_folio; struct iomap_page *iop; loff_t orig_pos = pos; - unsigned poff, plen; + size_t poff, plen; sector_t sector; if (iomap->type == IOMAP_INLINE) - return min(iomap_read_inline_data(iter, page), length); + return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, page); - iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen); + iop = iomap_page_create(iter->inode, folio); + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { - zero_user(page, poff, plen); - iomap_set_range_uptodate(page, poff, plen); + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, iop, poff, plen); goto done; } - ctx->cur_page_in_bio = true; + ctx->cur_folio_in_bio = true; if (iop) atomic_add(plen, &iop->read_bytes_pending); sector = iomap_sector(iomap, pos); if (!ctx->bio || bio_end_sector(ctx->bio) != sector || - bio_add_page(ctx->bio, page, plen, poff) != plen) { - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + !bio_add_folio(ctx->bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -302,8 +304,9 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; - __bio_add_page(ctx->bio, page, plen, poff); + bio_add_folio(ctx->bio, folio, plen, poff); } + done: /* * Move the caller beyond our range so that it keeps making progress. 
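/*
 * [Illustrative sketch, not part of the patch.] iomap_readpage(),
 * converted to folios in the hunks below, is the entry point a
 * filesystem wires into its ->readpage address_space operation. The
 * example_iomap_begin() and example_readpage() names here are
 * hypothetical; the mapping simply places the file 1:1 on the block
 * device, where a real filesystem would consult its extent metadata.
 */
#include <linux/fs.h>
#include <linux/iomap.h>

static int example_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	iomap->type = IOMAP_MAPPED;
	iomap->offset = pos;
	iomap->length = length;
	iomap->addr = pos;	/* byte address on iomap->bdev */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static const struct iomap_ops example_iomap_ops = {
	.iomap_begin	= example_iomap_begin,
};

/* Hooked up as .readpage in the filesystem's address_space_operations. */
static int example_readpage(struct file *file, struct page *page)
{
	return iomap_readpage(page, &example_iomap_ops);
}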
@@ -317,30 +320,31 @@ done: int iomap_readpage(struct page *page, const struct iomap_ops *ops) { + struct folio *folio = page_folio(page); struct iomap_iter iter = { - .inode = page->mapping->host, - .pos = page_offset(page), - .len = PAGE_SIZE, + .inode = folio->mapping->host, + .pos = folio_pos(folio), + .len = folio_size(folio), }; struct iomap_readpage_ctx ctx = { - .cur_page = page, + .cur_folio = folio, }; int ret; - trace_iomap_readpage(page->mapping->host, 1); + trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_readpage_iter(&iter, &ctx, 0); if (ret < 0) - SetPageError(page); + folio_set_error(folio); if (ctx.bio) { submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_page_in_bio); + WARN_ON_ONCE(!ctx.cur_folio_in_bio); } else { - WARN_ON_ONCE(ctx.cur_page_in_bio); - unlock_page(page); + WARN_ON_ONCE(ctx.cur_folio_in_bio); + folio_unlock(folio); } /* @@ -359,17 +363,19 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, loff_t done, ret; for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) { - if (!ctx->cur_page_in_bio) - unlock_page(ctx->cur_page); - put_page(ctx->cur_page); - ctx->cur_page = NULL; + if (ctx->cur_folio && + offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + if (!ctx->cur_folio_in_bio) + folio_unlock(ctx->cur_folio); + ctx->cur_folio = NULL; } - if (!ctx->cur_page) { - ctx->cur_page = readahead_page(ctx->rac); - ctx->cur_page_in_bio = false; + if (!ctx->cur_folio) { + ctx->cur_folio = readahead_folio(ctx->rac); + ctx->cur_folio_in_bio = false; } ret = iomap_readpage_iter(iter, ctx, done); + if (ret <= 0) + return ret; } return done; @@ -408,10 +414,9 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) if (ctx.bio) submit_bio(ctx.bio); - if (ctx.cur_page) { - if (!ctx.cur_page_in_bio) - unlock_page(ctx.cur_page); - put_page(ctx.cur_page); + if (ctx.cur_folio) { + if (!ctx.cur_folio_in_bio) + folio_unlock(ctx.cur_folio); } } EXPORT_SYMBOL_GPL(iomap_readahead); @@ -427,7 +432,8 @@ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { - struct iomap_page *iop = to_iomap_page(page); + struct folio *folio = page_folio(page); + struct iomap_page *iop = to_iomap_page(folio); struct inode *inode = page->mapping->host; unsigned len, first, last; unsigned i; @@ -453,36 +459,49 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); int iomap_releasepage(struct page *page, gfp_t gfp_mask) { - trace_iomap_releasepage(page->mapping->host, page_offset(page), - PAGE_SIZE); + struct folio *folio = page_folio(page); + + trace_iomap_releasepage(folio->mapping->host, folio_pos(folio), + folio_size(folio)); /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to * ->releasepage() via shrink_active_list(); skip those here. 
*/ - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; - iomap_page_release(page); + iomap_page_release(folio); return 1; } EXPORT_SYMBOL_GPL(iomap_releasepage); -void -iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) +void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) { - trace_iomap_invalidatepage(page->mapping->host, offset, len); + trace_iomap_invalidatepage(folio->mapping->host, offset, len); /* - * If we're invalidating the entire page, clear the dirty state from it - * and release it to avoid unnecessary buildup of the LRU. + * If we're invalidating the entire folio, clear the dirty state + * from it and release it to avoid unnecessary buildup of the LRU. */ - if (offset == 0 && len == PAGE_SIZE) { - WARN_ON_ONCE(PageWriteback(page)); - cancel_dirty_page(page); - iomap_page_release(page); + if (offset == 0 && len == folio_size(folio)) { + WARN_ON_ONCE(folio_test_writeback(folio)); + folio_cancel_dirty(folio); + iomap_page_release(folio); + } else if (folio_test_large(folio)) { + /* Must release the iop so the page can be split */ + WARN_ON_ONCE(!folio_test_uptodate(folio) && + folio_test_dirty(folio)); + iomap_page_release(folio); } } +EXPORT_SYMBOL_GPL(iomap_invalidate_folio); + +void iomap_invalidatepage(struct page *page, unsigned int offset, + unsigned int len) +{ + iomap_invalidate_folio(page_folio(page), offset, len); +} EXPORT_SYMBOL_GPL(iomap_invalidatepage); #ifdef CONFIG_MIGRATION @@ -490,19 +509,21 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { + struct folio *folio = page_folio(page); + struct folio *newfolio = page_folio(newpage); int ret; - ret = migrate_page_move_mapping(mapping, newpage, page, 0); + ret = folio_migrate_mapping(mapping, newfolio, folio, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); + if (folio_test_private(folio)) + folio_attach_private(newfolio, folio_detach_private(folio)); if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(newfolio, folio); else - migrate_page_states(newpage, page); + folio_migrate_flags(newfolio, folio); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL_GPL(iomap_migrate_page); @@ -521,9 +542,8 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) truncate_pagecache_range(inode, max(pos, i_size), pos + len); } -static int -iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, - unsigned plen, const struct iomap *iomap) +static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, + size_t poff, size_t plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; @@ -532,26 +552,27 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, bio.bi_opf = REQ_OP_READ; bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); bio_set_dev(&bio, iomap->bdev); - __bio_add_page(&bio, page, plen, poff); + bio_add_folio(&bio, folio, plen, poff); return submit_bio_wait(&bio); } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - unsigned len, struct page *page) + size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = iomap_page_create(iter->inode, page); + struct iomap_page *iop = iomap_page_create(iter->inode, folio); loff_t block_size = i_blocksize(iter->inode); 
loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); - unsigned from = offset_in_page(pos), to = from + len, poff, plen; + size_t from = offset_in_folio(folio, pos), to = from + len; + size_t poff, plen; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; - ClearPageError(page); + folio_clear_error(folio); do { - iomap_adjust_read_range(iter->inode, iop, &block_start, + iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; @@ -564,39 +585,35 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (iomap_block_needs_zeroing(iter, block_start)) { if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; - zero_user_segments(page, poff, from, to, poff + plen); + folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_page_sync(block_start, page, + int status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; } - iomap_set_range_uptodate(page, poff, plen); + iomap_set_range_uptodate(folio, iop, poff, plen); } while ((block_start += plen) < block_end); return 0; } static int iomap_write_begin_inline(const struct iomap_iter *iter, - struct page *page) + struct folio *folio) { - int ret; - /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) return -EIO; - ret = iomap_read_inline_data(iter, page); - if (ret < 0) - return ret; - return 0; + return iomap_read_inline_data(iter, folio); } static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - unsigned len, struct page **pagep) + size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct page *page; + struct folio *folio; + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); @@ -606,35 +623,40 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (fatal_signal_pending(current)) return -EINTR; + if (!mapping_large_folio_support(iter->inode->i_mapping)) + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (page_ops && page_ops->page_prepare) { status = page_ops->page_prepare(iter->inode, pos, len); if (status) return status; } - page = grab_cache_page_write_begin(iter->inode->i_mapping, - pos >> PAGE_SHIFT, AOP_FLAG_NOFS); - if (!page) { + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + fgp, mapping_gfp_mask(iter->inode->i_mapping)); + if (!folio) { status = -ENOMEM; goto out_no_page; } + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; if (srcmap->type == IOMAP_INLINE) - status = iomap_write_begin_inline(iter, page); + status = iomap_write_begin_inline(iter, folio); else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) - status = __block_write_begin_int(page, pos, len, NULL, srcmap); + status = __block_write_begin_int(folio, pos, len, NULL, srcmap); else - status = __iomap_write_begin(iter, pos, len, page); + status = __iomap_write_begin(iter, pos, len, folio); if (unlikely(status)) goto out_unlock; - *pagep = page; + *foliop = folio; return 0; out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); iomap_write_failed(iter->inode, pos, len); out_no_page: @@ -644,9 +666,10 @@ out_no_page: } 
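/*
 * [Illustrative sketch, not part of the patch.] The
 * iomap_write_begin()/iomap_write_end() pair being converted above is
 * driven by iomap_file_buffered_write(), which a filesystem typically
 * calls from its ->write_iter. example_file_write_iter() is a
 * hypothetical caller reusing the example_iomap_ops from the earlier
 * sketch.
 */
static ssize_t example_file_write_iter(struct kiocb *iocb,
				       struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
						&example_iomap_ops);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}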
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, - size_t copied, struct page *page) + size_t copied, struct folio *folio) { - flush_dcache_page(page); + struct iomap_page *iop = to_iomap_page(folio); + flush_dcache_folio(folio); /* * The blocks that were entirely written will now be uptodate, so we @@ -659,24 +682,24 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, * non-uptodate page as a zero-length write, and force the caller to * redo the whole thing. */ - if (unlikely(copied < len && !PageUptodate(page))) + if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; - iomap_set_range_uptodate(page, offset_in_page(pos), len); - __set_page_dirty_nobuffers(page); + iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); + filemap_dirty_folio(inode->i_mapping, folio); return copied; } static size_t iomap_write_end_inline(const struct iomap_iter *iter, - struct page *page, loff_t pos, size_t copied) + struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; void *addr; - WARN_ON_ONCE(!PageUptodate(page)); + WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); - flush_dcache_page(page); - addr = kmap_local_page(page) + pos; + flush_dcache_folio(folio); + addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); @@ -686,7 +709,7 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - size_t copied, struct page *page) + size_t copied, struct folio *folio) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -694,12 +717,12 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(iter, page, pos, copied); + ret = iomap_write_end_inline(iter, folio, pos, copied); } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, - copied, page, NULL); + copied, &folio->page, NULL); } else { - ret = __iomap_write_end(iter->inode, pos, len, copied, page); + ret = __iomap_write_end(iter->inode, pos, len, copied, folio); } /* @@ -711,13 +734,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, i_size_write(iter->inode, pos + ret); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - unlock_page(page); + folio_unlock(folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, ret, page); - put_page(page); + page_ops->page_done(iter->inode, pos, ret, &folio->page); + folio_put(folio); if (ret < len) iomap_write_failed(iter->inode, pos, len); @@ -732,6 +755,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) long status = 0; do { + struct folio *folio; struct page *page; unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ @@ -755,16 +779,17 @@ again: break; } - status = iomap_write_begin(iter, pos, bytes, &page); + status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(iter->inode->i_mapping)) 
flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); - status = iomap_write_end(iter, pos, bytes, copied, page); + status = iomap_write_end(iter, pos, bytes, copied, folio); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -830,13 +855,13 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) do { unsigned long offset = offset_in_page(pos); unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); - struct page *page; + struct folio *folio; - status = iomap_write_begin(iter, pos, bytes, &page); + status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - status = iomap_write_end(iter, pos, bytes, bytes, page); + status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -870,26 +895,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) -{ - struct page *page; - int status; - unsigned offset = offset_in_page(pos); - unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - - status = iomap_write_begin(iter, pos, bytes, &page); - if (status) - return status; - - zero_user(page, offset, bytes); - mark_page_accessed(page); - - return iomap_write_end(iter, pos, bytes, bytes, page); -} - static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); @@ -900,14 +907,25 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) return length; do { - s64 bytes; + struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); - if (IS_DAX(iter->inode)) - bytes = dax_iomap_zero(pos, length, iomap); - else - bytes = __iomap_zero_iter(iter, pos, length); - if (bytes < 0) - return bytes; + status = iomap_write_begin(iter, pos, bytes, &folio); + if (status) + return status; + + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + folio_zero_range(folio, offset, bytes); + folio_mark_accessed(folio); + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) + return -EIO; pos += bytes; length -= bytes; @@ -951,21 +969,21 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter, - struct page *page) +static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, + struct folio *folio) { loff_t length = iomap_length(iter); int ret; if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, iter->pos, length, NULL, + ret = __block_write_begin_int(folio, iter->pos, length, NULL, &iter->iomap); if (ret) return ret; - block_commit_write(page, 0, length); + block_commit_write(&folio->page, 0, length); } else { - WARN_ON_ONCE(!PageUptodate(page)); - set_page_dirty(page); + WARN_ON_ONCE(!folio_test_uptodate(folio)); + folio_mark_dirty(folio); } return length; @@ -977,44 +995,43 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, }; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); ssize_t ret; - lock_page(page); - ret = 
page_mkwrite_check_truncate(page, iter.inode); + folio_lock(folio); + ret = folio_mkwrite_check_truncate(folio, iter.inode); if (ret < 0) goto out_unlock; - iter.pos = page_offset(page); + iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_page_mkwrite_iter(&iter, page); + iter.processed = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -static void -iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error, unsigned int len) +static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, + size_t len, int error) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = to_iomap_page(folio); if (error) { - SetPageError(page); + folio_set_error(folio); mapping_set_error(inode->i_mapping, error); } - WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) - end_page_writeback(page); + folio_end_writeback(folio); } /* @@ -1033,8 +1050,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; /* * For the last bio, bi_private points to the ioend, so we @@ -1045,10 +1061,10 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) else next = bio->bi_private; - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error, - bv->bv_len); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_write(inode, fi.folio, fi.length, + error); bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1243,29 +1259,29 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, * first; otherwise finish off the current ioend and start another. 
*/ static void -iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, +iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, struct iomap_page *iop, struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { - sector_t sector = iomap_sector(&wpc->iomap, offset); + sector_t sector = iomap_sector(&wpc->iomap, pos); unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); + size_t poff = offset_in_folio(folio, pos); - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); + wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); } - if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + bio_add_folio(wpc->ioend->io_bio, folio, len, poff); } if (iop) atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); + wbc_account_cgroup_owner(wbc, &folio->page, len); } /* @@ -1287,44 +1303,43 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, - struct page *page, u64 end_offset) + struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, page); + struct iomap_page *iop = iomap_page_create(inode, folio); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); - u64 file_offset; /* file offset of page */ + unsigned nblocks = i_blocks_per_folio(inode, folio); + u64 pos = folio_pos(folio); int error = 0, count = 0, i; LIST_HEAD(submit_list); WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. + * Walk through the folio to find areas to write back. If we + * run off the end of the current map or find the current map + * invalid, grab a new one. */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { + for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { if (iop && !test_bit(i, iop->uptodate)) continue; - error = wpc->ops->map_blocks(wpc, inode, file_offset); + error = wpc->ops->map_blocks(wpc, inode, pos); if (error) break; if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) continue; if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, + iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, &submit_list); count++; } WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!PageLocked(page)); - WARN_ON_ONCE(PageWriteback(page)); - WARN_ON_ONCE(PageDirty(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); /* * We cannot cancel the ioend directly here on error. We may have @@ -1339,17 +1354,17 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * won't be affected by I/O completion and we must unlock it * now. 
*/ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page, file_offset); + if (wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); if (!count) { - ClearPageUptodate(page); - unlock_page(page); + folio_clear_uptodate(folio); + folio_unlock(folio); goto done; } } - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); /* * Preserve the original error if there was one; catch @@ -1370,9 +1385,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * with a partial page truncate on a sub-page block sized filesystem. */ if (!count) - end_page_writeback(page); + folio_end_writeback(folio); done: - mapping_set_error(page->mapping, error); + mapping_set_error(folio->mapping, error); return error; } @@ -1386,16 +1401,15 @@ done: static int iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) { + struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - pgoff_t end_index; - u64 end_offset; - loff_t offset; + struct inode *inode = folio->mapping->host; + u64 end_pos, isize; - trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); + trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); /* - * Refuse to write the page out if we're called from reclaim context. + * Refuse to write the folio out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1409,10 +1423,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) goto redirty; /* - * Is this page beyond the end of the file? + * Is this folio beyond the end of the file? * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. + * The folio index is less than the end_index, adjust the end_pos + * to the highest offset that this folio should represent. * ----------------------------------------------------- * | file mapping | <EOF> | * ----------------------------------------------------- @@ -1421,11 +1435,9 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * | desired writeback range | see else | * ---------------------------------^------------------| */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT; - else { + isize = i_size_read(inode); + end_pos = folio_pos(folio) + folio_size(folio); + if (end_pos > isize) { /* * Check whether the page to write out is beyond or straddles * i_size or not. @@ -1437,7 +1449,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * | | Straddles | * ---------------------------------^-----------|--------| */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); + size_t poff = offset_in_folio(folio, isize); + pgoff_t end_index = isize >> PAGE_SHIFT; /* * Skip the page if it's fully outside i_size, e.g. due to a @@ -1456,8 +1469,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. 
*/ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) + if (folio->index > end_index || + (folio->index == end_index && poff == 0)) goto redirty; /* @@ -1468,17 +1481,15 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; + folio_zero_segment(folio, poff, folio_size(folio)); + end_pos = isize; } - return iomap_writepage_map(wpc, wbc, inode, page, end_offset); + return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b4dc51063d36..03ea367df19a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/backing-dev.h> #include <linux/uio.h> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 35302bc192eb..0b86a4365b66 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -757,6 +757,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); + jbd2_journal_lock_updates(journal); return 0; } @@ -768,6 +769,7 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); */ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { + jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0); write_lock(&journal->j_state_lock); diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 2b4d5013dc5d..6da92ecaf66d 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -161,5 +161,5 @@ static int jffs2_garbage_collect_thread(void *_c) spin_lock(&c->erase_completion_lock); c->gc_task = NULL; spin_unlock(&c->erase_completion_lock); - complete_and_exit(&c->gc_thread_exit, 0); + kthread_complete_and_exit(&c->gc_thread_exit, 0); } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 4fc8cd698d1a..bd7d58d27bfc 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -136,20 +136,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; - pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) - return -ENOMEM; - *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; @@ -160,7 +155,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) - goto out_page; + goto out_err; mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); @@ -190,7 +185,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(fn); 
jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); if (f->metadata) { @@ -205,7 +200,7 @@ jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } jffs2_complete_reservation(c); inode->i_size = pageofs; @@ -213,6 +208,19 @@ } /* + * While getting a page and reading data in, lock c->alloc_sem until + * the page is Uptodate. Otherwise the GC task may attempt to read the same + * page in read_cache_page(), which causes a deadlock. + */ + mutex_lock(&c->alloc_sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + ret = -ENOMEM; + goto release_sem; + } + *pagep = pg; + + /* * Read in the page if it wasn't already present. Cannot optimize away * the whole page write case until jffs2_write_end can handle the * case of a short-copy. @@ -221,15 +229,17 @@ mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); mutex_unlock(&f->sem); - if (ret) - goto out_page; + if (ret) { + unlock_page(pg); + put_page(pg); + goto release_sem; + } } jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); - return ret; -out_page: - unlock_page(pg); - put_page(pg); +release_sem: + mutex_unlock(&c->alloc_sem); +out_err: return ret; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8e0a1378a4b1..e6d9772ddb4c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,6 @@ #include "kernfs-internal.h" -DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ @@ -26,7 +25,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return atomic_read(&kn->active) >= 0; } @@ -457,14 +456,15 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete.
*/ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) + __releases(&kernfs_root(kn)->kernfs_rwsem) + __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn) kernfs_drain_open_files(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); } /** @@ -718,11 +718,12 @@ err_unlock: int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; + struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -753,7 +754,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. @@ -767,7 +768,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -788,7 +789,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -820,7 +821,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name; - lockdep_assert_held_read(&kernfs_rwsem); + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ spin_lock_irq(&kernfs_rename_lock); @@ -859,11 +860,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return kn; } @@ -883,11 +885,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return kn; } @@ -912,6 +915,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); + init_rwsem(&root->kernfs_rwsem); INIT_LIST_HEAD(&root->supers); /* @@ -957,7 +961,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, */ void kernfs_destroy_root(struct kernfs_root *root) { - kernfs_remove(root->kn); /* will also free @root */ + /* + * kernfs_remove() holds the root's kernfs_rwsem, so the root + * shouldn't be freed during the operation.
+ */ + kernfs_get(root->kn); + kernfs_remove(root->kn); + kernfs_put(root->kn); /* will also free @root */ } /** @@ -1035,6 +1045,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; + struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1046,18 +1057,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) /* If the kernfs parent node has changed discard and * proceed to ->lookup. */ - down_read(&kernfs_rwsem); spin_lock(&dentry->d_lock); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { + spin_unlock(&dentry->d_lock); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_dir_changed(parent, dentry)) { - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } - } - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); + } else + spin_unlock(&dentry->d_lock); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. @@ -1066,7 +1078,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) } kn = kernfs_dentry_node(dentry); - down_read(&kernfs_rwsem); + root = kernfs_root(kn); + down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1085,10 +1098,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 1; out_bad: - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } @@ -1102,10 +1115,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; + struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; - down_read(&kernfs_rwsem); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; @@ -1116,7 +1131,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, * create a negative. 
*/ if (!kernfs_active(kn)) { - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); @@ -1131,7 +1146,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); @@ -1254,7 +1269,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1289,8 +1304,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1304,14 +1320,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. @@ -1381,9 +1397,11 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - down_write(&kernfs_rwsem); + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); __kernfs_remove(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } /** @@ -1469,8 +1487,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn) bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* @@ -1498,9 +1517,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); schedule(); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1513,7 +1532,7 @@ bool kernfs_remove_self(struct kernfs_node *kn) */ kernfs_unbreak_active_protection(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -1530,6 +1549,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", @@ -1537,13 +1557,14 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - down_write(&kernfs_rwsem); + root = kernfs_root(parent); + down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); if (kn) return 0; @@ -1562,6 +1583,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; + struct kernfs_root *root; const char *old_name = NULL; int error; @@ -1569,7 +1591,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, 
struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL; - down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1623,7 +1646,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; } @@ -1694,11 +1717,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; + struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; - down_read(&kernfs_rwsem); + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1715,12 +1741,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); } - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 60e2a86c535e..9414a7a60a9f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -847,6 +847,7 @@ static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; + struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); @@ -859,8 +860,9 @@ repeat: kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); + root = kernfs_root(kn); /* kick fsnotify */ - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +900,7 @@ repeat: iput(inode); } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index c0eae1725435..3d783d80f5da 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root; int error; if (!kn) return -EINVAL; - down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr); out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; } @@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root = kernfs_root(kn); - 
down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); generic_fillattr(&init_user_ns, inode, stat); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } @@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; + root = kernfs_root(kn); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); ret = generic_permission(&init_user_ns, inode, mask); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return ret; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f2f909d09f52..cfa79715fc1a 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; @@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - down_read(&kernfs_rwsem); + down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - up_read(&kernfs_rwsem); + up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc) if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; @@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_add(&info->node, &info->root->supers); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } fc->root = dget(sb->s_root); @@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc) void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = info->root; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_del(&info->node); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 19a6c71c6ff5..0ab13824822f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -113,11 +113,12 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; + struct kernfs_root *root = kernfs_root(parent); int error; - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return error; } diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 8317f7ca402b..5052be9261d9 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -148,7 +148,7 @@ static int ndr_read_int16(struct ndr *n, __u16 *value) static int ndr_read_int32(struct ndr *n, __u32 *value) { if (n->offset + sizeof(__u32) > n->length) - return 0; + return -EINVAL; if (value) *value = 
le32_to_cpu(*(__le32 *)ndr_get_field(n)); diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 0a5d8450e835..02a44d28bdaf 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -271,9 +271,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; - if (conn->cipher_type) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 121f8e8c70ac..b8b3a4c28b74 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -915,6 +915,25 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, } } +/** + * smb3_encryption_negotiated() - checks if server and client agreed on enabling encryption + * @conn: smb connection + * + * Return: true if connection should be encrypted, else false + */ +static bool smb3_encryption_negotiated(struct ksmbd_conn *conn) +{ + if (!conn->ops->generate_encryptionkey) + return false; + + /* + * SMB 3.0 and 3.0.2 dialects use the SMB2_GLOBAL_CAP_ENCRYPTION flag. + * SMB 3.1.1 uses the cipher_type field. + */ + return (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) || + conn->cipher_type; +} + static void decode_compress_ctxt(struct ksmbd_conn *conn, struct smb2_compression_capabilities_context *pneg_ctxt) { @@ -1469,8 +1488,7 @@ static int ntlm_authenticate(struct ksmbd_work *work) (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) sess->sign = true; - if (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION && - conn->ops->generate_encryptionkey && + if (smb3_encryption_negotiated(conn) && !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { rc = conn->ops->generate_encryptionkey(sess); if (rc) { @@ -1559,8 +1577,7 @@ static int krb5_authenticate(struct ksmbd_work *work) (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) sess->sign = true; - if ((conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) && - conn->ops->generate_encryptionkey) { + if (smb3_encryption_negotiated(conn)) { retval = conn->ops->generate_encryptionkey(sess); if (retval) { ksmbd_debug(SMB, @@ -1697,8 +1714,10 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) - return -EINVAL; + negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; + } negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + negblob_off); @@ -2960,6 +2979,10 @@ int smb2_open(struct ksmbd_work *work) &pntsd_size, &fattr); posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); + if (rc) { + kfree(pntsd); + goto err_out; + } rc = ksmbd_vfs_set_sd_xattr(conn, user_ns, @@ -4457,6 +4480,12 @@ static void get_file_stream_info(struct ksmbd_work *work, &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + goto out; + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; @@ -4465,12 +4494,6 @@ static void get_file_stream_info(struct ksmbd_work *work, goto out; } - buf_free_len = - smb2_calc_max_out_buf_len(work, 8, - 
le32_to_cpu(req->OutputBufferLength)); - if (buf_free_len < 0) - goto out; - while (idx < xattr_list_len) { stream_name = xattr_list + idx; streamlen = strlen(stream_name); @@ -4496,8 +4519,10 @@ static void get_file_stream_info(struct ksmbd_work *work, ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); next = sizeof(struct smb2_file_stream_info) + streamlen * 2; - if (next > buf_free_len) + if (next > buf_free_len) { + kfree(stream_buf); break; + } file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, @@ -4514,6 +4539,7 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->NextEntryOffset = cpu_to_le32(next); } +out: if (!S_ISDIR(stat.mode) && buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) { file_info = (struct smb2_file_stream_info *) @@ -4522,14 +4548,13 @@ static void get_file_stream_info(struct ksmbd_work *work, "::$DATA", 7, conn->local_nls, 0); streamlen *= 2; file_info->StreamNameLength = cpu_to_le32(streamlen); - file_info->StreamSize = 0; - file_info->StreamAllocationSize = 0; + file_info->StreamSize = cpu_to_le64(stat.size); + file_info->StreamAllocationSize = cpu_to_le64(stat.blocks << 9); nbytes += sizeof(struct smb2_file_stream_info) + streamlen; } /* last entry offset should be 0 */ file_info->NextEntryOffset = 0; -out: kvfree(xattr_list); rsp->OutputBufferLength = cpu_to_le32(nbytes); @@ -5068,7 +5093,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | PROTECTED_DACL_SECINFO | UNPROTECTED_DACL_SECINFO)) { - pr_err("Unsupported addition info: 0x%x)\n", + ksmbd_debug(SMB, "Unsupported addition info: 0x%x)\n", addition_info); pntsd->revision = cpu_to_le16(1); diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index bd792db32623..6ecf55ea1fed 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/mnt_idmapping.h> #include "smbacl.h" #include "smb_common.h" @@ -274,14 +275,7 @@ static int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - /* - * Translate raw sid into kuid in the server's user - * namespace. - */ - uid = make_kuid(&init_user_ns, id); - - /* If this is an idmapped mount, apply the idmapping. */ - uid = kuid_from_mnt(user_ns, uid); + uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -291,14 +285,7 @@ static int sid_to_id(struct user_namespace *user_ns, gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - /* - * Translate raw sid into kgid in the server's user - * namespace. - */ - gid = make_kgid(&init_user_ns, id); - - /* If this is an idmapped mount, apply the idmapping. */ - gid = kgid_from_mnt(user_ns, gid); + gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 73e08cad412b..811af3309429 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/posix_acl.h> +#include <linux/mnt_idmapping.h> #include "mgmt/tree_connect.h" @@ -216,7 +217,7 @@ static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, kuid_t kuid; /* If this is an idmapped mount, apply the idmapping. 
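In sid_to_id() the two-step translation (make_kuid() into the initial namespace, then kuid_from_mnt() for an idmapped mount) collapses into a single mapped_kuid_user() call. The owner branch, condensed from the hunk above:

	uid_t id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
	kuid_t uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id));

	if (uid_valid(uid)) {
		fattr->cf_uid = uid;
		rc = 0;
	}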
*/ - kuid = kuid_into_mnt(mnt_userns, pace->e_uid); + kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ return from_kuid(&init_user_ns, kuid); @@ -228,7 +229,7 @@ static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, kgid_t kgid; /* If this is an idmapped mount, apply the idmapping. */ - kgid = kgid_into_mnt(mnt_userns, pace->e_gid); + kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. */ return from_kgid(&init_user_ns, kgid); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b220e1b91726..0475c5a5d061 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -54,13 +54,9 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; -static struct task_struct *nlmsvc_task; -static struct svc_rqst *nlmsvc_rqst; +static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; -static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); - unsigned int lockd_net_id; /* @@ -184,7 +180,12 @@ lockd(void *vrqstp) nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - return 0; + + dprintk("lockd_down: service stopped\n"); + + svc_exit_thread(rqstp); + + module_put_and_kthread_exit(0); } static int create_lockd_listener(struct svc_serv *serv, const char *name, @@ -290,8 +291,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) __func__, net->ns.inum); } } else { - pr_err("%s: no users! task=%p, net=%x\n", - __func__, nlmsvc_task, net->ns.inum); + pr_err("%s: no users! net=%x\n", + __func__, net->ns.inum); BUG(); } } @@ -302,20 +303,16 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -332,21 +329,17 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin6); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -357,86 +350,22 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static void lockd_unregister_notifiers(void) -{ - unregister_inetaddr_notifier(&lockd_inetaddr_notifier); -#if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&lockd_inet6addr_notifier); -#endif - wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); -} - -static void 
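With nlmsvc_rqst and the nlm_ntf_refcnt wait machinery gone, the lockd address notifiers shrink to a plain check of nlmsvc_serv. The IPv4 handler, reassembled from the hunk above:

static int lockd_inetaddr_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
	struct sockaddr_in sin;

	if (event != NETDEV_DOWN)
		goto out;

	if (nlmsvc_serv) {
		dprintk("lockd_inetaddr_event: removed %pI4\n",
			&ifa->ifa_local);
		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = ifa->ifa_local;
		svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin);
	}
out:
	return NOTIFY_DONE;
}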
lockd_svc_exit_thread(void) -{ - atomic_dec(&nlm_ntf_refcnt); - lockd_unregister_notifiers(); - svc_exit_thread(nlmsvc_rqst); -} - -static int lockd_start_svc(struct svc_serv *serv) -{ - int error; - - if (nlmsvc_rqst) - return 0; - - /* - * Create the kernel thread and wait for it to start. - */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(nlmsvc_rqst)) { - error = PTR_ERR(nlmsvc_rqst); - printk(KERN_WARNING - "lockd_up: svc_rqst allocation failed, error=%d\n", - error); - lockd_unregister_notifiers(); - goto out_rqst; - } - - atomic_inc(&nlm_ntf_refcnt); - svc_sock_update_bufs(serv); - serv->sv_maxconn = nlm_max_connections; - - nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); - if (IS_ERR(nlmsvc_task)) { - error = PTR_ERR(nlmsvc_task); - printk(KERN_WARNING - "lockd_up: kthread_run failed, error=%d\n", error); - goto out_task; - } - nlmsvc_rqst->rq_task = nlmsvc_task; - wake_up_process(nlmsvc_task); - - dprintk("lockd_up: service started\n"); - return 0; - -out_task: - lockd_svc_exit_thread(); - nlmsvc_task = NULL; -out_rqst: - nlmsvc_rqst = NULL; - return error; -} - static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, + .svo_function = lockd, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_module = THIS_MODULE, }; -static struct svc_serv *lockd_create_svc(void) +static int lockd_get(void) { struct svc_serv *serv; + int error; - /* - * Check whether we're already up and running. - */ - if (nlmsvc_rqst) { - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. - */ - svc_get(nlmsvc_rqst->rq_server); - return nlmsvc_rqst->rq_server; + if (nlmsvc_serv) { + svc_get(nlmsvc_serv); + nlmsvc_users++; + return 0; } /* @@ -454,14 +383,41 @@ static struct svc_serv *lockd_create_svc(void) serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } + + serv->sv_maxconn = nlm_max_connections; + error = svc_set_num_threads(serv, NULL, 1); + /* The thread now holds the only reference */ + svc_put(serv); + if (error < 0) + return error; + + nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); - return serv; + nlmsvc_users++; + return 0; +} + +static void lockd_put(void) +{ + if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) + return; + if (--nlmsvc_users) + return; + + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + + svc_set_num_threads(nlmsvc_serv, NULL, 0); + nlmsvc_serv = NULL; + dprintk("lockd_down: service destroyed\n"); } /* @@ -469,36 +425,21 @@ static struct svc_serv *lockd_create_svc(void) */ int lockd_up(struct net *net, const struct cred *cred) { - struct svc_serv *serv; int error; mutex_lock(&nlmsvc_mutex); - serv = lockd_create_svc(); - if (IS_ERR(serv)) { - error = PTR_ERR(serv); - goto err_create; - } + error = lockd_get(); + if (error) + goto err; - error = lockd_up_net(serv, net, cred); + error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { - lockd_unregister_notifiers(); - goto err_put; + lockd_put(); + goto err; } - error = lockd_start_svc(serv); - if (error < 0) { - lockd_down_net(serv, net); - goto err_put; - } - nlmsvc_users++; - /* - * Note: 
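lockd_get() and lockd_put(), both shown above, turn service lifetime into a counted get/put pair under nlmsvc_mutex: the first user creates nlmsvc_serv and starts its single thread via svc_set_num_threads(serv, NULL, 1), and the last user winds the pool back to zero and clears the pointer. lockd_up() then condenses to roughly this (the real function uses a goto label for the unlock):

	mutex_lock(&nlmsvc_mutex);
	error = lockd_get();
	if (!error) {
		error = lockd_up_net(nlmsvc_serv, net, cred);
		if (error < 0)
			lockd_put();
	}
	mutex_unlock(&nlmsvc_mutex);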
svc_serv structures have an initial use count of 1, - * so we exit through here on both success and failure. - */ -err_put: - svc_destroy(serv); -err_create: +err: mutex_unlock(&nlmsvc_mutex); return error; } @@ -511,27 +452,8 @@ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); - lockd_down_net(nlmsvc_rqst->rq_server, net); - if (nlmsvc_users) { - if (--nlmsvc_users) - goto out; - } else { - printk(KERN_ERR "lockd_down: no users! task=%p\n", - nlmsvc_task); - BUG(); - } - - if (!nlmsvc_task) { - printk(KERN_ERR "lockd_down: no lockd running.\n"); - BUG(); - } - kthread_stop(nlmsvc_task); - dprintk("lockd_down: service stopped\n"); - lockd_svc_exit_thread(); - dprintk("lockd_down: service destroyed\n"); - nlmsvc_task = NULL; - nlmsvc_rqst = NULL; -out: + lockd_down_net(nlmsvc_serv, net); + lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e9b85d8fd5fe..cb3658ab9b7a 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -470,8 +470,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct nlm_block *block = NULL; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct inode *inode = nlmsvc_file_inode(file); +#endif + struct nlm_block *block = NULL; int error; int mode; int async_block = 0; @@ -484,7 +486,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + if (nlmsvc_file_file(file)->f_op->lock) { async_block = wait; wait = 0; } diff --git a/fs/namei.c b/fs/namei.c index 1f9d2187c765..d81f04f8d818 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3958,7 +3958,8 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, inode_lock(dentry->d_inode); error = -EBUSY; - if (is_local_mountpoint(dentry)) + if (is_local_mountpoint(dentry) || + (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 659a8f39c61a..dc31ad6b370f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -31,6 +31,7 @@ #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> +#include <linux/mnt_idmapping.h> #include "pnode.h" #include "internal.h" @@ -561,7 +562,7 @@ static void free_vfsmnt(struct mount *mnt) struct user_namespace *mnt_userns; mnt_userns = mnt_user_ns(&mnt->mnt); - if (mnt_userns != &init_user_ns) + if (!initial_idmapping(mnt_userns)) put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP @@ -965,6 +966,7 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; + struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -982,6 +984,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + fs_userns = mnt->mnt.mnt_sb->s_user_ns; + if (!initial_idmapping(fs_userns)) + mnt->mnt.mnt_userns = get_user_ns(fs_userns); + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1072,7 +1078,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, atomic_inc(&sb->s_active); mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (mnt->mnt.mnt_userns != &init_user_ns) + if (!initial_idmapping(mnt->mnt.mnt_userns)) mnt->mnt.mnt_userns = 
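Throughout fs/namespace.c the literal comparison against &init_user_ns becomes initial_idmapping(), and vfs_create_mount() now pins the filesystem's own user namespace up front when the superblock was created in a non-initial one (from the hunk above):

	fs_userns = mnt->mnt.mnt_sb->s_user_ns;
	if (!initial_idmapping(fs_userns))
		mnt->mnt.mnt_userns = get_user_ns(fs_userns);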
get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -3927,28 +3933,32 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; + struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_userns) return 0; /* + * Creating an idmapped mount with the filesystem wide idmapping + * doesn't make sense so block that. We don't allow mushy semantics. + */ + if (kattr->mnt_userns == fs_userns) + return -EINVAL; + + /* * Once a mount has been idmapped we don't allow it to change its * mapping. It makes things simpler and callers can just create * another bind-mount they can idmap if they want to. */ - if (mnt_user_ns(m) != &init_user_ns) + if (is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; - /* Don't yet support filesystem mountable in user namespaces. */ - if (m->mnt_sb->s_user_ns != &init_user_ns) - return -EINVAL; - /* We're not controlling the superblock. */ - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ @@ -4002,14 +4012,27 @@ out: static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns; + struct user_namespace *mnt_userns, *old_mnt_userns; if (!kattr->mnt_userns) return; + /* + * We're the only ones able to change the mount's idmapping. So + * mnt->mnt.mnt_userns is stable and we can retrieve it directly. + */ + old_mnt_userns = mnt->mnt.mnt_userns; + mnt_userns = get_user_ns(kattr->mnt_userns); /* Pairs with smp_load_acquire() in mnt_user_ns(). */ smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); + + /* + * If this is an idmapped filesystem drop the reference we've taken + * in vfs_create_mount() before. + */ + if (!initial_idmapping(old_mnt_userns)) + put_user_ns(old_mnt_userns); } static void mount_setattr_commit(struct mount_kattr *kattr, @@ -4133,13 +4156,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } /* - * The init_user_ns is used to indicate that a vfsmount is not idmapped. - * This is simpler than just having to treat NULL as unmapped. Users - * wanting to idmap a mount to init_user_ns can just use a namespace - * with an identity mapping. + * The initial idmapping cannot be used to create an idmapped + * mount. We use the initial idmapping as an indicator of a mount + * that is not idmapped. It can simply be passed into helpers that + * are aware of idmapped mounts as a convenient shortcut. A user + * can just create a dedicated identity mapping to achieve the same + * result. 
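do_idmap_mount() publishes the new mapping with a release store, pairing with the smp_load_acquire() in mnt_user_ns() so that concurrent readers never observe a half-initialised pointer; only afterwards does it drop the reference vfs_create_mount() took for idmapped filesystems:

	old_mnt_userns = mnt->mnt.mnt_userns;
	mnt_userns = get_user_ns(kattr->mnt_userns);
	/* Pairs with smp_load_acquire() in mnt_user_ns(). */
	smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);

	if (!initial_idmapping(old_mnt_userns))
		put_user_ns(old_mnt_userns);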
*/ mnt_userns = container_of(ns, struct user_namespace, ns); - if (mnt_userns == &init_user_ns) { + if (initial_idmapping(mnt_userns)) { err = -EPERM; goto out_fput; } @@ -4263,12 +4288,11 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); - if (err) - return err; - - err = do_mount_setattr(&target, &kattr); + if (!err) { + err = do_mount_setattr(&target, &kattr); + path_put(&target); + } finish_mount_kattr(&kattr); - path_put(&target); return err; } diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 9320a42dfaf9..6169659857b3 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -170,7 +170,7 @@ static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error */ static void netfs_read_from_cache(struct netfs_read_request *rreq, struct netfs_read_subrequest *subreq, - bool seek_data) + enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; struct iov_iter iter; @@ -180,7 +180,7 @@ static void netfs_read_from_cache(struct netfs_read_request *rreq, subreq->start + subreq->transferred, subreq->len - subreq->transferred); - cres->ops->read(cres, subreq->start, &iter, seek_data, + cres->ops->read(cres, subreq->start, &iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -323,7 +323,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size); + rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); @@ -354,16 +354,11 @@ static void netfs_rreq_write_to_cache_work(struct work_struct *work) netfs_rreq_do_write_to_cache(rreq); } -static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq, - bool was_async) +static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq) { - if (was_async) { - rreq->work.func = netfs_rreq_write_to_cache_work; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_rreq_do_write_to_cache(rreq); - } + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); } /* @@ -466,7 +461,7 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq, netfs_get_read_subrequest(subreq); atomic_inc(&rreq->nr_rd_ops); if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, true); + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); else netfs_read_from_server(rreq, subreq); } @@ -558,7 +553,7 @@ again: wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags)) - return netfs_rreq_write_to_cache(rreq, was_async); + return netfs_rreq_write_to_cache(rreq); netfs_rreq_completed(rreq, was_async); } @@ -794,7 +789,7 @@ static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq, netfs_read_from_server(rreq, subreq); break; case NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, false); + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); break; default: BUG(); @@ -960,7 +955,7 @@ int netfs_readpage(struct file *file, rreq = netfs_alloc_read_request(ops, netfs_priv, file); if (!rreq) { if (netfs_priv) - ops->cleanup(netfs_priv, folio_file_mapping(folio)); + ops->cleanup(folio_file_mapping(folio), netfs_priv); folio_unlock(folio); return -ENOMEM; } @@ -1008,8 +1003,8 @@ out: } 
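netfs_read_from_cache() trades its bool seek_data for enum netfs_read_from_hole, so each caller states its hole policy explicitly. The two call sites above, plus the third enum value used by the NFS fallback reader later in this series:

	/* retrying a short read: a hole must come back as zeroes */
	netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);

	/* submitting a fresh slice: no special hole handling */
	netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);

	/* NETFS_READ_HOLE_FAIL: error out instead, see fs/nfs/fscache.c */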
EXPORT_SYMBOL(netfs_readpage); -/** - * netfs_skip_folio_read - prep a folio for writing without reading first +/* + * Prepare a folio for writing without reading first * @folio: The folio being prepared * @pos: starting position for the write * @len: length of write @@ -1191,7 +1186,7 @@ have_folio: goto error; have_folio_no_wait: if (netfs_priv) - ops->cleanup(netfs_priv, mapping); + ops->cleanup(mapping, netfs_priv); *_folio = folio; _leave(" = 0"); return 0; @@ -1202,7 +1197,7 @@ error: folio_unlock(folio); folio_put(folio); if (netfs_priv) - ops->cleanup(netfs_priv, mapping); + ops->cleanup(mapping, netfs_priv); _leave(" = %d", ret); return ret; } diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 22d11fdc6deb..5f6db37f461e 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -12,7 +12,7 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ export.o sysfs.o fs_context.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o obj-$(CONFIG_NFS_V2) += nfsv2.o nfsv2-y := nfs2super.o proc.o nfs2xdr.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86d856de1389..054cc1255fac 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -93,7 +93,7 @@ nfs4_callback_svc(void *vrqstp) svc_process(rqstp); } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -137,7 +137,7 @@ nfs41_callback_svc(void *vrqstp) } } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -169,12 +169,12 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - if (serv->sv_nrthreads-1 == nrservs) + if (serv->sv_nrthreads == nrservs) return 0; - ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + ret = svc_set_num_threads(serv, NULL, nrservs); if (ret) { - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); return ret; } dprintk("nfs_callback_up: service started\n"); @@ -235,14 +235,12 @@ err_bind: static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -266,14 +264,8 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) /* * Check whether we're already up and running. */ - if (cb_info->serv) { - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. 
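nfs_callback_start_svc() no longer routes through the per-service svo_setup hook: it calls svc_set_num_threads() directly, drops the old sv_nrthreads-1 allowance for the creator's implicit thread, and rolls the pool back to zero on failure:

	if (serv->sv_nrthreads == nrservs)
		return 0;

	ret = svc_set_num_threads(serv, NULL, nrservs);
	if (ret) {
		svc_set_num_threads(serv, NULL, 0);
		return ret;
	}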
- */ - svc_get(cb_info->serv); - return cb_info->serv; - } + if (cb_info->serv) + return svc_get(cb_info->serv); switch (minorversion) { case 0: @@ -294,7 +286,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); @@ -335,16 +327,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_start; cb_info->users++; - /* - * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_prepare_thread increments that. So we need to call svc_destroy - * on both success and failure so that the refcount is 1 when the - * thread exits. - */ err_net: if (!cb_info->users) cb_info->serv = NULL; - svc_destroy(serv); + svc_put(serv); err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -369,8 +355,8 @@ void nfs_callback_down(int minorversion, struct net *net) cb_info->users--; if (cb_info->users == 0) { svc_get(serv); - serv->sv_ops->svo_setup(serv, NULL, 0); - svc_destroy(serv); + svc_set_num_threads(serv, NULL, 0); + svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e4dc1ab9312..8d8b85b5a641 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -183,8 +183,6 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; - nfs_fscache_get_client_cookie(clp); - return clp; error_cleanup: @@ -238,8 +236,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - nfs_fscache_release_client_cookie(clp); - /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 731d31015b6a..347793626f19 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -18,6 +18,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. 
-DaveM */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9cff8709c80a..eabfdab543c8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -59,6 +59,7 @@ #include "internal.h" #include "iostat.h" #include "pnfs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -959,6 +960,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) } else { result = requested; } + nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE); out_release: nfs_direct_req_release(dreq); out: diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 171c424cb6d5..01596f2d0a1e 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -158,5 +158,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, + EXPORT_OP_NOATOMIC_ATTR, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 24e7dccce355..76d76acbc594 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -84,6 +84,7 @@ nfs_file_release(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSRELEASE); nfs_file_clear_open_context(filp); + nfs_fscache_release_file(inode, filp); return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -415,8 +416,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page_file_mapping(page)->host, page); - - nfs_fscache_invalidate_page(page, page->mapping->host); + wait_on_page_fscache(page); } /* @@ -475,12 +475,11 @@ static void nfs_check_dirty_writeback(struct page *page, static int nfs_launder_page(struct page *page) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", inode->i_ino, (long long)page_offset(page)); - nfs_fscache_wait_on_page_write(nfsi, page); + wait_on_page_fscache(page); return nfs_wb_page(inode, page); } @@ -555,7 +554,11 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - nfs_fscache_wait_on_page_write(NFS_I(inode), page); + if (PageFsCache(page) && + wait_on_page_fscache_killable(vmf->page) < 0) { + ret = VM_FAULT_RETRY; + goto out; + } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 0d444a90f513..ea17fa1f31ec 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -10,6 +10,7 @@ * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com> */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c deleted file mode 100644 index 573b1da9342c..000000000000 --- a/fs/nfs/fscache-index.c +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* NFS FS-Cache index structure definition - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/nfs_fs.h> -#include <linux/nfs_fs_sb.h> -#include <linux/in6.h> -#include <linux/iversion.h> - -#include "internal.h" -#include "fscache.h" - -#define NFSDBG_FACILITY NFSDBG_FSCACHE - -/* - * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks - * the cookie for the top-level index object for NFS into here. The top-level - * index can than have other cache objects inserted into it. - */ -struct fscache_netfs nfs_fscache_netfs = { - .name = "nfs", - .version = 0, -}; - -/* - * Register NFS for caching - */ -int nfs_fscache_register(void) -{ - return fscache_register_netfs(&nfs_fscache_netfs); -} - -/* - * Unregister NFS for caching - */ -void nfs_fscache_unregister(void) -{ - fscache_unregister_netfs(&nfs_fscache_netfs); -} - -/* - * Define the server object for FS-Cache. This is used to describe a server - * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and - * server address parameters. - */ -const struct fscache_cookie_def nfs_fscache_server_index_def = { - .name = "NFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -/* - * Define the superblock object for FS-Cache. This is used to describe a - * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS - * parameters that might cause a separate superblock. - */ -const struct fscache_cookie_def nfs_fscache_super_index_def = { - .name = "NFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -/* - * Consult the netfs about the state of an object - * - This function can be absent if the index carries no state data - * - The netfs data from the cookie being used as the target is - * presented, as is the auxiliary data - */ -static -enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Get an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - * - The read context is passed back to NFS in the event that a data read on the - * cache fails with EIO - in which case the server must be contacted to - * retrieve the data, which requires the read context for security. - */ -static void nfs_fh_get_context(void *cookie_netfs_data, void *context) -{ - get_nfs_open_context(context); -} - -/* - * Release an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - */ -static void nfs_fh_put_context(void *cookie_netfs_data, void *context) -{ - if (context) - put_nfs_open_context(context); -} - -/* - * Define the inode object for FS-Cache. 
This is used to describe an inode - * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for - * an inode. - * - * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime - * held in the cache auxiliary data for the data storage object with those in - * the inode struct in memory. - */ -const struct fscache_cookie_def nfs_fscache_inode_object_def = { - .name = "NFS.fh", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = nfs_fscache_inode_check_aux, - .get_context = nfs_fh_get_context, - .put_context = nfs_fh_put_context, -}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d743629e05e1..cfe901650ab0 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -22,24 +22,18 @@ #define NFSDBG_FACILITY NFSDBG_FSCACHE -static struct rb_root nfs_fscache_keys = RB_ROOT; -static DEFINE_SPINLOCK(nfs_fscache_keys_lock); +#define NFS_MAX_KEY_LEN 1000 -/* - * Layout of the key for an NFS server cache object. - */ -struct nfs_server_key { - struct { - uint16_t nfsversion; /* NFS protocol version */ - uint32_t minorversion; /* NFSv4 minor version */ - uint16_t family; /* address family */ - __be16 port; /* IP port */ - } hdr; - union { - struct in_addr ipv4_addr; /* IPv4 address */ - struct in6_addr ipv6_addr; /* IPv6 address */ - }; -} __packed; +static bool nfs_append_int(char *key, int *_len, unsigned long long x) +{ + if (*_len > NFS_MAX_KEY_LEN) + return false; + if (x == 0) + key[(*_len)++] = ','; + else + *_len += sprintf(key + *_len, ",%llx", x); + return true; +} /* * Get the per-client index cookie for an NFS client if the appropriate mount @@ -47,160 +41,108 @@ struct nfs_server_key { * - We always try and get an index cookie for the client, but get filehandle * cookies on a per-superblock basis, depending on the mount flags */ -void nfs_fscache_get_client_cookie(struct nfs_client *clp) +static bool nfs_fscache_get_client_key(struct nfs_client *clp, + char *key, int *_len) { const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; - struct nfs_server_key key; - uint16_t len = sizeof(key.hdr); - memset(&key, 0, sizeof(key)); - key.hdr.nfsversion = clp->rpc_ops->version; - key.hdr.minorversion = clp->cl_minorversion; - key.hdr.family = clp->cl_addr.ss_family; + *_len += snprintf(key + *_len, NFS_MAX_KEY_LEN - *_len, + ",%u.%u,%x", + clp->rpc_ops->version, + clp->cl_minorversion, + clp->cl_addr.ss_family); switch (clp->cl_addr.ss_family) { case AF_INET: - key.hdr.port = sin->sin_port; - key.ipv4_addr = sin->sin_addr; - len += sizeof(key.ipv4_addr); - break; + if (!nfs_append_int(key, _len, sin->sin_port) || + !nfs_append_int(key, _len, sin->sin_addr.s_addr)) + return false; + return true; case AF_INET6: - key.hdr.port = sin6->sin6_port; - key.ipv6_addr = sin6->sin6_addr; - len += sizeof(key.ipv6_addr); - break; + if (!nfs_append_int(key, _len, sin6->sin6_port) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[0]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[1]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[2]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[3])) + return false; + return true; default: printk(KERN_WARNING "NFS: Unknown network family '%d'\n", clp->cl_addr.ss_family); - clp->fscache = NULL; - return; + return false; } - - /* create a cache index for looking up filehandles */ - clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, - &nfs_fscache_server_index_def, - &key, len, - NULL, 0, - clp, 
0, true); - dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", - clp, clp->fscache); -} - -/* - * Dispose of a per-client cookie - */ -void nfs_fscache_release_client_cookie(struct nfs_client *clp) -{ - dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", - clp, clp->fscache); - - fscache_relinquish_cookie(clp->fscache, NULL, false); - clp->fscache = NULL; } /* - * Get the cache cookie for an NFS superblock. We have to handle - * uniquification here because the cache doesn't do it for us. + * Get the cache cookie for an NFS superblock. * * The default uniquifier is just an empty string, but it may be overridden * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent * superblock across an automount point of some nature. */ -void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) +int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) { - struct nfs_fscache_key *key, *xkey; + struct fscache_volume *vcookie; struct nfs_server *nfss = NFS_SB(sb); - struct rb_node **p, *parent; - int diff; + unsigned int len = 3; + char *key; - nfss->fscache_key = NULL; - nfss->fscache = NULL; - if (!uniq) { - uniq = ""; - ulen = 1; + if (uniq) { + nfss->fscache_uniq = kmemdup_nul(uniq, ulen, GFP_KERNEL); + if (!nfss->fscache_uniq) + return -ENOMEM; } - key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + key = kmalloc(NFS_MAX_KEY_LEN + 24, GFP_KERNEL); if (!key) - return; - - key->nfs_client = nfss->nfs_client; - key->key.super.s_flags = sb->s_flags & NFS_SB_MASK; - key->key.nfs_server.flags = nfss->flags; - key->key.nfs_server.rsize = nfss->rsize; - key->key.nfs_server.wsize = nfss->wsize; - key->key.nfs_server.acregmin = nfss->acregmin; - key->key.nfs_server.acregmax = nfss->acregmax; - key->key.nfs_server.acdirmin = nfss->acdirmin; - key->key.nfs_server.acdirmax = nfss->acdirmax; - key->key.nfs_server.fsid = nfss->fsid; - key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; - - key->key.uniq_len = ulen; - memcpy(key->key.uniquifier, uniq, ulen); - - spin_lock(&nfs_fscache_keys_lock); - p = &nfs_fscache_keys.rb_node; - parent = NULL; - while (*p) { - parent = *p; - xkey = rb_entry(parent, struct nfs_fscache_key, node); - - if (key->nfs_client < xkey->nfs_client) - goto go_left; - if (key->nfs_client > xkey->nfs_client) - goto go_right; - - diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - - if (key->key.uniq_len == 0) - goto non_unique; - diff = memcmp(key->key.uniquifier, - xkey->key.uniquifier, - key->key.uniq_len); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - goto non_unique; - - go_left: - p = &(*p)->rb_left; - continue; - go_right: - p = &(*p)->rb_right; + return -ENOMEM; + + memcpy(key, "nfs", 3); + if (!nfs_fscache_get_client_key(nfss->nfs_client, key, &len) || + !nfs_append_int(key, &len, nfss->fsid.major) || + !nfs_append_int(key, &len, nfss->fsid.minor) || + !nfs_append_int(key, &len, sb->s_flags & NFS_SB_MASK) || + !nfs_append_int(key, &len, nfss->flags) || + !nfs_append_int(key, &len, nfss->rsize) || + !nfs_append_int(key, &len, nfss->wsize) || + !nfs_append_int(key, &len, nfss->acregmin) || + !nfs_append_int(key, &len, nfss->acregmax) || + !nfs_append_int(key, &len, nfss->acdirmin) || + !nfs_append_int(key, &len, nfss->acdirmax) || + !nfs_append_int(key, &len, nfss->client->cl_auth->au_flavor)) + goto out; + + if (ulen > 0) { + if (ulen > NFS_MAX_KEY_LEN - len) + goto out; + key[len++] = ','; + 
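nfs_append_int(), defined above, keeps the volume key printable by appending each component as ",%llx", with zero shortened to a bare comma and a hard stop at NFS_MAX_KEY_LEN. A couple of illustrative values (not taken from the patch):

	/* x == 0      appends ","     */
	/* x == 2049   appends ",801"  */
	/* x == 0xffff appends ",ffff" */

	if (!nfs_append_int(key, &len, nfss->fsid.major) ||
	    !nfs_append_int(key, &len, nfss->fsid.minor))
		goto out;	/* key would overflow, give up quietly */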
memcpy(key + len, uniq, ulen); + len += ulen; } - - rb_link_node(&key->node, parent, p); - rb_insert_color(&key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - nfss->fscache_key = key; + key[len] = 0; /* create a cache index for looking up filehandles */ - nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, - &nfs_fscache_super_index_def, - &key->key, - sizeof(key->key) + ulen, - NULL, 0, - nfss, 0, true); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + NULL, 0 /* coherency_data */); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - return; + nfss, vcookie); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(key); + return PTR_ERR(vcookie); + } + pr_err("NFS: Cache volume key already in use (%s)\n", key); + vcookie = NULL; + } + nfss->fscache = vcookie; -non_unique: - spin_unlock(&nfs_fscache_keys_lock); +out: kfree(key); - nfss->fscache_key = NULL; - nfss->fscache = NULL; - printk(KERN_WARNING "NFS:" - " Cache request denied due to non-unique superblock keys\n"); + return 0; } /* @@ -213,29 +155,9 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); - fscache_relinquish_cookie(nfss->fscache, NULL, false); + fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; - - if (nfss->fscache_key) { - spin_lock(&nfs_fscache_keys_lock); - rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - kfree(nfss->fscache_key); - nfss->fscache_key = NULL; - } -} - -static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, - struct nfs_inode *nfsi) -{ - memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + kfree(nfss->fscache_uniq); } /* @@ -254,10 +176,12 @@ void nfs_fscache_init_inode(struct inode *inode) nfs_fscache_update_auxdata(&auxdata, nfsi); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi->fh.data, nfsi->fh.size, - &auxdata, sizeof(auxdata), - nfsi, nfsi->vfs_inode.i_size, false); + 0, + nfsi->fh.data, /* index_key */ + nfsi->fh.size, + &auxdata, /* aux_data */ + sizeof(auxdata), + i_size_read(&nfsi->vfs_inode)); } /* @@ -265,24 +189,15 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { - struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - nfs_fscache_update_auxdata(&auxdata, nfsi); - fscache_relinquish_cookie(cookie, &auxdata, false); + fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } -static bool nfs_fscache_can_enable(void *data) -{ - struct inode *inode = data; - - return !inode_is_open_for_write(inode); -} - /* * Enable or disable caching for a file that is being opened as appropriate. 
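The inode cookie now hangs off the superblock's fscache_volume instead of a per-netfs index hierarchy, and fscache_acquire_cookie() takes the coherency auxdata and object size directly. The acquisition from nfs_fscache_init_inode() above; labelling the bare 0 as advice flags is an assumption, the hunk leaves it uncommented:

	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
					       0,		/* advice, assumed */
					       nfsi->fh.data,	/* index_key */
					       nfsi->fh.size,
					       &auxdata,	/* aux_data */
					       sizeof(auxdata),
					       i_size_read(&nfsi->vfs_inode));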
* The cookie is allocated when the inode is initialised, but is not enabled at @@ -307,100 +222,104 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); + bool open_for_write = inode_is_open_for_write(inode); if (!fscache_cookie_valid(cookie)) return; - nfs_fscache_update_auxdata(&auxdata, nfsi); - - if (inode_is_open_for_write(inode)) { + fscache_use_cookie(cookie, open_for_write); + if (open_for_write) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); - clear_bit(NFS_INO_FSCACHE, &nfsi->flags); - fscache_disable_cookie(cookie, &auxdata, true); - fscache_uncache_all_inode_pages(cookie, inode); - } else { - dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); - fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size, - nfs_fscache_can_enable, inode); - if (fscache_cookie_enabled(cookie)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); } } EXPORT_SYMBOL_GPL(nfs_fscache_open_file); -/* - * Release the caching state associated with a page, if the page isn't busy - * interacting with the cache. - * - Returns true (can release page) or false (page busy). - */ -int nfs_fscache_release_page(struct page *page, gfp_t gfp) +void nfs_fscache_release_file(struct inode *inode, struct file *filp) { - if (PageFsCache(page)) { - struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); - - BUG_ON(!cookie); - dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(page->mapping->host)); - - if (!fscache_maybe_release_page(cookie, page, gfp)) - return 0; + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); + if (fscache_cookie_valid(cookie)) { + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_unuse_cookie(cookie, &auxdata, NULL); } +} - return 1; +static inline void fscache_end_operation(struct netfs_cache_resources *cres) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); } /* - * Release the caching state associated with a page if undergoing complete page - * invalidation. + * Fallback page reading interface. 
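File open and last release now map onto the cookie use/unuse pair: nfs_fscache_open_file() marks the cookie in use (and invalidates with FSCACHE_INVAL_DIO_WRITE if the file is open for writing), while nfs_fscache_release_file() hands the updated coherency auxdata back as it lets go. Condensed from the hunks above:

	/* on open */
	fscache_use_cookie(cookie, open_for_write);

	/* on final release */
	nfs_fscache_update_auxdata(&auxdata, nfsi);
	fscache_unuse_cookie(cookie, &auxdata, NULL);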
*/ -void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +static int fscache_fallback_read_page(struct inode *inode, struct page *page) { + struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + int ret; - BUG_ON(!cookie); - - dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(inode)); + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); - fscache_wait_on_page_write(cookie, page); + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; - BUG_ON(!PageLocked(page)); - fscache_uncache_page(cookie, page); - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); + ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, + NULL, NULL); + fscache_end_operation(&cres); + return ret; } /* - * Handle completion of a page being read from the cache. - * - Called in process (keventd) context. + * Fallback page writing interface. */ -static void nfs_readpage_from_fscache_complete(struct page *page, - void *context, - int error) +static int fscache_fallback_write_page(struct inode *inode, struct page *page, + bool no_space_allocated_yet) { - dfprintk(FSCACHE, - "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", - page, context, error); - - /* - * If the read completes with an error, mark the page with PG_checked, - * unlock the page, and let the VM reissue the readpage. - */ - if (!error) - SetPageUptodate(page); - else - SetPageChecked(page); - unlock_page(page); + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + loff_t start = page_offset(page); + size_t len = PAGE_SIZE; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_write_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + no_space_allocated_yet); + if (ret == 0) + ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + fscache_end_operation(&cres); + return ret; } /* * Retrieve a page from fscache */ -int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, struct page *page) +int __nfs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; @@ -409,112 +328,49 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, nfs_i_fscache(inode), page, page->index, page->flags, inode); if (PageChecked(page)) { + dfprintk(FSCACHE, "NFS: readpage_from_fscache: PageChecked\n"); ClearPageChecked(page); return 1; } - ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), - page, - nfs_readpage_from_fscache_complete, - ctx, - GFP_KERNEL); - - switch (ret) { - case 0: /* read BIO submitted (page in fscache) */ - dfprintk(FSCACHE, - "NFS: readpage_from_fscache: BIO submitted\n"); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); - return ret; - - case -ENOBUFS: /* inode not in cache */ - case -ENODATA: /* page not in cache */ + ret = fscache_fallback_read_page(inode, page); + if (ret < 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); dfprintk(FSCACHE, - "NFS: readpage_from_fscache %d\n", ret); - 
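Both fallback helpers drive a single page through the netfs cache ops by wrapping it in a one-element bio_vec behind an iov_iter. The read side, condensed from fscache_fallback_read_page() above:

	struct bio_vec bvec[1];
	struct iov_iter iter;

	bvec[0].bv_page   = page;
	bvec[0].bv_offset = 0;
	bvec[0].bv_len    = PAGE_SIZE;
	iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);

	ret = fscache_begin_read_operation(&cres, cookie);
	if (ret < 0)
		return ret;
	ret = fscache_read(&cres, page_offset(page), &iter,
			   NETFS_READ_HOLE_FAIL, NULL, NULL);
	fscache_end_operation(&cres);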
return 1; - - default: - dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - } - return ret; -} - -/* - * Retrieve a set of pages from fscache - */ -int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - unsigned npages = *nr_pages; - int ret; - - dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - nfs_i_fscache(inode), npages, inode); - - ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), - mapping, pages, nr_pages, - nfs_readpage_from_fscache_complete, - ctx, - mapping_gfp_mask(mapping)); - if (*nr_pages < npages) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, - npages); - if (*nr_pages > 0) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, - *nr_pages); - - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - BUG_ON(!list_empty(pages)); - BUG_ON(*nr_pages != 0); - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: submitted\n"); - + "NFS: readpage_from_fscache failed %d\n", ret); + SetPageChecked(page); return ret; - - case -ENOBUFS: /* some pages aren't cached and can't be */ - case -ENODATA: /* some pages aren't cached */ - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); - return 1; - - default: - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: ret %d\n", ret); } - return ret; + /* Read completed synchronously */ + dfprintk(FSCACHE, "NFS: readpage_from_fscache: read successful\n"); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); + SetPageUptodate(page); + return 0; } /* - * Store a newly fetched page in fscache - * - PG_fscache must be set on the page + * Store a newly fetched page in fscache. We can be certain there's no page + * stored in the cache as yet otherwise we would've read it from there. 
*/ -void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; dfprintk(FSCACHE, - "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - nfs_i_fscache(inode), page, page->index, page->flags, sync); + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n", + nfs_i_fscache(inode), page, page->index, page->flags); + + ret = fscache_fallback_write_page(inode, page, true); - ret = fscache_write_page(nfs_i_fscache(inode), page, - inode->i_size, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); if (ret != 0) { - fscache_uncache_page(nfs_i_fscache(inode), page); - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); } else { - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_OK); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); } } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 6754c8607230..25a5c0f82392 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -8,51 +8,16 @@ #ifndef _NFS_FSCACHE_H #define _NFS_FSCACHE_H +#include <linux/swap.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/fscache.h> +#include <linux/iversion.h> #ifdef CONFIG_NFS_FSCACHE /* - * set of NFS FS-Cache objects that form a superblock key - */ -struct nfs_fscache_key { - struct rb_node node; - struct nfs_client *nfs_client; /* the server */ - - /* the elements of the unique key - as used by nfs_compare_super() and - * nfs_compare_mount_options() to distinguish superblocks */ - struct { - struct { - unsigned long s_flags; /* various flags - * (& NFS_MS_MASK) */ - } super; - - struct { - struct nfs_fsid fsid; - int flags; - unsigned int rsize; /* read size */ - unsigned int wsize; /* write size */ - unsigned int acregmin; /* attr cache timeouts */ - unsigned int acregmax; - unsigned int acdirmin; - unsigned int acdirmax; - } nfs_server; - - struct { - rpc_authflavor_t au_flavor; - } rpc_auth; - - /* uniquifier - can be used if nfs_server.flags includes - * NFS_MOUNT_UNSHARED */ - u8 uniq_len; - char uniquifier[0]; - } key; -}; - -/* * Definition of the auxiliary data attached to NFS inode storage objects * within the cache. 
* @@ -70,84 +35,42 @@ struct nfs_fscache_inode_auxdata { }; /* - * fscache-index.c - */ -extern struct fscache_netfs nfs_fscache_netfs; -extern const struct fscache_cookie_def nfs_fscache_server_index_def; -extern const struct fscache_cookie_def nfs_fscache_super_index_def; -extern const struct fscache_cookie_def nfs_fscache_inode_object_def; - -extern int nfs_fscache_register(void); -extern void nfs_fscache_unregister(void); - -/* * fscache.c */ -extern void nfs_fscache_get_client_cookie(struct nfs_client *); -extern void nfs_fscache_release_client_cookie(struct nfs_client *); - -extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); +extern int nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); extern void nfs_fscache_init_inode(struct inode *); extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); +extern void nfs_fscache_release_file(struct inode *, struct file *); -extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); -extern int nfs_fscache_release_page(struct page *, gfp_t); +extern int __nfs_readpage_from_fscache(struct inode *, struct page *); +extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr, + unsigned long bytes); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *); -extern int __nfs_readpage_from_fscache(struct nfs_open_context *, - struct inode *, struct page *); -extern int __nfs_readpages_from_fscache(struct nfs_open_context *, - struct inode *, struct address_space *, - struct list_head *, unsigned *); -extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); - -/* - * wait for a page to complete writing to the cache - */ -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) -{ - if (PageFsCache(page)) - fscache_wait_on_page_write(nfsi->fscache, page); -} - -/* - * release the caching state associated with a page if undergoing complete page - * invalidation - */ -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { - if (PageFsCache(page)) - __nfs_fscache_invalidate_page(page, inode); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + fscache_note_page_release(nfs_i_fscache(page->mapping->host)); + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); + } + return true; } /* * Retrieve a page from an inode data storage object. */ -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, +static inline int nfs_readpage_from_fscache(struct inode *inode, struct page *page) { if (NFS_I(inode)->fscache) - return __nfs_readpage_from_fscache(ctx, inode, page); - return -ENOBUFS; -} - -/* - * Retrieve a set of pages from an inode data storage object. - */ -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - if (NFS_I(inode)->fscache) - return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, - nr_pages); + return __nfs_readpage_from_fscache(inode, page); return -ENOBUFS; } @@ -156,27 +79,38 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, * in the cache. 
*/ static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, - int sync) + struct page *page) { - if (PageFsCache(page)) - __nfs_readpage_to_fscache(inode, page, sync); + if (NFS_I(inode)->fscache) + __nfs_readpage_to_fscache(inode, page); } -/* - * Invalidate the contents of fscache for this inode. This will not sleep. - */ -static inline void nfs_fscache_invalidate(struct inode *inode) +static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, + struct nfs_inode *nfsi) { - fscache_invalidate(NFS_I(inode)->fscache); + memset(auxdata, 0, sizeof(*auxdata)); + auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); } /* - * Wait for an object to finish being invalidated. + * Invalidate the contents of fscache for this inode. This will not sleep. */ -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) { - fscache_wait_on_invalidate(NFS_I(inode)->fscache); + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache) { + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_invalidate(nfsi->fscache, &auxdata, + i_size_read(&nfsi->vfs_inode), flags); + } } /* @@ -190,48 +124,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server) } #else /* CONFIG_NFS_FSCACHE */ -static inline int nfs_fscache_register(void) { return 0; } -static inline void nfs_fscache_unregister(void) {} - -static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} -static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} - static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode(struct inode *inode) {} static inline void nfs_fscache_clear_inode(struct inode *inode) {} static inline void nfs_fscache_open_file(struct inode *inode, struct file *filp) {} +static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {} static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { return 1; /* True: may release page */ } -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) {} - -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, +static inline int nfs_readpage_from_fscache(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, int sync) {} + struct page *page) {} -static inline void nfs_fscache_invalidate(struct inode *inode) {} -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c 
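nfs_fscache_invalidate() now takes a flags argument and snapshots the coherency auxdata at the moment of invalidation. The two call patterns in this series are plain invalidation and the direct-I/O variant seen earlier in fs/nfs/direct.c:

	/* ordinary data invalidation, e.g. from nfs_set_cache_invalid() */
	nfs_fscache_invalidate(inode, 0);

	/* after a direct write bypassed the page cache */
	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);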
index dd53704c3f40..a918c3a834b6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -209,7 +209,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_fscache_invalidate(inode, 0); flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); nfsi->cache_validity |= flags; @@ -219,6 +219,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) NFS_INO_DATA_INVAL_DEFER); else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; + trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); @@ -1288,6 +1289,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map { int ret; + nfs_fscache_invalidate(inode, 0); if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { ret = nfs_sync_mapping(mapping); @@ -1299,7 +1301,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return ret; } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - nfs_fscache_wait_on_invalidate(inode); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, @@ -2373,10 +2374,6 @@ static int __init init_nfs_fs(void) if (err < 0) goto out9; - err = nfs_fscache_register(); - if (err < 0) - goto out8; - err = nfsiod_start(); if (err) goto out7; @@ -2428,8 +2425,6 @@ out5: out6: nfsiod_stop(); out7: - nfs_fscache_unregister(); -out8: unregister_pernet_subsys(&nfs_net_ops); out9: nfs_sysfs_exit(); @@ -2444,7 +2439,6 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); - nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 08355b66e7cb..8b21ff1be717 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -289,7 +289,9 @@ static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) loff_t newsize = pos + len; loff_t end = newsize - 1; - truncate_pagecache_range(inode, pos, end); + WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) i_size_write(inode, newsize); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index c8bad735e4c1..271e5f92ed01 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1434,8 +1434,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, status = decode_clone(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->dst_fattr, res->server); - + decode_getfattr(xdr, res->dst_fattr, res->server); out: res->rpc_status = status; return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ecc4594299d6..d88b779f9dd0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1998,6 +1998,10 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; + case -ENOSPC: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EIO); + return -EIO; case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -2689,6 +2693,6 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); nfs4_state_manager(clp); nfs_put_client(clp); - 
module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 21dac847f1e4..317ce27bdc4b 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -42,7 +42,6 @@ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \ - { BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ @@ -162,6 +161,7 @@ DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); TRACE_EVENT(nfs_access_exit, TP_PROTO( diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d11af2a9299c..eb00229c1a50 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error) struct address_space *mapping = page_file_mapping(page); if (PageUptodate(page)) - nfs_readpage_to_fscache(inode, page, 0); + nfs_readpage_to_fscache(inode, page); else if (!PageError(page) && !PagePrivate(page)) generic_error_remove_page(mapping, page); unlock_page(page); @@ -305,6 +305,12 @@ readpage_async_filler(void *data, struct page *page) aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + if (!IS_SYNC(page->mapping->host)) { + error = nfs_readpage_from_fscache(page->mapping->host, page); + if (error == 0) + goto out_unlock; + } + new = nfs_create_request(desc->ctx, page, 0, aligned_len); if (IS_ERR(new)) goto out_error; @@ -320,6 +326,7 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); +out_unlock: unlock_page(page); out: return error; @@ -366,12 +373,6 @@ int nfs_readpage(struct file *file, struct page *page) desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); xchg(&desc.ctx->error, 0); - if (!IS_SYNC(inode)) { - ret = nfs_readpage_from_fscache(desc.ctx, inode, page); - if (ret == 0) - goto out_wait; - } - nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); @@ -381,7 +382,6 @@ int nfs_readpage(struct file *file, struct page *page) nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? 
desc.pgio.pg_error : 0; -out_wait: if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) @@ -419,14 +419,6 @@ int nfs_readpages(struct file *file, struct address_space *mapping, } else desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); - /* attempt to read as many of the pages as possible from the cache - * - this returns -ENOBUFS immediately if the cookie is negative - */ - ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, - pages, &nr_pages); - if (ret == 0) - goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); @@ -434,7 +426,6 @@ int nfs_readpages(struct file *file, struct address_space *mapping, nfs_pageio_complete_read(&desc.pgio); -read_complete: put_nfs_open_context(desc.ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3aced401735c..6ab5eeb000dc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1204,42 +1204,42 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc) } #ifdef CONFIG_NFS_FSCACHE -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_fs_context *ctx) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { struct nfs_server *nfss = NFS_SB(sb); char *uniq = NULL; int ulen = 0; - nfss->fscache_key = NULL; nfss->fscache = NULL; if (!ctx) - return; + return 0; if (ctx->clone_data.sb) { struct nfs_server *mnt_s = NFS_SB(ctx->clone_data.sb); if (!(mnt_s->options & NFS_OPTION_FSCACHE)) - return; - if (mnt_s->fscache_key) { - uniq = mnt_s->fscache_key->key.uniquifier; - ulen = mnt_s->fscache_key->key.uniq_len; + return 0; + if (mnt_s->fscache_uniq) { + uniq = mnt_s->fscache_uniq; + ulen = strlen(uniq); } } else { if (!(ctx->options & NFS_OPTION_FSCACHE)) - return; + return 0; if (ctx->fscache_uniq) { uniq = ctx->fscache_uniq; ulen = strlen(ctx->fscache_uniq); } } - nfs_fscache_get_super_cookie(sb, uniq, ulen); + return nfs_fscache_get_super_cookie(sb, uniq, ulen); } #else -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_fs_context *ctx) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { + return 0; } #endif @@ -1299,7 +1299,9 @@ int nfs_get_tree_common(struct fs_context *fc) s->s_blocksize_bits = bsize; s->s_blocksize = 1U << bsize; } - nfs_get_cache_cookie(s, ctx); + error = nfs_get_cache_cookie(s, ctx); + if (error < 0) + goto error_splat_super; } error = nfs_get_root(s, fc); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9b7619ce17a7..987a187bd39a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -294,6 +294,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: spin_unlock(&inode->i_lock); + nfs_fscache_invalidate(inode, 0); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ @@ -2125,8 +2126,11 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, if (PagePrivate(page)) return -EBUSY; - if (!nfs_fscache_release_page(page, GFP_KERNEL)) - return -EBUSY; + if (PageFsCache(page)) { + if (mode == MIGRATE_ASYNC) + return -EBUSY; + wait_on_page_fscache(page); + } return migrate_page(mapping, newpage, page, mode); } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 9421dae22737..668c7527b17e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -427,7 +427,7 @@ static int check_export(struct path *path, int *flags, 
unsigned char *uuid) return -EINVAL; } - if (mnt_user_ns(path->mnt) != &init_user_ns) { + if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fdf89fcf1a0c..8bc807c5fea4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -44,12 +44,9 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); struct nfsd_fcache_disposal { - struct list_head list; struct work_struct work; - struct net *net; spinlock_t lock; struct list_head freeme; - struct rcu_head rcu; }; static struct workqueue_struct *nfsd_filecache_wq __read_mostly; @@ -62,8 +59,6 @@ static long nfsd_file_lru_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; -static DEFINE_SPINLOCK(laundrette_lock); -static LIST_HEAD(laundrettes); static void nfsd_file_gc(void); @@ -194,7 +189,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); } nf->nf_mark = NULL; - init_rwsem(&nf->nf_rwsem); trace_nfsd_file_alloc(nf); } return nf; @@ -249,7 +243,7 @@ nfsd_file_do_unhash(struct nfsd_file *nf) trace_nfsd_file_unhash(nf); if (nfsd_file_check_write_error(nf)) - nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); atomic_long_dec(&nfsd_filecache_count); @@ -367,19 +361,13 @@ nfsd_file_list_remove_disposal(struct list_head *dst, static void nfsd_file_list_add_disposal(struct list_head *files, struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net == net) { - spin_lock(&l->lock); - list_splice_tail_init(files, &l->freeme); - spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); - break; - } - } - rcu_read_unlock(); + spin_lock(&l->lock); + list_splice_tail_init(files, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); } static void @@ -755,7 +743,7 @@ nfsd_file_cache_purge(struct net *net) } static struct nfsd_fcache_disposal * -nfsd_alloc_fcache_disposal(struct net *net) +nfsd_alloc_fcache_disposal(void) { struct nfsd_fcache_disposal *l; @@ -763,7 +751,6 @@ nfsd_alloc_fcache_disposal(struct net *net) if (!l) return NULL; INIT_WORK(&l->work, nfsd_file_delayed_close); - l->net = net; spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -772,61 +759,27 @@ nfsd_alloc_fcache_disposal(struct net *net) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - rcu_assign_pointer(l->net, NULL); cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); - kfree_rcu(l, rcu); -} - -static void -nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_add_tail_rcu(&l->list, &laundrettes); - spin_unlock(&laundrette_lock); -} - -static void -nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_del_rcu(&l->list); - spin_unlock(&laundrette_lock); -} - -static int -nfsd_alloc_fcache_disposal_net(struct net *net) -{ - struct nfsd_fcache_disposal *l; - - l = nfsd_alloc_fcache_disposal(net); - if (!l) - return -ENOMEM; - 
nfsd_add_fcache_disposal(l); - return 0; + kfree(l); } static void nfsd_free_fcache_disposal_net(struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net != net) - continue; - nfsd_del_fcache_disposal(l); - rcu_read_unlock(); - nfsd_free_fcache_disposal(l); - return; - } - rcu_read_unlock(); + nfsd_free_fcache_disposal(l); } int nfsd_file_cache_start_net(struct net *net) { - return nfsd_alloc_fcache_disposal_net(net); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->fcache_disposal = nfsd_alloc_fcache_disposal(); + return nn->fcache_disposal ? 0 : -ENOMEM; } void diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 7872df5a0fe3..435ceab27897 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -46,7 +46,6 @@ struct nfsd_file { refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; - struct rw_semaphore nf_rwsem; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 935c1028c217..1b1a962a1804 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,6 +11,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/percpu_counter.h> +#include <linux/siphash.h> /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -108,9 +109,8 @@ struct nfsd_net { bool nfsd_net_up; bool lockd_up; - /* Time of server startup */ - struct timespec64 nfssvc_boot; - seqlock_t boot_lock; + seqlock_t writeverf_lock; + unsigned char writeverf[8]; /* * Max number of connections this nfsd container will allow. Defaults @@ -123,12 +123,13 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; - - wait_queue_head_t ntf_wq; - atomic_t ntf_refcnt; - - /* Allow umount to wait for nfsd state cleanup */ - struct completion nfsd_shutdown_complete; + /* When a listening socket is added to nfsd, keep_active is set + * and this justifies a reference on nfsd_serv. This stops + * nfsd_serv from being freed. When the number of threads is + * set, keep_active is cleared and the reference is dropped. So + * when the last thread exits, the service will be destroyed. 
+ */ + int keep_active; /* * clientid and stateid data for construction of net unique COPY @@ -184,6 +185,10 @@ struct nfsd_net { /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; + + struct nfsd_fcache_disposal *fcache_disposal; + + siphash_key_t siphash_key; }; /* Simple check to find out if a given net was properly initialized */ @@ -193,6 +198,6 @@ extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); -void nfsd_reset_boot_verifier(struct nfsd_net *nn); +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_write_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4418517f6f12..8ef53f6726ec 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -202,15 +202,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, nvecs, &cnt, resp->committed, resp->verf); resp->count = cnt; -out: return rpc_success; } @@ -438,22 +434,19 @@ nfsd3_proc_link(struct svc_rqst *rqstp) static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd3_readdirres *resp, - int count) + u32 count) { struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - count = min_t(u32, count, svc_max_payload(rqstp)); + count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ buf->buflen = count - XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; - while (count > 0) { - rqstp->rq_next_page++; - count -= PAGE_SIZE; - } + rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* This is xdr_init_encode(), but it assumes that * the head kvec has already been consumed. 
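
Aside on the nfsd3_init_dirlist_pages() hunk above: the new rq_next_page arithmetic replaces a count-down loop with a single round-up division when reserving pages for the readdir reply. The idiom is the usual ceiling divide by a power of two; a minimal userspace sketch (the PAGE_SHIFT value here is an assumption for illustration, not taken from kernel headers):

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12                   /* assumed page size: 4 KiB */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Whole pages needed to hold len bytes, rounding up. */
    static unsigned long pages_needed(unsigned long len)
    {
            return (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
    }

    int main(void)
    {
            assert(pages_needed(0) == 0);
            assert(pages_needed(1) == 1);
            assert(pages_needed(PAGE_SIZE) == 1);
            assert(pages_needed(PAGE_SIZE + 1) == 2);
            printf("4097 bytes -> %lu pages\n", pages_needed(4097));
            return 0;
    }
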
*/ @@ -462,7 +455,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, xdr->page_ptr = buf->pages; xdr->iov = NULL; xdr->p = page_address(*buf->pages); - xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); xdr->rqst = NULL; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index c3ac1b6aa3aa..7c45ba4db61b 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -487,71 +487,6 @@ neither: return true; } -static bool fs_supports_change_attribute(struct super_block *sb) -{ - return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; -} - -/* - * Fill in the pre_op attr for the wcc data - */ -void fill_pre_wcc(struct svc_fh *fhp) -{ - struct inode *inode; - struct kstat stat; - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - - if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; - inode = d_inode(fhp->fh_dentry); - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &stat); - - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - } - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; - } - if (v4) - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - - fhp->fh_pre_saved = true; -} - -/* - * Fill in the post_op attr for the wcc data - */ -void fill_post_wcc(struct svc_fh *fhp) -{ - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - struct inode *inode = d_inode(fhp->fh_dentry); - - if (fhp->fh_no_wcc) - return; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice during operation.\n"); - - fhp->fh_post_saved = true; - - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); - - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } - } - if (v4) - fhp->fh_post_change = - nfsd4_change_attribute(&fhp->fh_post_attr, inode); -} - /* * XDR decode functions */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a36261f89bdf..ed1ee25647be 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -598,7 +598,7 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); - nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); + nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 @@ -1101,7 +1101,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, + status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); @@ -1510,11 +1510,14 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) { + struct file *dst = copy->nf_dst->nf_file; + struct file *src = copy->nf_src->nf_file; + errseq_t since; ssize_t bytes_copied = 0; u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; - __be32 status; + int status; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1522,9 +1525,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, - src_pos, copy->nf_dst->nf_file, dst_pos, - bytes_total); + bytes_copied = nfsd_copy_file_range(src, src_pos, dst, 
dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1534,11 +1536,11 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) } while (bytes_total > 0 && !copy->cp_synchronous); /* for a non-zero asynchronous copy do a commit of data */ if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) { - down_write(©->nf_dst->nf_rwsem); - status = vfs_fsync_range(copy->nf_dst->nf_file, - copy->cp_dst_pos, + since = READ_ONCE(dst->f_wb_err); + status = vfs_fsync_range(dst, copy->cp_dst_pos, copy->cp_res.wr_bytes_written, 0); - up_write(©->nf_dst->nf_rwsem); + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); if (!status) copy->committed = true; } @@ -2528,7 +2530,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } - fh_clear_wcc(current_fh); + fh_clear_pre_post_attrs(current_fh); /* If op is non-idempotent */ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 6fedc49726bf..c634483d85d2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -2156,6 +2156,7 @@ static struct notifier_block nfsd4_cld_block = { int register_cld_notifier(void) { + WARN_ON(!nfsd_net_id); return rpc_pipefs_notifier_register(&nfsd4_cld_block); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bfad94c70b84..72900b89cf84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -246,6 +246,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); + WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; @@ -271,6 +272,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); + kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); @@ -280,11 +282,20 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, } static void +free_nbl(struct kref *kref) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + kfree(nbl); +} + +static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); - kfree(nbl); + kref_put(&nbl->nbl_kref, free_nbl); } static void @@ -302,6 +313,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo) struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); + WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); @@ -360,11 +372,13 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * st_{access,deny}_bmap field of the stateid, in order to track not * only what share bits are currently in force, but also what * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. + * to enforce the recommendation in + * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that + * the server return an error if the client attempt to downgrade to a + * combination of share bits not explicable by closing some of its + * previous opens. 
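
To make the share-reservation bookkeeping described in the comment above concrete, here is a toy userspace model of tracking which access modes previous OPENs have used and rejecting a downgrade that is not explicable by them. The constants and helper names are hypothetical stand-ins, not nfsd's:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical share-access constants (values assumed for illustration). */
    #define SHARE_ACCESS_READ  1
    #define SHARE_ACCESS_WRITE 2
    #define SHARE_ACCESS_BOTH  3

    /* Record that an OPEN used a given access mode: one bit per mode. */
    static void record_open(unsigned char *bmap, unsigned int access)
    {
            *bmap |= 1 << access;
    }

    /* OR together every mode any previous OPEN has used. */
    static unsigned int bmap_to_mode(unsigned char bmap)
    {
            unsigned int mode = 0;

            for (unsigned int a = 1; a <= SHARE_ACCESS_BOTH; a++)
                    if (bmap & (1 << a))
                            mode |= a;
            return mode;
    }

    /* A downgrade is explicable only if it needs no bits beyond those used. */
    static bool downgrade_ok(unsigned char bmap, unsigned int want)
    {
            return (want & ~bmap_to_mode(bmap)) == 0;
    }

    int main(void)
    {
            unsigned char bmap = 0;

            record_open(&bmap, SHARE_ACCESS_READ);
            printf("downgrade to READ:  %s\n",
                   downgrade_ok(bmap, SHARE_ACCESS_READ) ? "ok" : "reject");
            printf("downgrade to WRITE: %s\n",
                   downgrade_ok(bmap, SHARE_ACCESS_WRITE) ? "ok" : "reject");
            return 0;
    }
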
* - * XXX: This enforcement is actually incomplete, since we don't keep + * This enforcement is arguably incomplete, since we don't keep * track of access/deny bit combinations; so, e.g., we allow: * * OPEN allow read, deny write @@ -372,6 +386,10 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * DOWNGRADE allow read, deny none * * which we should reject. + * + * But you could also argue that our current code is already overkill, + * since it only exists to return NFS4ERR_INVAL on incorrect client + * behavior. */ static unsigned int bmap_to_share_mode(unsigned long bmap) @@ -1207,6 +1225,11 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) return 0; } +static bool delegation_hashed(struct nfs4_delegation *dp) +{ + return !(list_empty(&dp->dl_perfile)); +} + static bool unhash_delegation_locked(struct nfs4_delegation *dp) { @@ -1214,7 +1237,7 @@ unhash_delegation_locked(struct nfs4_delegation *dp) lockdep_assert_held(&state_lock); - if (list_empty(&dp->dl_perfile)) + if (!delegation_hashed(dp)) return false; dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; @@ -4598,7 +4621,7 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) * queued for a lease break. Don't queue it again. */ spin_lock(&state_lock); - if (dp->dl_time == 0) { + if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } @@ -6035,7 +6058,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - status = check_special_stateids(net, fhp, stateid, flags); + if (cstid) + status = nfserr_bad_stateid; + else + status = check_special_stateids(net, fhp, stateid, + flags); goto done; } @@ -6831,7 +6858,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; - struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6853,7 +6879,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } - sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6905,8 +6930,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6918,8 +6942,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -6940,6 +6963,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + /* + * Most filesystems with their own ->lock operations will block + * the nfsd thread waiting to acquire the lock. 
That leads to + * deadlocks (we don't want every nfsd thread tied up waiting + * for file locks), so don't attempt blocking lock notifications + * on those filesystems: + */ + if (nf->nf_file->f_op->lock) + fl_flags &= ~FL_SLEEP; + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); @@ -6970,6 +7003,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + kref_get(&nbl->nbl_kref); spin_unlock(&nn->blocked_locks_lock); } @@ -6982,6 +7016,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: + kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ @@ -7002,8 +7037,13 @@ out: /* dequeue it if we queued it before */ if (fl_flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); - list_del_init(&nbl->nbl_list); - list_del_init(&nbl->nbl_lru); + if (!list_empty(&nbl->nbl_list) && + !list_empty(&nbl->nbl_lru)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + kref_put(&nbl->nbl_kref, free_nbl); + } + /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b2a1d969a172..899de438e529 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -277,24 +277,10 @@ nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) static __be32 nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) { - u32 i, count; - __be32 *p; - - if (xdr_stream_decode_u32(argp->xdr, &count) < 0) - return nfserr_bad_xdr; - /* request sanity */ - if (count > 1000) - return nfserr_bad_xdr; - p = xdr_inline_decode(argp->xdr, count << 2); - if (!p) - return nfserr_bad_xdr; - i = 0; - while (i < count) - bmval[i++] = be32_to_cpup(p++); - while (i < bmlen) - bmval[i++] = 0; + ssize_t status; - return nfs_ok; + status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen); + return status == -EBADMSG ? 
nfserr_bad_xdr : nfs_ok; } static __be32 @@ -4807,8 +4793,8 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, return nfserr_resource; *p++ = htonl(NFS4_CONTENT_HOLE); - p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + p = xdr_encode_hyper(p, read->rd_offset); + p = xdr_encode_hyper(p, count); *eof = (read->rd_offset + count) >= f_size; *maxcount = min_t(unsigned long, count, *maxcount); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6e0b6f3148dc..a4a69ab6ab28 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -87,7 +87,7 @@ nfsd_hashsize(unsigned int limit) static u32 nfsd_cache_hash(__be32 xid, struct nfsd_net *nn) { - return hash_32(be32_to_cpu(xid), nn->maskbits); + return hash_32((__force u32)xid, nn->maskbits); } static struct svc_cacherep * diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index af8531c3854a..b9f27fbcd768 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -742,13 +742,12 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred return err; err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0) { - nfsd_destroy(net); - return err; - } - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (err >= 0 && + !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return err; } @@ -783,8 +782,10 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return 0; out_close: xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); @@ -793,10 +794,7 @@ out_close: svc_xprt_put(xprt); } out_err: - if (!list_empty(&nn->nfsd_serv->sv_permsocks)) - nn->nfsd_serv->sv_nrthreads--; - else - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -1485,9 +1483,8 @@ static __net_init int nfsd_init_net(struct net *net) nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; - atomic_set(&nn->ntf_refcnt, 0); - init_waitqueue_head(&nn->ntf_wq); - seqlock_init(&nn->boot_lock); + get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); + seqlock_init(&nn->writeverf_lock); return 0; @@ -1521,12 +1518,9 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_cld_notifier(); - if (retval) - return retval; retval = nfsd4_init_slabs(); if (retval) - goto out_unregister_notifier; + return retval; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; @@ -1545,9 +1539,14 @@ static int __init init_nfsd(void) goto out_free_exports; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) + goto out_free_filesystem; + retval = register_cld_notifier(); + if (retval) goto out_free_all; return 0; out_free_all: + unregister_pernet_subsys(&nfsd_net_ops); +out_free_filesystem: unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); @@ -1561,13 +1560,12 @@ out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_notifier: - unregister_cld_notifier(); return retval; } static void __exit exit_nfsd(void) { + unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); 
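
Returning to the nfsd4_decode_bitmap4() hunk above, which now delegates to xdr_stream_decode_uint32_array(): the semantics it relies on are roughly "read a counted array of big-endian words, copy what fits, zero-fill the rest, fail on a short buffer". A rough userspace model of that contract (not the sunrpc implementation):

    #include <arpa/inet.h>   /* ntohl */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Decode a counted array of 32-bit big-endian words: leading count,
     * then the words. Extra words are ignored, missing ones zero-filled;
     * returns -1 on a short buffer. */
    static int decode_u32_array(const uint8_t *buf, size_t buflen,
                                uint32_t *out, size_t outlen)
    {
            uint32_t count, word;
            size_t i;

            if (buflen < 4)
                    return -1;
            memcpy(&word, buf, 4);
            count = ntohl(word);
            if ((size_t)count > (buflen - 4) / 4)
                    return -1;      /* short buffer */
            for (i = 0; i < outlen; i++) {
                    if (i < count) {
                            memcpy(&word, buf + 4 + i * 4, 4);
                            out[i] = ntohl(word);
                    } else {
                            out[i] = 0;
                    }
            }
            return 0;
    }

    int main(void)
    {
            const uint8_t wire[] = { 0,0,0,2,  0,0,0,5,  0,0,0,9 }; /* count=2: 5, 9 */
            uint32_t bmval[3];

            if (decode_u32_array(wire, sizeof(wire), bmval, 3) == 0)
                    printf("bmval = { %u, %u, %u }\n", bmval[0], bmval[1], bmval[2]);
            return 0;
    }
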
remove_proc_entry("fs/nfs/exports", NULL); @@ -1577,7 +1575,6 @@ static void __exit exit_nfsd(void) nfsd4_free_slabs(); nfsd4_exit_pnfs(); unregister_filesystem(&nfsd_fs_type); - unregister_cld_notifier(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 498e5a489826..3e5008b475ff 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -97,7 +97,7 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_destroy(struct net *net); +void nfsd_put(struct net *net); bool i_am_nfsd(void); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f3779fa72c89..145208bcb9bd 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -611,6 +611,70 @@ out_negative: return nfserr_serverfault; } +#ifdef CONFIG_NFSD_V3 + +/** + * fh_fill_pre_attrs - Fill in pre-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_pre_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode; + struct kstat stat; + __be32 err; + + if (fhp->fh_no_wcc || fhp->fh_pre_saved) + return; + + inode = d_inode(fhp->fh_dentry); + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; + fhp->fh_pre_saved = true; +} + +/** + * fh_fill_post_attrs - Fill in post-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_post_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode = d_inode(fhp->fh_dentry); + __be32 err; + + if (fhp->fh_no_wcc) + return; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } else + fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr, inode); +} + +#endif /* CONFIG_NFSD_V3 */ + /* * Release a file handle. */ @@ -623,7 +687,7 @@ fh_put(struct svc_fh *fhp) fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); - fh_clear_wcc(fhp); + fh_clear_pre_post_attrs(fhp); } fh_drop_write(fhp); if (exp) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index d11e4b6870d6..434930d8a946 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -284,12 +284,13 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) #endif #ifdef CONFIG_NFSD_V3 -/* - * The wcc data stored in current_fh should be cleared - * between compound ops. 
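
The fill_pre_wcc()/fill_post_wcc() pair removed from nfs3xdr.c earlier in this patch reappears below as fh_fill_pre_attrs()/fh_fill_post_attrs() in nfsfh.c. The underlying idea, NFSv3 weak cache consistency, is to snapshot attributes before and after a server-side change so the client can tell whether only this operation touched the file. A toy model with illustrative types (not the svc_fh layout):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    struct attrs {
            time_t mtime;
            long long size;
    };

    struct wcc {
            bool pre_saved, post_saved;
            struct attrs pre, post;
    };

    static void fill_pre(struct wcc *w, const struct attrs *cur)
    {
            if (w->pre_saved)
                    return;           /* only the first capture counts */
            w->pre = *cur;
            w->pre_saved = true;
    }

    static void fill_post(struct wcc *w, const struct attrs *cur)
    {
            w->post = *cur;
            w->post_saved = true;
    }

    int main(void)
    {
            struct attrs file = { .mtime = 1000, .size = 4096 };
            struct wcc w = { 0 };

            fill_pre(&w, &file);
            file.mtime = 1001;        /* the operation mutates the file */
            file.size = 8192;
            fill_post(&w, &file);
            printf("pre size %lld -> post size %lld\n", w.pre.size, w.post.size);
            return 0;
    }
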
+ +/** + * fh_clear_pre_post_attrs - Reset pre/post attributes + * @fhp: file handle to be updated + * */ -static inline void -fh_clear_wcc(struct svc_fh *fhp) +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) { fhp->fh_post_saved = false; fhp->fh_pre_saved = false; @@ -323,13 +324,24 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat, return time_to_chattr(&stat->ctime); } -extern void fill_pre_wcc(struct svc_fh *fhp); -extern void fill_post_wcc(struct svc_fh *fhp); -#else -#define fh_clear_wcc(ignored) -#define fill_pre_wcc(ignored) -#define fill_post_wcc(notused) -#endif /* CONFIG_NFSD_V3 */ +extern void fh_fill_pre_attrs(struct svc_fh *fhp); +extern void fh_fill_post_attrs(struct svc_fh *fhp); + +#else /* !CONFIG_NFSD_V3 */ + +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_pre_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_post_attrs(struct svc_fh *fhp) +{ +} + +#endif /* !CONFIG_NFSD_V3 */ /* @@ -355,7 +367,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) inode = d_inode(dentry); inode_lock_nested(inode, subclass); - fill_pre_wcc(fhp); + fh_fill_pre_attrs(fhp); fhp->fh_locked = true; } @@ -372,7 +384,7 @@ static inline void fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { - fill_post_wcc(fhp); + fh_fill_post_attrs(fhp); inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index eea5b59b6a6c..18b8eb43a19b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -235,10 +235,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) argp->len, argp->offset); nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, nvecs, @@ -247,7 +243,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) return rpc_drop_reply; -out: return rpc_success; } @@ -556,17 +551,17 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd_readdirres *resp, - int count) + u32 count) { struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - count = min_t(u32, count, PAGE_SIZE); + count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ - buf->buflen = count - sizeof(__be32) * 2; + buf->buflen = count - XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; @@ -577,7 +572,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, xdr->page_ptr = buf->pages; xdr->iov = NULL; xdr->p = page_address(*buf->pages); - xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); xdr->rqst = NULL; } @@ -850,6 +845,7 @@ nfserrno (int errno) { nfserr_io, -EIO }, { nfserr_nxio, -ENXIO }, { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, @@ -878,6 +874,8 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, { nfserr_no_grace, -ENOGRACE}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80431921e5d7..b8c682b62d29 100644 --- 
a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> +#include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> @@ -55,18 +56,17 @@ static __be32 nfsd_init_request(struct svc_rqst *, struct svc_process_info *); /* - * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members - * of the svc_serv struct. In particular, ->sv_nrthreads but also to some - * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members + * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * If (outside the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number - * of nfsd threads must exist and each must listed in ->sp_all_threads in each - * entry of ->sv_pools[]. + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless + * nn->keep_active is set). That number of nfsd threads must + * exist and each must be listed in ->sp_all_threads in some entry of + * ->sv_pools[]. * - * Transitions of the thread count between zero and non-zero are of particular - * interest since the svc_serv needs to be created and initialized at that - * point, or freed. + * Each active thread holds a counted reference on nn->nfsd_serv, as does + * the nn->keep_active flag and various transient calls to svc_get(). * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in @@ -345,33 +345,57 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +/** + * nfsd_copy_write_verifier - Atomically copy a write verifier + * @verf: buffer in which to receive the verifier cookie + * @nn: NFS net namespace + * + * This function provides a wait-free mechanism for copying the + * namespace's write verifier without tearing it. + */ +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { int seq = 0; do { - read_seqbegin_or_lock(&nn->boot_lock, &seq); - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - } while (need_seqretry(&nn->boot_lock, seq)); - done_seqretry(&nn->boot_lock, seq); + read_seqbegin_or_lock(&nn->writeverf_lock, &seq); + memcpy(verf, nn->writeverf, sizeof(*verf)); + } while (need_seqretry(&nn->writeverf_lock, seq)); + done_seqretry(&nn->writeverf_lock, seq); } -static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { - ktime_get_real_ts64(&nn->nfssvc_boot); + struct timespec64 now; + u64 verf; + + /* + * Because the time value is hashed, y2038 time_t overflow + * is irrelevant in this usage. + */ + ktime_get_raw_ts64(&now); + verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); + memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } -void nfsd_reset_boot_verifier(struct nfsd_net *nn) +/** + * nfsd_reset_write_verifier - Generate a new write verifier + * @nn: NFS net namespace + * + * This function updates the ->writeverf field of @nn. 
This field + * contains an opaque cookie that, according to Section 18.32.3 of + * RFC 8881, "the client can use to determine whether a server has + * changed instance state (e.g., server restart) between a call to + * WRITE and a subsequent call to either WRITE or COMMIT. This + * cookie MUST be unchanged during a single instance of the NFSv4.1 + * server and MUST be unique between instances of the NFSv4.1 + * server." + */ +void nfsd_reset_write_verifier(struct nfsd_net *nn) { - write_seqlock(&nn->boot_lock); - nfsd_reset_boot_verifier_locked(nn); - write_sequnlock(&nn->boot_lock); + write_seqlock(&nn->writeverf_lock); + nfsd_reset_write_verifier_locked(nn); + write_sequnlock(&nn->writeverf_lock); } static int nfsd_startup_net(struct net *net, const struct cred *cred) @@ -435,6 +459,7 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -444,18 +469,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; @@ -475,10 +499,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; @@ -487,8 +511,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); + out: return NOTIFY_DONE; } @@ -505,7 +529,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -513,7 +536,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -594,20 +616,9 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, .svo_module = THIS_MODULE, }; -static void nfsd_complete_shutdown(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - WARN_ON(!mutex_is_locked(&nfsd_mutex)); - - nn->nfsd_serv = NULL; - complete(&nn->nfsd_shutdown_complete); -} - void nfsd_shutdown_threads(struct 
net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -622,11 +633,9 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ - serv->sv_ops->svo_setup(serv, NULL, 0); - nfsd_destroy(net); + svc_set_num_threads(serv, NULL, 0); + nfsd_put(net); mutex_unlock(&nfsd_mutex); - /* Wait for shutdown of nfsd_serv to complete */ - wait_for_completion(&nn->nfsd_shutdown_complete); } bool i_am_nfsd(void) @@ -638,6 +647,7 @@ int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) { @@ -647,19 +657,23 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - &nfsd_thread_sv_ops); - if (nn->nfsd_serv == NULL) + serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + &nfsd_thread_sv_ops); + if (serv == NULL) return -ENOMEM; - init_completion(&nn->nfsd_shutdown_complete); - nn->nfsd_serv->sv_maxconn = nn->max_connections; - error = svc_bind(nn->nfsd_serv, net); + serv->sv_maxconn = nn->max_connections; + error = svc_bind(serv, net); if (error < 0) { - svc_destroy(nn->nfsd_serv); - nfsd_complete_shutdown(net); + /* NOT nfsd_put() as notifiers (see below) haven't + * been set up yet. + */ + svc_put(serv); return error; } + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = serv; + spin_unlock(&nfsd_notifier_lock); set_max_drc(); /* check if the notifier is already set */ @@ -669,8 +683,7 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - atomic_inc(&nn->ntf_refcnt); - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); return 0; } @@ -697,16 +710,26 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -void nfsd_destroy(struct net *net) +/* This is the callback for kref_put() below. + * There is no code here as the first thing to be done is + * call svc_shutdown_net(), but we cannot get the 'net' from + * the kref. So do all the work when kref_put returns true. 
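
The kref_put()-returns-true pattern that the comment above describes lets the final caller run teardown needing context the refcount itself cannot carry. A minimal userspace analogue using a plain atomic counter (not the kernel kref API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct service {
            atomic_int refs;
    };

    static void service_get(struct service *s)
    {
            atomic_fetch_add(&s->refs, 1);
    }

    /* Returns true when the caller dropped the final reference. */
    static bool service_put(struct service *s)
    {
            return atomic_fetch_sub(&s->refs, 1) == 1;
    }

    int main(void)
    {
            struct service s = { .refs = 1 };

            service_get(&s);
            if (service_put(&s))
                    printf("unexpected teardown\n");
            if (service_put(&s))
                    printf("last ref dropped: shut down and free here\n");
            return 0;
    }
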
+ */ +static void nfsd_noop(struct kref *ref) +{ +} + +void nfsd_put(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int destroy = (nn->nfsd_serv->sv_nrthreads == 1); - if (destroy) + if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { svc_shutdown_net(nn->nfsd_serv, net); - svc_destroy(nn->nfsd_serv); - if (destroy) - nfsd_complete_shutdown(net); + svc_destroy(&nn->nfsd_serv->sv_refcnt); + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); + } } int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) @@ -733,7 +756,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { - int new = nthreads[i] * NFSD_MAXSERVS / tot; + int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } @@ -753,12 +776,13 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* apply the new numbers */ svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - &nn->nfsd_serv->sv_pools[i], nthreads[i]); + err = svc_set_num_threads(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], + nthreads[i]); if (err) break; } - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -795,21 +819,19 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_startup_net(net, cred); if (error) - goto out_destroy; - error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - NULL, nrservs); + goto out_put; + error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); if (error) goto out_shutdown; - /* We are holding a reference to nn->nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nn->nfsd_serv->sv_nrthreads - 1; + error = nn->nfsd_serv->sv_nrthreads; out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); -out_destroy: - nfsd_destroy(net); /* Release server */ +out_put: + /* Threads now hold service active */ + if (xchg(&nn->keep_active, 0)) + nfsd_put(net); + nfsd_put(net); out: mutex_unlock(&nfsd_mutex); return error; @@ -923,9 +945,6 @@ nfsd(void *vrqstp) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int err; - /* Lock module and set up kernel thread */ - mutex_lock(&nfsd_mutex); - /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ @@ -945,8 +964,7 @@ nfsd(void *vrqstp) allow_signal(SIGINT); allow_signal(SIGQUIT); - nfsdstats.th_cnt++; - mutex_unlock(&nfsd_mutex); + atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -973,20 +991,36 @@ nfsd(void *vrqstp) /* Clear signals before calling svc_exit_thread() */ flush_signals(current); - mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; + atomic_dec(&nfsdstats.th_cnt); out: - rqstp->rq_server = NULL; + /* Take an extra ref so that the svc_put in svc_exit_thread() + * doesn't call svc_destroy() + */ + svc_get(nn->nfsd_serv); /* Release the thread */ svc_exit_thread(rqstp); - nfsd_destroy(net); + /* We need to drop a ref, but may not drop the last reference + * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that + * could deadlock with nfsd_shutdown_threads() waiting for us. 
+ * So three options are: + * - drop a non-final reference, + * - get the mutex without waiting + * - sleep briefly and try the above again + */ + while (!svc_put_not_last(nn->nfsd_serv)) { + if (mutex_trylock(&nfsd_mutex)) { + nfsd_put(net); + mutex_unlock(&nfsd_mutex); + break; + } + msleep(20); + } /* Release module */ - mutex_unlock(&nfsd_mutex); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -1096,7 +1130,6 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) mutex_unlock(&nfsd_mutex); return -ENODEV; } - /* bump up the psudo refcount while traversing */ svc_get(nn->nfsd_serv); ret = svc_pool_stats_open(nn->nfsd_serv, file); mutex_unlock(&nfsd_mutex); @@ -1109,8 +1142,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file) struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - /* this function really, really should have been called svc_put() */ - nfsd_destroy(net); + nfsd_put(net); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e73bdbb1634a..95457cfd37fc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -568,6 +568,10 @@ struct nfs4_ol_stateid { struct list_head st_locks; struct nfs4_stateowner *st_stateowner; struct nfs4_clnt_odstate *st_clnt_odstate; +/* + * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the + * comment above bmap_to_share_mode() for explanation: + */ unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; @@ -629,6 +633,7 @@ struct nfsd4_blocked_lock { struct file_lock nbl_lock; struct knfsd_fh nbl_fh; struct nfsd4_callback nbl_cb; + struct kref nbl_kref; }; struct nfsd4_compound_state; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 1d3b881e7382..a8c5a02a84f0 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -45,7 +45,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 51ecda852e23..9b43dc3d9991 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -29,11 +29,9 @@ enum { struct nfsd_stats { struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - /* Protected by nfsd_mutex */ - unsigned int th_cnt; /* number of available threads */ + atomic_t th_cnt; /* number of available threads */ }; - extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index f1e0d3c51bc2..c4cf56327843 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -47,7 +47,7 @@ rqstp->rq_xprt->xpt_remotelen); \ } while (0); -TRACE_EVENT(nfsd_garbage_args_err, +DECLARE_EVENT_CLASS(nfsd_xdr_err_class, TP_PROTO( const struct svc_rqst *rqstp ), @@ -69,27 +69,13 @@ TRACE_EVENT(nfsd_garbage_args_err, ) ); -TRACE_EVENT(nfsd_cant_encode_err, - TP_PROTO( - const struct svc_rqst *rqstp - ), - TP_ARGS(rqstp), - TP_STRUCT__entry( - NFSD_TRACE_PROC_ARG_FIELDS - - __field(u32, vers) - __field(u32, proc) - ), - TP_fast_assign( - NFSD_TRACE_PROC_ARG_ASSIGNMENTS +#define DEFINE_NFSD_XDR_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \ + TP_PROTO(const struct svc_rqst *rqstp), \ + TP_ARGS(rqstp)) - __entry->vers = rqstp->rq_vers; - __entry->proc = rqstp->rq_proc; - ), - TP_printk("xid=0x%08x vers=%u proc=%u", - 
__entry->xid, __entry->vers, __entry->proc - ) -); +DEFINE_NFSD_XDR_ERR_EVENT(garbage_args); +DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); #define show_nfsd_may_flags(x) \ __print_flags(x, "|", \ @@ -413,6 +399,56 @@ TRACE_EVENT(nfsd_dirent, ) ) +DECLARE_EVENT_CLASS(nfsd_copy_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *src_fhp, + loff_t src_offset, + struct svc_fh *dst_fhp, + loff_t dst_offset, + u64 count, + int status), + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, src_fh_hash) + __field(loff_t, src_offset) + __field(u32, dst_fh_hash) + __field(loff_t, dst_offset) + __field(u64, count) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle); + __entry->src_offset = src_offset; + __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle); + __entry->dst_offset = dst_offset; + __entry->count = count; + __entry->status = status; + ), + TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld " + "dst_fh_hash=0x%08x dst_offset=%lld " + "count=%llu status=%d", + __entry->xid, __entry->src_fh_hash, __entry->src_offset, + __entry->dst_fh_hash, __entry->dst_offset, + (unsigned long long)__entry->count, + __entry->status) +) + +#define DEFINE_NFSD_COPY_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *src_fhp, \ + loff_t src_offset, \ + struct svc_fh *dst_fhp, \ + loff_t dst_offset, \ + u64 count, \ + int status), \ + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \ + count, status)) + +DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); + #include "state.h" #include "filecache.h" #include "vfs.h" @@ -538,6 +574,34 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ DEFINE_NET_EVENT(grace_start); DEFINE_NET_EVENT(grace_complete); +TRACE_EVENT(nfsd_writeverf_reset, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int error + ), + TP_ARGS(nn, rqstp, error), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(int, error) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->error = error; + + /* avoid seqlock inside TP_fast_assign */ + memcpy(__entry->verifier, nn->writeverf, + NFS4_VERIFIER_SIZE); + ), + TP_printk("boot_time=%16llx xid=0x%08x error=%d new verifier=0x%s", + __entry->boot_time, __entry->xid, __entry->error, + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE) + ) +); + TRACE_EVENT(nfsd_clid_cred_mismatch, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c99857689e2c..99c2b9dfbb10 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -40,6 +40,7 @@ #include "../internal.h" #include "acl.h" #include "idmap.h" +#include "xdr4.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -517,15 +518,23 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, - struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync) +static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp) +{ + return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate; +} + +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool 
sync) { struct file *src = nf_src->nf_file; struct file *dst = nf_dst->nf_file; + errseq_t since; loff_t cloned; __be32 ret = 0; - down_write(&nf_dst->nf_rwsem); + since = READ_ONCE(dst->f_wb_err); cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); if (cloned < 0) { ret = nfserrno(cloned); @@ -540,15 +549,25 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); + if (!status) status = commit_inode_metadata(file_inode(src)); if (status < 0) { - nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net, - nfsd_net_id)); + struct nfsd_net *nn = net_generic(nf_dst->nf_net, + nfsd_net_id); + + trace_nfsd_clone_file_range_err(rqstp, + &nfsd4_get_cstate(rqstp)->save_fh, + src_pos, + &nfsd4_get_cstate(rqstp)->current_fh, + dst_pos, + count, status); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, status); ret = nfserrno(status); } } out_err: - up_write(&nf_dst->nf_rwsem); return ret; } @@ -777,6 +796,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { __be32 err; + bool retried = false; validate_process_creds(); /* @@ -792,9 +812,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, */ if (type == S_IFREG) may_flags |= NFSD_MAY_OWNER_OVERRIDE; +retry: err = fh_verify(rqstp, fhp, type, may_flags); - if (!err) + if (!err) { err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + if (err == nfserr_stale && !retried) { + retried = true; + fh_put(fhp); + goto retry; + } + } validate_process_creds(); return err; } @@ -944,10 +971,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned long *cnt, int stable, __be32 *verf) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; + errseq_t since; __be32 nfserr; int host_err; int use_wgather; @@ -985,36 +1014,28 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, flags |= RWF_SYNC; iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); - if (flags & RWF_SYNC) { - down_write(&nf->nf_rwsem); - host_err = vfs_iter_write(file, &iter, &pos, flags); - if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); - up_write(&nf->nf_rwsem); - } else { - down_read(&nf->nf_rwsem); - if (verf) - nfsd_copy_boot_verifier(verf, - net_generic(SVC_NET(rqstp), - nfsd_net_id)); - host_err = vfs_iter_write(file, &iter, &pos, flags); - up_read(&nf->nf_rwsem); - } + since = READ_ONCE(file->f_wb_err); + if (verf) + nfsd_copy_write_verifier(verf, nn); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); goto out_nfserr; } *cnt = host_err; nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); + host_err = filemap_check_wb_err(file->f_mapping, since); + if (host_err < 0) + goto out_nfserr; if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); - if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + if (host_err < 0) { + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); + } } out_nfserr: @@ -1089,19 +1110,6 @@ out: } #ifdef 
CONFIG_NFSD_V3 -static int -nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset, - loff_t end) -{ - struct address_space *mapping = nf->nf_file->f_mapping; - int ret = filemap_fdatawrite_range(mapping, offset, end); - - if (ret) - return ret; - filemap_fdatawait_range_keep_errors(mapping, offset, end); - return 0; -} - /* * Commit all pending writes to stable storage. * @@ -1115,6 +1123,7 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count, __be32 *verf) { + struct nfsd_net *nn; struct nfsd_file *nf; loff_t end = LLONG_MAX; __be32 err = nfserr_inval; @@ -1131,29 +1140,28 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; + nn = net_generic(nf->nf_net, nfsd_net_id); if (EX_ISSYNC(fhp->fh_export)) { - int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end); + errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); + int err2; - down_write(&nf->nf_rwsem); - if (!err2) - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_write_verifier(verf, nn); + err2 = filemap_check_wb_err(nf->nf_file->f_mapping, + since); break; case -EINVAL: err = nfserr_notsupp; break; default: - err = nfserrno(err2); - nfsd_reset_boot_verifier(net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err2); } - up_write(&nf->nf_rwsem); + err = nfserrno(err2); } else - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_write_verifier(verf, nn); nfsd_file_put(nf); out: @@ -1747,8 +1755,8 @@ retry: * so do it by hand */ trap = lock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = true; - fill_pre_wcc(ffhp); - fill_pre_wcc(tfhp); + fh_fill_pre_attrs(ffhp); + fh_fill_pre_attrs(tfhp); odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1808,8 +1816,8 @@ retry: * were the same, so again we do it by hand. 
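/*
 * The nfsd hunks above all replace nf_rwsem serialization with the same
 * errseq_t idiom: sample the file's f_wb_err before starting I/O, then
 * compare afterwards, so a writeback error raised anywhere in that
 * window is observed without holding an exclusive lock. A minimal
 * sketch of the pattern (the helper name is hypothetical):
 */
#include <linux/fs.h>
#include <linux/pagemap.h>

static int example_fsync_and_check(struct file *file, loff_t start, loff_t end)
{
	errseq_t since = READ_ONCE(file->f_wb_err);	/* sample first */
	int err = vfs_fsync_range(file, start, end, 0);

	if (!err)	/* check: catches errors raised since the sample */
		err = filemap_check_wb_err(file->f_mapping, since);
	return err;	/* nonzero is what forces a write-verifier reset */
}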
*/ if (!close_cached) { - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + fh_fill_post_attrs(ffhp); + fh_fill_post_attrs(tfhp); } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index b21b76e6b9a8..9f56dcb22ff7 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -57,7 +57,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 81f35c5b5a40..379d22e28ed6 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -57,7 +57,7 @@ static void nilfs_##name##_attr_release(struct kobject *kobj) \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static struct kobj_type nilfs_##name##_ktype = { \ - .default_attrs = nilfs_##name##_attrs, \ + .default_groups = nilfs_##name##_groups, \ .sysfs_ops = &nilfs_##name##_attr_ops, \ .release = nilfs_##name##_attr_release, \ } @@ -129,6 +129,7 @@ static struct attribute *nilfs_snapshot_attrs[] = { NILFS_SNAPSHOT_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_snapshot); static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -166,7 +167,7 @@ static const struct sysfs_ops nilfs_snapshot_attr_ops = { }; static struct kobj_type nilfs_snapshot_ktype = { - .default_attrs = nilfs_snapshot_attrs, + .default_groups = nilfs_snapshot_groups, .sysfs_ops = &nilfs_snapshot_attr_ops, .release = nilfs_snapshot_attr_release, }; @@ -226,6 +227,7 @@ static struct attribute *nilfs_mounted_snapshots_attrs[] = { NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_mounted_snapshots); NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); @@ -339,6 +341,7 @@ static struct attribute *nilfs_checkpoints_attrs[] = { NILFS_CHECKPOINTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_checkpoints); NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); @@ -428,6 +431,7 @@ static struct attribute *nilfs_segments_attrs[] = { NILFS_SEGMENTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_segments); NILFS_DEV_INT_GROUP_OPS(segments, dev); NILFS_DEV_INT_GROUP_TYPE(segments, dev); @@ -689,6 +693,7 @@ static struct attribute *nilfs_segctor_attrs[] = { NILFS_SEGCTOR_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_segctor); NILFS_DEV_INT_GROUP_OPS(segctor, dev); NILFS_DEV_INT_GROUP_TYPE(segctor, dev); @@ -816,6 +821,7 @@ static struct attribute *nilfs_superblock_attrs[] = { NILFS_SUPERBLOCK_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_superblock); NILFS_DEV_INT_GROUP_OPS(superblock, dev); NILFS_DEV_INT_GROUP_TYPE(superblock, dev); @@ -924,6 +930,7 @@ static struct attribute *nilfs_dev_attrs[] = { NILFS_DEV_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_dev); static ssize_t nilfs_dev_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -961,7 +968,7 @@ static const struct sysfs_ops nilfs_dev_attr_ops = { }; static struct kobj_type nilfs_dev_ktype = { - .default_attrs = nilfs_dev_attrs, + .default_groups = nilfs_dev_groups, .sysfs_ops = &nilfs_dev_attr_ops, .release = nilfs_dev_attr_release, }; diff 
--git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e85e13c50d6d..d5ebebb034ff 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -196,7 +196,7 @@ static __u32 convert_arg(unsigned long arg) if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) - new_mask |= FS_DN_RENAME; + new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b6091775aa6e..985e995d2a39 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -76,8 +76,10 @@ static bool fanotify_info_equal(struct fanotify_info *info1, struct fanotify_info *info2) { if (info1->dir_fh_totlen != info2->dir_fh_totlen || + info1->dir2_fh_totlen != info2->dir2_fh_totlen || info1->file_fh_totlen != info2->file_fh_totlen || - info1->name_len != info2->name_len) + info1->name_len != info2->name_len || + info1->name2_len != info2->name2_len) return false; if (info1->dir_fh_totlen && @@ -85,14 +87,24 @@ static bool fanotify_info_equal(struct fanotify_info *info1, fanotify_info_dir_fh(info2))) return false; + if (info1->dir2_fh_totlen && + !fanotify_fh_equal(fanotify_info_dir2_fh(info1), + fanotify_info_dir2_fh(info2))) + return false; + if (info1->file_fh_totlen && !fanotify_fh_equal(fanotify_info_file_fh(info1), fanotify_info_file_fh(info2))) return false; - return !info1->name_len || - !memcmp(fanotify_info_name(info1), fanotify_info_name(info2), - info1->name_len); + if (info1->name_len && + memcmp(fanotify_info_name(info1), fanotify_info_name(info2), + info1->name_len)) + return false; + + return !info1->name2_len || + !memcmp(fanotify_info_name2(info1), fanotify_info_name2(info2), + info1->name2_len); } static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, @@ -141,6 +153,13 @@ static bool fanotify_should_merge(struct fanotify_event *old, if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR)) return false; + /* + * FAN_RENAME event is reported with special info record types, + * so we cannot merge it with other events. + */ + if ((old->mask & FAN_RENAME) != (new->mask & FAN_RENAME)) + return false; + switch (old->type) { case FANOTIFY_EVENT_TYPE_PATH: return fanotify_path_equal(fanotify_event_path(old), @@ -272,8 +291,9 @@ out: */ static u32 fanotify_group_event_mask(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info, - u32 event_mask, const void *data, - int data_type, struct inode *dir) + u32 *match_mask, u32 event_mask, + const void *data, int data_type, + struct inode *dir) { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | @@ -299,7 +319,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -318,11 +338,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, * If the event is on a child and this mark is on a parent not * watching children, don't send it! 
*/ - if (type == FSNOTIFY_OBJ_TYPE_PARENT && + if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD)) continue; marks_mask |= mark->mask; + + /* Record the mark types of this group that matched the event */ + *match_mask |= 1U << type; } test_mask = event_mask & marks_mask & ~marks_ignored_mask; @@ -411,7 +434,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, * be zero in that case if encoding fh len failed. */ err = -ENOENT; - if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4)) + if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4) || fh_len > MAX_HANDLE_SZ) goto out_err; /* No external buffer in a variable size allocated fh */ @@ -458,17 +481,41 @@ out_err: } /* - * The inode to use as identifier when reporting fid depends on the event. - * Report the modified directory inode on dirent modification events. - * Report the "victim" inode otherwise. + * FAN_REPORT_FID is ambiguous in that it reports the fid of the child for + * some events and the fid of the parent for create/delete/move events. + * + * With the FAN_REPORT_TARGET_FID flag, the fid of the child is reported + * also in create/delete/move events in addition to the fid of the parent + * and the name of the child. + */ +static inline bool fanotify_report_child_fid(unsigned int fid_mode, u32 mask) +{ + if (mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return (fid_mode & FAN_REPORT_TARGET_FID); + + return (fid_mode & FAN_REPORT_FID) && !(mask & FAN_ONDIR); +} + +/* + * The inode to use as identifier when reporting fid depends on the event + * and the group flags. + * + * With the group flag FAN_REPORT_TARGET_FID, always report the child fid. + * + * Without the group flag FAN_REPORT_TARGET_FID, report the modified directory + * fid on dirent events and the child fid otherwise. + * * For example: - * FS_ATTRIB reports the child inode even if reported on a watched parent. - * FS_CREATE reports the modified dir inode and not the created inode. + * FS_ATTRIB reports the child fid even if reported on a watched parent. + * FS_CREATE reports the modified dir fid without FAN_REPORT_TARGET_FID. + * and reports the created child fid with FAN_REPORT_TARGET_FID. */ static struct inode *fanotify_fid_inode(u32 event_mask, const void *data, - int data_type, struct inode *dir) + int data_type, struct inode *dir, + unsigned int fid_mode) { - if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + if ((event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) && + !(fid_mode & FAN_REPORT_TARGET_FID)) return dir; return fsnotify_data_inode(data, data_type); @@ -552,25 +599,34 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, return &ffe->fae; } -static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, +static struct fanotify_event *fanotify_alloc_name_event(struct inode *dir, __kernel_fsid_t *fsid, const struct qstr *name, struct inode *child, + struct dentry *moved, unsigned int *hash, gfp_t gfp) { struct fanotify_name_event *fne; struct fanotify_info *info; struct fanotify_fh *dfh, *ffh; - unsigned int dir_fh_len = fanotify_encode_fh_len(id); + struct inode *dir2 = moved ? d_inode(moved->d_parent) : NULL; + const struct qstr *name2 = moved ? &moved->d_name : NULL; + unsigned int dir_fh_len = fanotify_encode_fh_len(dir); + unsigned int dir2_fh_len = fanotify_encode_fh_len(dir2); unsigned int child_fh_len = fanotify_encode_fh_len(child); - unsigned int size; - - size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len; + unsigned long name_len = name ? name->len : 0; + unsigned long name2_len = name2 ? 
name2->len : 0; + unsigned int len, size; + + /* Reserve terminating null byte even for empty name */ + size = sizeof(*fne) + name_len + name2_len + 2; + if (dir_fh_len) + size += FANOTIFY_FH_HDR_LEN + dir_fh_len; + if (dir2_fh_len) + size += FANOTIFY_FH_HDR_LEN + dir2_fh_len; if (child_fh_len) size += FANOTIFY_FH_HDR_LEN + child_fh_len; - if (name) - size += name->len + 1; fne = kmalloc(size, gfp); if (!fne) return NULL; @@ -580,24 +636,41 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, *hash ^= fanotify_hash_fsid(fsid); info = &fne->info; fanotify_info_init(info); - dfh = fanotify_info_dir_fh(info); - info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0); + if (dir_fh_len) { + dfh = fanotify_info_dir_fh(info); + len = fanotify_encode_fh(dfh, dir, dir_fh_len, hash, 0); + fanotify_info_set_dir_fh(info, len); + } + if (dir2_fh_len) { + dfh = fanotify_info_dir2_fh(info); + len = fanotify_encode_fh(dfh, dir2, dir2_fh_len, hash, 0); + fanotify_info_set_dir2_fh(info, len); + } if (child_fh_len) { ffh = fanotify_info_file_fh(info); - info->file_fh_totlen = fanotify_encode_fh(ffh, child, - child_fh_len, hash, 0); + len = fanotify_encode_fh(ffh, child, child_fh_len, hash, 0); + fanotify_info_set_file_fh(info, len); } - if (name) { - long salt = name->len; - + if (name_len) { fanotify_info_copy_name(info, name); - *hash ^= full_name_hash((void *)salt, name->name, name->len); + *hash ^= full_name_hash((void *)name_len, name->name, name_len); + } + if (name2_len) { + fanotify_info_copy_name2(info, name2); + *hash ^= full_name_hash((void *)name2_len, name2->name, + name2_len); } - pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", - __func__, id->i_ino, size, dir_fh_len, child_fh_len, + pr_debug("%s: size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", + __func__, size, dir_fh_len, child_fh_len, info->name_len, info->name_len, fanotify_info_name(info)); + if (dir2_fh_len) { + pr_debug("%s: dir2_fh_len=%u name2_len=%u name2='%.*s'\n", + __func__, dir2_fh_len, info->name2_len, + info->name2_len, fanotify_info_name2(info)); + } + return &fne->fae; } @@ -639,19 +712,21 @@ static struct fanotify_event *fanotify_alloc_error_event( return &fee->fae; } -static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, - u32 mask, const void *data, - int data_type, struct inode *dir, - const struct qstr *file_name, - __kernel_fsid_t *fsid) +static struct fanotify_event *fanotify_alloc_event( + struct fsnotify_group *group, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, + __kernel_fsid_t *fsid, u32 match_mask) { struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; - struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + struct inode *id = fanotify_fid_inode(mask, data, data_type, dir, + fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct mem_cgroup *old_memcg; + struct dentry *moved = NULL; struct inode *child = NULL; bool name_event = false; unsigned int hash = 0; @@ -660,11 +735,10 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { /* - * With both flags FAN_REPORT_DIR_FID and FAN_REPORT_FID, we - * report the 
child fid for events reported on a non-dir child + * For certain events and group flags, report the child fid * in addition to reporting the parent fid and maybe child name. */ - if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir) + if (fanotify_report_child_fid(fid_mode, mask) && id != dirid) child = id; id = dirid; @@ -688,6 +762,38 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) { name_event = true; } + + /* + * In the special case of FAN_RENAME event, use the match_mask + * to determine if we need to report only the old parent+name, + * only the new parent+name or both. + * 'dirid' and 'file_name' are the old parent+name and + * 'moved' has the new parent+name. + */ + if (mask & FAN_RENAME) { + bool report_old, report_new; + + if (WARN_ON_ONCE(!match_mask)) + return NULL; + + /* Report both old and new parent+name if sb watching */ + report_old = report_new = + match_mask & (1U << FSNOTIFY_ITER_TYPE_SB); + report_old |= + match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE); + report_new |= + match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE2); + + if (!report_old) { + /* Do not report old parent+name */ + dirid = NULL; + file_name = NULL; + } + if (report_new) { + /* Report new parent+name */ + moved = fsnotify_data_dentry(data, data_type); + } + } } /* @@ -709,9 +815,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, data_type, &hash); - } else if (name_event && (file_name || child)) { - event = fanotify_alloc_name_event(id, fsid, file_name, child, - &hash, gfp); + } else if (name_event && (file_name || moved || child)) { + event = fanotify_alloc_name_event(dirid, fsid, file_name, child, + moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); } else { @@ -746,7 +852,7 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) int type; __kernel_fsid_t fsid = {}; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { struct fsnotify_mark_connector *conn; if (!fsnotify_iter_should_report_type(iter_info, type)) @@ -800,6 +906,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, struct fanotify_event *event; struct fsnotify_event *fsn_event; __kernel_fsid_t fsid = {}; + u32 match_mask = 0; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); @@ -821,15 +928,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); + BUILD_BUG_ON(FAN_RENAME != FS_RENAME); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); - mask = fanotify_group_event_mask(group, iter_info, mask, data, - data_type, dir); + mask = fanotify_group_event_mask(group, iter_info, &match_mask, + mask, data, data_type, dir); if (!mask) return 0; - pr_debug("%s: group=%p mask=%x\n", __func__, group, mask); + pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__, + group, mask, match_mask); if (fanotify_is_perm_event(mask)) { /* @@ -848,7 +957,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } event = fanotify_alloc_event(group, mask, data, data_type, dir, - file_name, &fsid); + file_name, &fsid, match_mask); ret = -ENOMEM; if 
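/*
 * The FAN_RENAME branch above is driven entirely by match_mask, where
 * fanotify_group_event_mask() set bit (1U << type) for every iterator
 * slot whose mark matched the event. A sketch of the decode, with a
 * hypothetical helper standing in for the inline logic:
 */
#include <linux/fsnotify_backend.h>

static void example_rename_sides(u32 match_mask,
				 bool *report_old, bool *report_new)
{
	/* A filesystem-wide (sb) mark saw the event: report both sides. */
	*report_old = *report_new =
		match_mask & (1U << FSNOTIFY_ITER_TYPE_SB);
	/* A mark on the old parent contributes the old dir+name... */
	*report_old |= match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE);
	/* ...and a mark on the new parent the new dir+name. */
	*report_new |= match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE2);
}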
(unlikely(!event)) { /* diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index d25f500bf7e7..a3d5b751cac5 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -40,15 +40,45 @@ struct fanotify_fh { struct fanotify_info { /* size of dir_fh/file_fh including fanotify_fh hdr size */ u8 dir_fh_totlen; + u8 dir2_fh_totlen; u8 file_fh_totlen; u8 name_len; - u8 pad; + u8 name2_len; + u8 pad[3]; unsigned char buf[]; /* * (struct fanotify_fh) dir_fh starts at buf[0] - * (optional) file_fh starts at buf[dir_fh_totlen] - * name starts at buf[dir_fh_totlen + file_fh_totlen] + * (optional) dir2_fh starts at buf[dir_fh_totlen] + * (optional) file_fh starts at buf[dir_fh_totlen + dir2_fh_totlen] + * name starts at buf[dir_fh_totlen + dir2_fh_totlen + file_fh_totlen] + * ... */ +#define FANOTIFY_DIR_FH_SIZE(info) ((info)->dir_fh_totlen) +#define FANOTIFY_DIR2_FH_SIZE(info) ((info)->dir2_fh_totlen) +#define FANOTIFY_FILE_FH_SIZE(info) ((info)->file_fh_totlen) +#define FANOTIFY_NAME_SIZE(info) ((info)->name_len + 1) +#define FANOTIFY_NAME2_SIZE(info) ((info)->name2_len + 1) + +#define FANOTIFY_DIR_FH_OFFSET(info) 0 +#define FANOTIFY_DIR2_FH_OFFSET(info) \ + (FANOTIFY_DIR_FH_OFFSET(info) + FANOTIFY_DIR_FH_SIZE(info)) +#define FANOTIFY_FILE_FH_OFFSET(info) \ + (FANOTIFY_DIR2_FH_OFFSET(info) + FANOTIFY_DIR2_FH_SIZE(info)) +#define FANOTIFY_NAME_OFFSET(info) \ + (FANOTIFY_FILE_FH_OFFSET(info) + FANOTIFY_FILE_FH_SIZE(info)) +#define FANOTIFY_NAME2_OFFSET(info) \ + (FANOTIFY_NAME_OFFSET(info) + FANOTIFY_NAME_SIZE(info)) + +#define FANOTIFY_DIR_FH_BUF(info) \ + ((info)->buf + FANOTIFY_DIR_FH_OFFSET(info)) +#define FANOTIFY_DIR2_FH_BUF(info) \ + ((info)->buf + FANOTIFY_DIR2_FH_OFFSET(info)) +#define FANOTIFY_FILE_FH_BUF(info) \ + ((info)->buf + FANOTIFY_FILE_FH_OFFSET(info)) +#define FANOTIFY_NAME_BUF(info) \ + ((info)->buf + FANOTIFY_NAME_OFFSET(info)) +#define FANOTIFY_NAME2_BUF(info) \ + ((info)->buf + FANOTIFY_NAME2_OFFSET(info)) } __aligned(4); static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh) @@ -87,7 +117,21 @@ static inline struct fanotify_fh *fanotify_info_dir_fh(struct fanotify_info *inf { BUILD_BUG_ON(offsetof(struct fanotify_info, buf) % 4); - return (struct fanotify_fh *)info->buf; + return (struct fanotify_fh *)FANOTIFY_DIR_FH_BUF(info); +} + +static inline int fanotify_info_dir2_fh_len(struct fanotify_info *info) +{ + if (!info->dir2_fh_totlen || + WARN_ON_ONCE(info->dir2_fh_totlen < FANOTIFY_FH_HDR_LEN)) + return 0; + + return info->dir2_fh_totlen - FANOTIFY_FH_HDR_LEN; +} + +static inline struct fanotify_fh *fanotify_info_dir2_fh(struct fanotify_info *info) +{ + return (struct fanotify_fh *)FANOTIFY_DIR2_FH_BUF(info); } static inline int fanotify_info_file_fh_len(struct fanotify_info *info) @@ -101,32 +145,90 @@ static inline int fanotify_info_file_fh_len(struct fanotify_info *info) static inline struct fanotify_fh *fanotify_info_file_fh(struct fanotify_info *info) { - return (struct fanotify_fh *)(info->buf + info->dir_fh_totlen); + return (struct fanotify_fh *)FANOTIFY_FILE_FH_BUF(info); } -static inline const char *fanotify_info_name(struct fanotify_info *info) +static inline char *fanotify_info_name(struct fanotify_info *info) { - return info->buf + info->dir_fh_totlen + info->file_fh_totlen; + if (!info->name_len) + return NULL; + + return FANOTIFY_NAME_BUF(info); +} + +static inline char *fanotify_info_name2(struct fanotify_info *info) +{ + if (!info->name2_len) + return NULL; + + return FANOTIFY_NAME2_BUF(info); } static 
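/*
 * A worked example of the fanotify_info buf layout defined above,
 * assuming dir_fh_totlen = 12, dir2_fh_totlen = 12, file_fh_totlen = 0,
 * name_len = 3 ("foo") and name2_len = 3 ("bar"):
 *
 *	FANOTIFY_DIR_FH_OFFSET()  =  0			(12 bytes)
 *	FANOTIFY_DIR2_FH_OFFSET() =  0 + 12 = 12	(12 bytes)
 *	FANOTIFY_FILE_FH_OFFSET() = 12 + 12 = 24	(0 bytes)
 *	FANOTIFY_NAME_OFFSET()    = 24 +  0 = 24	("foo\0", 4 bytes)
 *	FANOTIFY_NAME2_OFFSET()   = 24 +  4 = 28	("bar\0", 4 bytes)
 *
 * FANOTIFY_NAME_SIZE() counts the terminating NUL, so name2 starts one
 * byte past the end of name even though name_len itself is 3.
 */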
inline void fanotify_info_init(struct fanotify_info *info) { + BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN + MAX_HANDLE_SZ > U8_MAX); + BUILD_BUG_ON(NAME_MAX > U8_MAX); + info->dir_fh_totlen = 0; + info->dir2_fh_totlen = 0; info->file_fh_totlen = 0; info->name_len = 0; + info->name2_len = 0; +} + +/* These set/copy helpers MUST be called in order */ +static inline void fanotify_info_set_dir_fh(struct fanotify_info *info, + unsigned int totlen) +{ + if (WARN_ON_ONCE(info->dir2_fh_totlen > 0) || + WARN_ON_ONCE(info->file_fh_totlen > 0) || + WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->dir_fh_totlen = totlen; } -static inline unsigned int fanotify_info_len(struct fanotify_info *info) +static inline void fanotify_info_set_dir2_fh(struct fanotify_info *info, + unsigned int totlen) { - return info->dir_fh_totlen + info->file_fh_totlen + info->name_len; + if (WARN_ON_ONCE(info->file_fh_totlen > 0) || + WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->dir2_fh_totlen = totlen; +} + +static inline void fanotify_info_set_file_fh(struct fanotify_info *info, + unsigned int totlen) +{ + if (WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->file_fh_totlen = totlen; } static inline void fanotify_info_copy_name(struct fanotify_info *info, const struct qstr *name) { + if (WARN_ON_ONCE(name->len > NAME_MAX) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + info->name_len = name->len; - strcpy(info->buf + info->dir_fh_totlen + info->file_fh_totlen, - name->name); + strcpy(fanotify_info_name(info), name->name); +} + +static inline void fanotify_info_copy_name2(struct fanotify_info *info, + const struct qstr *name) +{ + if (WARN_ON_ONCE(name->len > NAME_MAX)) + return; + + info->name2_len = name->len; + strcpy(fanotify_info_name2(info), name->name); } /* @@ -271,6 +373,13 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) return info ? fanotify_info_dir_fh_len(info) : 0; } +static inline int fanotify_event_dir2_fh_len(struct fanotify_event *event) +{ + struct fanotify_info *info = fanotify_event_info(event); + + return info ? fanotify_info_dir2_fh_len(info) : 0; +} + static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) { /* For error events, even zeroed fh are reported. 
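/*
 * The WARN_ON_ONCE() guards above enforce that strict fill order, since
 * each record's offset is derived from the lengths recorded before it.
 * A sketch of a caller that respects the order (the length variables
 * and qstrs are illustrative):
 *
 *	fanotify_info_init(info);
 *	fanotify_info_set_dir_fh(info, dir_fh_totlen);		1: old dir fh
 *	fanotify_info_set_dir2_fh(info, dir2_fh_totlen);	2: new dir fh
 *	fanotify_info_set_file_fh(info, file_fh_totlen);	3: child fh
 *	fanotify_info_copy_name(info, &old_name);		4: old name
 *	fanotify_info_copy_name2(info, &new_name);		5: new name
 */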
*/ @@ -284,6 +393,17 @@ static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) return fanotify_event_dir_fh_len(event) > 0; } +static inline bool fanotify_event_has_dir2_fh(struct fanotify_event *event) +{ + return fanotify_event_dir2_fh_len(event) > 0; +} + +static inline bool fanotify_event_has_any_dir_fh(struct fanotify_event *event) +{ + return fanotify_event_has_dir_fh(event) || + fanotify_event_has_dir2_fh(event); +} + struct fanotify_path_event { struct fanotify_event fae; struct path path; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 559bc1e9926d..73a3e939c921 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -129,12 +129,29 @@ static int fanotify_fid_info_len(int fh_len, int name_len) FANOTIFY_EVENT_ALIGN); } +/* FAN_RENAME may have one or two dir+name info records */ +static int fanotify_dir_name_info_len(struct fanotify_event *event) +{ + struct fanotify_info *info = fanotify_event_info(event); + int dir_fh_len = fanotify_event_dir_fh_len(event); + int dir2_fh_len = fanotify_event_dir2_fh_len(event); + int info_len = 0; + + if (dir_fh_len) + info_len += fanotify_fid_info_len(dir_fh_len, + info->name_len); + if (dir2_fh_len) + info_len += fanotify_fid_info_len(dir2_fh_len, + info->name2_len); + + return info_len; +} + static size_t fanotify_event_len(unsigned int info_mode, struct fanotify_event *event) { size_t event_len = FAN_EVENT_METADATA_LEN; struct fanotify_info *info; - int dir_fh_len; int fh_len; int dot_len = 0; @@ -146,9 +163,8 @@ static size_t fanotify_event_len(unsigned int info_mode, info = fanotify_event_info(event); - if (fanotify_event_has_dir_fh(event)) { - dir_fh_len = fanotify_event_dir_fh_len(event); - event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + if (fanotify_event_has_any_dir_fh(event)) { + event_len += fanotify_dir_name_info_len(event); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* @@ -332,11 +348,10 @@ static int process_access_response(struct fsnotify_group *group, static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { - struct fanotify_event_info_error info; + struct fanotify_event_info_error info = { }; struct fanotify_error_event *fee = FANOTIFY_EE(event); info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; - info.hdr.pad = 0; info.hdr.len = FANOTIFY_ERROR_INFO_LEN; if (WARN_ON(count < info.hdr.len)) @@ -380,6 +395,8 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return -EFAULT; break; case FAN_EVENT_INFO_TYPE_DFID_NAME: + case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: + case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: if (WARN_ON_ONCE(!name || !name_len)) return -EFAULT; break; @@ -479,11 +496,19 @@ static int copy_info_records_to_user(struct fanotify_event *event, unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; /* - * Event info records order is as follows: dir fid + name, child fid. + * Event info records order is as follows: + * 1. dir fid + name + * 2. (optional) new dir fid + new name + * 3. (optional) child fid */ if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? 
FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; + + /* FAN_RENAME uses special info types */ + if (event->mask & FAN_RENAME) + info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), info_type, @@ -497,6 +522,22 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + /* New dir fid+name may be reported in addition to old dir fid+name */ + if (fanotify_event_has_dir2_fh(event)) { + info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir2_fh(info), + info_type, + fanotify_info_name2(info), + info->name2_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; @@ -1057,7 +1098,7 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, - unsigned int type, + unsigned int obj_type, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; @@ -1080,7 +1121,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, } fsnotify_init_mark(mark, group); - ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); if (ret) { fsnotify_put_mark(mark); goto out_dec_ucounts; @@ -1105,7 +1146,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) } static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { @@ -1116,7 +1157,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); + fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid); if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); return PTR_ERR(fsn_mark); @@ -1275,6 +1316,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; + /* + * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID + * and is used as an indication to report both dir and child fid on all + * dirent events. + */ + if ((fid_mode & FAN_REPORT_TARGET_FID) && + (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) + return -EINVAL; + f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -1536,6 +1586,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; + /* + * FAN_RENAME uses special info type records to report the old and + * new parent+name. Reporting only old and new parent id is less + * useful and was not implemented. 
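/*
 * On the read side, a FAN_RENAME event can therefore carry an old
 * dir+name record, a new dir+name record and an optional child fid
 * record. A minimal userspace sketch of walking the info records
 * (assumes the uapi headers from this series; error handling elided):
 */
#include <stdio.h>
#include <sys/fanotify.h>

static void example_show_rename(const struct fanotify_event_metadata *md)
{
	const char *p = (const char *)(md + 1);
	const char *end = (const char *)md + md->event_len;

	while (p < end) {
		const struct fanotify_event_info_header *hdr =
			(const void *)p;

		if (hdr->info_type == FAN_EVENT_INFO_TYPE_OLD_DFID_NAME)
			printf("old dir+name record, %u bytes\n",
			       (unsigned int)hdr->len);
		else if (hdr->info_type == FAN_EVENT_INFO_TYPE_NEW_DFID_NAME)
			printf("new dir+name record, %u bytes\n",
			       (unsigned int)hdr->len);
		p += hdr->len;
	}
}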
+ */ + if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) @@ -1667,7 +1725,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4034ca566f95..ab81a0776ece 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -279,6 +279,18 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; + /* + * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. + * The only ->handle_inode_event() backend that supports FS_RENAME is + * dnotify, where it means file was renamed within same parent. + */ + if (mask & FS_RENAME) { + struct dentry *moved = fsnotify_data_dentry(data, data_type); + + if (dir != moved->d_parent->d_inode) + return 0; + } + if (parent_mark) { /* * parent_mark indicates that the parent inode is watching @@ -330,7 +342,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, /* clear ignored on inode modification */ if (mask & FS_MODIFY) { - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -340,7 +352,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, } } - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -405,7 +417,7 @@ static unsigned int fsnotify_iter_select_report_types( int type; /* Choose max prio group among groups of all queue heads */ - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) @@ -417,7 +429,7 @@ static unsigned int fsnotify_iter_select_report_types( /* Set the report mask for marks from same group as max prio group */ iter_info->report_mask = 0; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) == 0) @@ -435,7 +447,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { int type; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (fsnotify_iter_should_report_type(iter_info, type)) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); @@ -469,7 +481,9 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; - struct inode *parent = NULL; + struct inode *inode2 = NULL; + struct dentry *moved; + int inode2_type; int ret = 0; __u32 test_mask, marks_mask; @@ -479,12 +493,19 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; + /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ + if (mask & FS_RENAME) { + moved = fsnotify_data_dentry(data, data_type); + inode2 = 
moved->d_parent->d_inode; + inode2_type = FSNOTIFY_ITER_TYPE_INODE2; + } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ - parent = dir; + inode2 = dir; + inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* @@ -497,7 +518,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!parent || !parent->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; @@ -505,8 +526,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; - if (parent) - marks_mask |= parent->i_fsnotify_mask; + if (inode2) + marks_mask |= inode2->i_fsnotify_mask; /* @@ -519,19 +540,19 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sb->s_fsnotify_marks); if (mnt) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = + iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = + iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } - if (parent) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] = - fsnotify_first_mark(&parent->i_fsnotify_marks); + if (inode2) { + iter_info.marks[inode2_type] = + fsnotify_first_mark(&inode2->i_fsnotify_marks); } /* diff --git a/fs/notify/group.c b/fs/notify/group.c index 6a297efc4788..b7d4d64f87c2 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -58,7 +58,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) fsnotify_group_stop_queueing(group); /* Clear all marks for this group and queue them for destruction */ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES_MASK); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_ANY); /* * Some marks can still be pinned when waiting for response from diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fa1d99101f89..9007d6affff3 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -353,7 +353,7 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) { int type; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); @@ -382,7 +382,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - fsnotify_foreach_obj_type(type) + fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } @@ -496,7 +496,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int type, + unsigned int obj_type, __kernel_fsid_t *fsid) { struct inode *inode = NULL; @@ -507,7 +507,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); - conn->type = type; + conn->type = obj_type; conn->obj = connp; /* Cache fsid of filesystem containing the object */ if (fsid) { @@ 
-572,7 +572,8 @@ out: * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, + unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; @@ -580,7 +581,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, int cmp; int err = 0; - if (WARN_ON(!fsnotify_valid_obj_type(type))) + if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* Backend is expected to check for zero fsid (e.g. tmpfs) */ @@ -592,7 +593,8 @@ restart: conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, type, fsid); + err = fsnotify_attach_connector_to_object(connp, obj_type, + fsid); if (err) return err; goto restart; @@ -665,7 +667,7 @@ out_err: * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; @@ -686,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid); if (ret) goto err; @@ -706,13 +708,14 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int type, int allow_dups, __kernel_fsid_t *fsid) + unsigned int obj_type, int allow_dups, + __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid); mutex_unlock(&group->mark_mutex); return ret; } @@ -747,14 +750,14 @@ EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, - unsigned int type_mask) + unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ - if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) { + if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } @@ -769,7 +772,7 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if ((1U << mark->connector->type) & type_mask) + if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index 1667a7e590d8..f93e69a61283 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -52,6 +52,7 @@ config NTFS_DEBUG config NTFS_RW bool "NTFS write support" depends on NTFS_FS + depends on PAGE_SIZE_LESS_THAN_64KB help This enables the partial, but safe, write support in the NTFS driver. diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index d563abc3e136..2911c04a33e0 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 
* * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bb247bc349e4..bf9357123bc5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; - struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct buffer_head *root_bh; /* * Update the counts and position values within all the diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 68d11c295dd3..498da317580a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1799,20 +1799,20 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret && ret != -EAGAIN) { - mlog_errno(ret); - goto out_quota; - } + if (ret) { + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } - /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. - */ - if (ret == -EAGAIN) { - BUG_ON(wc->w_target_page); - ret = 0; + mlog_errno(ret); goto out_quota; } diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 810d32815593..563881ddbf00 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(KTHREAD), }; -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; +static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; +ATTRIBUTE_GROUPS(mlog_default); static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, char *buf) @@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = { }; static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, + .default_groups = mlog_default_groups, + .sysfs_ops = &mlog_attr_ops, }; static struct kset mlog_kset = { @@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) int i = 0; while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + mlog_default_attrs[i] = &mlog_attrs[i].attr; i++; } - mlog_attr_ptrs[i] = NULL; + mlog_default_attrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); mlog_kset.kobj.kset = o2cb_kset; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bd8d534f11cb..f2cc1ff29e6d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dir_entry *de, *last_de = NULL; char *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + unsigned int rec_len, new_rec_len, free_space; /* * This calculates how many free bytes we'd have in block zero, should diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index de56e6231af8..1ad7106741f8 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = { &ocfs2_filecheck_attr_set.attr, NULL }; 
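/*
 * This and the other sysfs conversions in this commit follow the same
 * recipe: keep the NULL-terminated attribute array, let the
 * ATTRIBUTE_GROUPS() macro generate the group list, and point the
 * kobj_type at .default_groups. A generic sketch with placeholder
 * names:
 */
static struct attribute *foo_attrs[] = {
	&foo_bar_attribute.attr,
	NULL,				/* array must stay NULL-terminated */
};
ATTRIBUTE_GROUPS(foo);			/* generates foo_groups[] */

static struct kobj_type foo_ktype = {
	.sysfs_ops	= &foo_sysfs_ops,
	.default_groups	= foo_groups,	/* was: .default_attrs = foo_attrs */
};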
+ATTRIBUTE_GROUPS(ocfs2_filecheck); static void ocfs2_filecheck_release(struct kobject *kobj) { @@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = { }; static struct kobj_type ocfs2_ktype_filecheck = { - .default_attrs = ocfs2_filecheck_attrs, + .default_groups = ocfs2_filecheck_groups, .sysfs_ops = &ocfs2_filecheck_ops, .release = ocfs2_filecheck_release, }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index dbf9b9e97d74..1887a2708709 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); goto done; } @@ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); diff --git a/fs/open.c b/fs/open.c index f732fb94600c..9ff2f621b760 100644 --- a/fs/open.c +++ b/fs/open.c @@ -32,6 +32,7 @@ #include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> +#include <linux/mnt_idmapping.h> #include "internal.h" @@ -640,7 +641,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns; + struct user_namespace *mnt_userns, *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -652,8 +653,9 @@ int chown_common(const struct path *path, uid_t user, gid_t group) gid = make_kgid(current_user_ns(), group); mnt_userns = mnt_user_ns(path->mnt); - uid = kuid_from_mnt(mnt_userns, uid); - gid = kgid_from_mnt(mnt_userns, gid); + fs_userns = i_user_ns(inode); + uid = mapped_kuid_user(mnt_userns, fs_userns, uid); + gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 538e839590ef..b501dc07f922 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -176,7 +176,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap) { kfree(bufmap->page_array); kfree(bufmap->desc_array); - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); kfree(bufmap); } @@ -226,8 +226,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) bufmap->desc_size = user_desc->size; bufmap->desc_shift = ilog2(bufmap->desc_size); - bufmap->buffer_index_array = - kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL); + bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL); if (!bufmap->buffer_index_array) goto out_free_bufmap; @@ -250,7 +249,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) out_free_desc_array: kfree(bufmap->desc_array); out_free_index_array: - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); out_free_bufmap: kfree(bufmap); out: diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 3627ea946402..de80b62553bb 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -894,10 +894,11 @@ static struct attribute *orangefs_default_attrs[] = { &perf_time_interval_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(orangefs_default); static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = 
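/*
 * Besides readability, the orangefs bitmap_zalloc() conversion above is
 * a size fix: the old code passed DIV_ROUND_UP(desc_count, BITS_PER_LONG)
 * -- a count of longs -- to kzalloc(), which takes bytes, undersizing
 * the bitmap. The corrected pairing, sketched with an illustrative
 * nbits:
 *
 *	unsigned long *map = bitmap_zalloc(nbits, GFP_KERNEL);
 *	if (!map)
 *		return -ENOMEM;
 *	...
 *	bitmap_free(map);	(always pair with bitmap_free())
 */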
orangefs_default_attrs, + .default_groups = orangefs_default_groups, }; static struct orangefs_attribute acache_hard_limit_attribute = @@ -931,10 +932,11 @@ static struct attribute *acache_orangefs_default_attrs[] = { &acache_timeout_msecs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(acache_orangefs_default); static struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = acache_orangefs_default_attrs, + .default_groups = acache_orangefs_default_groups, }; static struct orangefs_attribute capcache_hard_limit_attribute = @@ -968,10 +970,11 @@ static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_timeout_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(capcache_orangefs_default); static struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = capcache_orangefs_default_attrs, + .default_groups = capcache_orangefs_default_groups, }; static struct orangefs_attribute ccache_hard_limit_attribute = @@ -1005,10 +1008,11 @@ static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_timeout_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(ccache_orangefs_default); static struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = ccache_orangefs_default_attrs, + .default_groups = ccache_orangefs_default_groups, }; static struct orangefs_attribute ncache_hard_limit_attribute = @@ -1042,10 +1046,11 @@ static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_timeout_msecs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(ncache_orangefs_default); static struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = ncache_orangefs_default_attrs, + .default_groups = ncache_orangefs_default_groups, }; static struct orangefs_attribute pc_acache_attribute = @@ -1072,10 +1077,11 @@ static struct attribute *pc_orangefs_default_attrs[] = { &pc_ncache_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(pc_orangefs_default); static struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = pc_orangefs_default_attrs, + .default_groups = pc_orangefs_default_groups, }; static struct orangefs_attribute stats_reads_attribute = @@ -1095,10 +1101,11 @@ static struct attribute *stats_orangefs_default_attrs[] = { &stats_writes_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(stats_orangefs_default); static struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = stats_orangefs_default_attrs, + .default_groups = stats_orangefs_default_groups, }; static struct kobject *orangefs_obj; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 265181c110ae..7bb0a47cb615 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -873,7 +873,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } - if (mnt_user_ns(path->mnt) != &init_user_ns) { + if (is_idmapped_mnt(path->mnt)) { pr_err("idmapped layers are currently not supported\n"); goto out_put; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9323a854a60a..80acb6885cf9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -23,6 +23,7 @@ #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/namei.h> +#include <linux/mnt_idmapping.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -374,7 +375,9 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, goto check_perm; break; case 
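/*
 * The mnt_idmapping helpers used in the hunks around this point are
 * directional; a summary of how they appear in this commit:
 *
 *   chown(2) path (ids arriving from userspace):
 *	uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
 *
 *   permission-check path (ids read from the filesystem):
 *	uid = mapped_kuid_fs(mnt_userns, fs_userns, pa->e_uid);
 *	if (uid_eq(uid, current_fsuid())) ...
 *
 * The gid variants mapped_kgid_user()/mapped_kgid_fs() mirror this.
 */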
ACL_USER: - uid = kuid_into_mnt(mnt_userns, pa->e_uid); + uid = mapped_kuid_fs(mnt_userns, + i_user_ns(inode), + pa->e_uid); if (uid_eq(uid, current_fsuid())) goto mask; break; @@ -387,7 +390,9 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - gid = kgid_into_mnt(mnt_userns, pa->e_gid); + gid = mapped_kgid_fs(mnt_userns, + i_user_ns(inode), + pa->e_gid); if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) @@ -734,17 +739,17 @@ static void posix_acl_fix_xattr_userns( case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); if (from_user) - uid = kuid_from_mnt(mnt_userns, uid); + uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); else - uid = kuid_into_mnt(mnt_userns, uid); + uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); if (from_user) - gid = kgid_from_mnt(mnt_userns, gid); + gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); else - gid = kgid_into_mnt(mnt_userns, gid); + gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e..43a7abde9e42 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -468,6 +468,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, u64 cgtime, gtime; unsigned long rsslim = 0; unsigned long flags; + int exit_code = task->exit_code; state = *get_task_state(task); vsize = eip = esp = 0; @@ -531,6 +532,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, maj_flt += sig->maj_flt; thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; + + if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) + exit_code = sig->group_exit_code; } sid = task_session_nr_ns(task, ns); @@ -630,7 +634,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) - seq_put_decimal_ll(m, " ", task->exit_code); + seq_put_decimal_ll(m, " ", exit_code); else seq_puts(m, " 0"); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 15c2e55d2ed2..39b823ab2564 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -61,15 +61,27 @@ static int seq_open_net(struct inode *inode, struct file *file) } #ifdef CONFIG_NET_NS p->net = net; + netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } +static void seq_file_net_put_net(struct seq_file *seq) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *priv = seq->private; + + put_net_track(priv->net, &priv->ns_tracker); +#else + put_net(&init_net); +#endif +} + static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; - put_net(seq_file_net(seq)); + seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } @@ -87,7 +99,8 @@ int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; - p->net = get_net(current->nsproxy->net_ns); + p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, + GFP_KERNEL); #endif return 0; } @@ -97,7 +110,7 @@ void bpf_iter_fini_seq_net(void *priv_data) #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; - put_net(p->net); + put_net_track(p->net, &p->ns_tracker); #endif } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ 
b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/vmacache.h> +#include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> @@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 30a3b66f475a..509f85148fee 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -154,9 +154,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (!pfn_is_ram(pfn)) - memset(buf, 0, nr_bytes); - else { + if (!pfn_is_ram(pfn)) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -165,12 +169,12 @@ ssize_t read_from_oldmem(char *buf, size_t count, else tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - - if (tmp < 0) { - up_read(&vmcore_cb_rwsem); - return tmp; - } } + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); + return tmp; + } + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 392ef5162655..49650e54d2f8 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -80,7 +80,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, fs_infop->str); } - if (mnt_user_ns(mnt) != &init_user_ns) + if (is_idmapped_mnt(mnt)) seq_puts(m, ",idmapped"); } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 328da35da390..8adabde685f1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -173,7 +173,6 @@ config PSTORE_BLK tristate "Log panic/oops to a block device" depends on PSTORE depends on BLOCK - depends on BROKEN select PSTORE_ZONE default n help diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 5d1fbaffd66a..4ae0cfcd15f2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -309,7 +309,7 @@ static int __init __best_effort_init(void) if (ret) kfree(best_effort_dev); else - pr_info("attached %s (%zu) (no dedicated panic_write!)\n", + pr_info("attached %s (%lu) (no dedicated panic_write!)\n", blkdev, best_effort_dev->zone.total_size); return ret; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 5939595f0115..776cae20af4e 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -64,20 +64,12 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = { static DEFINE_MUTEX(pstore_ftrace_lock); static bool pstore_ftrace_enabled; -static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, - size_t count, loff_t *ppos) +static int pstore_set_ftrace_enabled(bool on) { - u8 on; ssize_t ret; - ret = kstrtou8_from_user(buf, count, 2, &on); - if (ret) - return ret; - - mutex_lock(&pstore_ftrace_lock); - - if (!on ^ pstore_ftrace_enabled) - goto out; + if (on == pstore_ftrace_enabled) + return 0; if (on) { ftrace_ops_set_global_filter(&pstore_ftrace_ops); @@ -89,15 +81,30 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, if (ret) { 
pr_err("%s: unable to %sregister ftrace ops: %zd\n", __func__, on ? "" : "un", ret); - goto err; + } else { + pstore_ftrace_enabled = on; } - pstore_ftrace_enabled = on; -out: - ret = count; -err: + return ret; +} + +static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + u8 on; + ssize_t ret; + + ret = kstrtou8_from_user(buf, count, 2, &on); + if (ret) + return ret; + + mutex_lock(&pstore_ftrace_lock); + ret = pstore_set_ftrace_enabled(on); mutex_unlock(&pstore_ftrace_lock); + if (ret == 0) + ret = count; + return ret; } @@ -117,6 +124,11 @@ static const struct file_operations pstore_knob_fops = { static struct dentry *pstore_ftrace_dir; +static bool record_ftrace; +module_param(record_ftrace, bool, 0400); +MODULE_PARM_DESC(record_ftrace, + "enable ftrace recording immediately (default: off)"); + void pstore_register_ftrace(void) { if (!psinfo->write) @@ -124,6 +136,8 @@ void pstore_register_ftrace(void) pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); + pstore_set_ftrace_enabled(record_ftrace); + debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, NULL, &pstore_knob_fops); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 0834b101c316..a3e21160b634 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -951,7 +951,9 @@ static int reiserfs_async_progress_wait(struct super_block *s) int depth; depth = reiserfs_write_unlock_nested(s); - congestion_wait(BLK_RW_ASYNC, HZ / 10); + wait_var_event_timeout(&j->j_async_throttle, + atomic_read(&j->j_async_throttle) == 0, + HZ / 10); reiserfs_write_lock_nested(s, depth); } @@ -1058,7 +1060,8 @@ static int flush_commit_list(struct super_block *s, put_bh(tbh) ; } } - atomic_dec(&journal->j_async_throttle); + if (atomic_dec_and_test(&journal->j_async_throttle)) + wake_up_var(&journal->j_async_throttle); for (i = 0; i < (jl->j_len + 1); i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + diff --git a/fs/remap_range.c b/fs/remap_range.c index 6d4a9beaa097..231159682907 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -146,41 +146,41 @@ static int generic_remap_check_len(struct inode *inode_in, } /* Read a page's worth of file data into the page cache. */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos) { - struct page *page; + struct folio *folio; - page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); + if (IS_ERR(folio)) + return folio; + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-EIO); } - return page; + return folio; } /* - * Lock two pages, ensuring that we lock in offset order if the pages are from - * the same file. + * Lock two folios, ensuring that we lock in offset order if the folios + * are from the same file. */ -static void vfs_lock_two_pages(struct page *page1, struct page *page2) +static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) { /* Always lock in order of increasing index. */ - if (page1->index > page2->index) - swap(page1, page2); + if (folio1->index > folio2->index) + swap(folio1, folio2); - lock_page(page1); - if (page1 != page2) - lock_page(page2); + folio_lock(folio1); + if (folio1 != folio2) + folio_lock(folio2); } -/* Unlock two pages, being careful not to unlock the same page twice. 
*/ -static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +/* Unlock two folios, being careful not to unlock the same folio twice. */ +static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) { - unlock_page(page1); - if (page1 != page2) - unlock_page(page2); + folio_unlock(folio1); + if (folio1 != folio2) + folio_unlock(folio2); } /* @@ -188,77 +188,71 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2) * Caller must have locked both inodes to prevent write races. */ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, + struct inode *dest, loff_t dstoff, loff_t len, bool *is_same) { - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; + bool same = true; + int error = -EINVAL; + while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); + struct folio *src_folio, *dst_folio; + void *src_addr, *dst_addr; + loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), + PAGE_SIZE - offset_in_page(dstoff)); + cmp_len = min(cmp_len, len); if (cmp_len <= 0) goto out_error; - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); + src_folio = vfs_dedupe_get_folio(src, srcoff); + if (IS_ERR(src_folio)) { + error = PTR_ERR(src_folio); goto out_error; } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - put_page(src_page); + dst_folio = vfs_dedupe_get_folio(dest, dstoff); + if (IS_ERR(dst_folio)) { + error = PTR_ERR(dst_folio); + folio_put(src_folio); goto out_error; } - vfs_lock_two_pages(src_page, dest_page); + vfs_lock_two_folios(src_folio, dst_folio); /* - * Now that we've locked both pages, make sure they're still + * Now that we've locked both folios, make sure they're still * mapped to the file data we're interested in. If not, * someone is invalidating pages on us and we lose. 
*/ - if (!PageUptodate(src_page) || !PageUptodate(dest_page) || - src_page->mapping != src->i_mapping || - dest_page->mapping != dest->i_mapping) { + if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || + src_folio->mapping != src->i_mapping || + dst_folio->mapping != dest->i_mapping) { same = false; goto unlock; } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); + src_addr = kmap_local_folio(src_folio, + offset_in_folio(src_folio, srcoff)); + dst_addr = kmap_local_folio(dst_folio, + offset_in_folio(dst_folio, dstoff)); - flush_dcache_page(src_page); - flush_dcache_page(dest_page); + flush_dcache_folio(src_folio); + flush_dcache_folio(dst_folio); - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + if (memcmp(src_addr, dst_addr, cmp_len)) same = false; - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); + kunmap_local(dst_addr); + kunmap_local(src_addr); unlock: - vfs_unlock_two_pages(src_page, dest_page); - put_page(dest_page); - put_page(src_page); + vfs_unlock_two_folios(src_folio, dst_folio); + folio_put(dst_folio); + folio_put(src_folio); if (!same) break; srcoff += cmp_len; - destoff += cmp_len; + dstoff += cmp_len; len -= cmp_len; } diff --git a/fs/select.c b/fs/select.c index 945896d0ac9e..0ee55af1a55c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ +#include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> @@ -458,9 +459,11 @@ get_max: return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -527,6 +530,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -534,34 +538,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; - busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in; 
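For context on the select() hunks above: a descriptor closed while a task sleeps in do_select() previously set no output bit and re-registered no waiter, so the sleeper could block indefinitely. Folding EPOLLNVAL into POLLIN_SET/POLLOUT_SET/POLLEX_SET and defaulting mask to EPOLLNVAL when fdget() fails makes such descriptors report as ready instead. A minimal, timing-dependent userspace sketch of the scenario (illustrative only, not part of the patch):

#include <pthread.h>
#include <stdio.h>
#include <sys/select.h>
#include <unistd.h>

static int pipefd[2];

/* Close the polled descriptor after the main thread has entered select(). */
static void *closer(void *unused)
{
	sleep(1);
	close(pipefd[0]);
	return NULL;
}

int main(void)
{
	pthread_t t;
	fd_set rfds;

	if (pipe(pipefd))
		return 1;
	pthread_create(&t, NULL, closer, NULL);

	FD_ZERO(&rfds);
	FD_SET(pipefd[0], &rfds);
	/*
	 * With the EPOLLNVAL change, the now-stale descriptor is flagged as
	 * ready and select() returns instead of sleeping indefinitely.
	 */
	printf("select returned %d\n",
	       select(pipefd[0] + 1, &rfds, NULL, NULL, NULL));
	pthread_join(t, NULL);
	return 0;
}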
diff --git a/fs/signalfd.c b/fs/signalfd.c index 040e1cf90528..e20d1484c663 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -35,17 +35,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) { - wait_queue_head_t *wqh = &sighand->signalfd_wqh; - /* - * The lockless check can race with remove_wait_queue() in progress, - * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. - */ - if (likely(!waitqueue_active(wqh))) - return; - - /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ - wake_up_poll(wqh, EPOLLHUP | POLLFREE); + wake_up_pollfree(&sighand->signalfd_wqh); } struct signalfd_ctx { @@ -165,11 +155,12 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, int nonblock) { + enum pid_type type; ssize_t ret; DECLARE_WAITQUEUE(wait, current); spin_lock_irq(&current->sighand->siglock); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); switch (ret) { case 0: if (!nonblock) @@ -184,7 +175,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info add_wait_queue(&current->sighand->signalfd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); if (ret != 0) break; if (signal_pending(current)) { diff --git a/fs/smbfs_common/cifs_arc4.c b/fs/smbfs_common/cifs_arc4.c index 85ba15a60b13..043e4cb839fa 100644 --- a/fs/smbfs_common/cifs_arc4.c +++ b/fs/smbfs_common/cifs_arc4.c @@ -72,16 +72,3 @@ void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int l ctx->y = y; } EXPORT_SYMBOL_GPL(cifs_arc4_crypt); - -static int __init -init_smbfs_common(void) -{ - return 0; -} -static void __init -exit_smbfs_common(void) -{ -} - -module_init(init_smbfs_common) -module_exit(exit_smbfs_common) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index bb44ff4c5cc6..b1b556dbce12 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> +#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user always has to wait on their own I/O. So the effect + * of readahead is very weak for squashfs. squashfs_bdi_init will set + * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and disable readahead for + * squashfs. 
+ */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/super.c b/fs/super.c index 3bfc0f8fbd5b..a6405d44d4ca 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1423,8 +1423,8 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -static int reconfigure_single(struct super_block *s, - int flags, void *data) +int reconfigure_single(struct super_block *s, + int flags, void *data) { struct fs_context *fc; int ret; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 925a621b432e..bafc02bf8220 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,12 +109,12 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * also the directory that is being deleted. */ inode_unlock(inode); - inode_unlock(dentry->d_inode); + inode_unlock(d_inode(dentry)); ret = tracefs_ops.rmdir(name); inode_lock_nested(inode, I_MUTEX_PARENT); - inode_lock(dentry->d_inode); + inode_lock(d_inode(dentry)); kfree(name); @@ -161,6 +161,77 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; }; +static void change_gid(struct dentry *dentry, kgid_t gid) +{ + if (!dentry->d_inode) + return; + dentry->d_inode->i_gid = gid; +} + +/* + * Taken from d_walk, but without the need for handling renames. + * Nothing can be renamed while walking the list, as tracefs + * does not support renames. This is only called when mounting + * or remounting the file system, to set all the files to + * the given gid. + */ +static void set_gid(struct dentry *parent, kgid_t gid) +{ + struct dentry *this_parent; + struct list_head *next; + + this_parent = parent; + spin_lock(&this_parent->d_lock); + + change_gid(this_parent, gid); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + change_gid(dentry, gid); + + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. 
+ */ + rcu_read_lock(); +ascend: + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = child->d_parent; + + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* go into the first sibling still alive */ + do { + next = child->d_child.next; + if (next == &this_parent->d_subdirs) + goto ascend; + child = list_entry(next, struct dentry, d_child); + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); + rcu_read_unlock(); + goto resume; + } + rcu_read_unlock(); + spin_unlock(&this_parent->d_lock); + return; +} + static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -193,6 +264,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; + set_gid(tracefs_mount->mnt_root, gid); break; case Opt_mode: if (match_octal(&args[0], &option)) @@ -212,7 +284,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) static int tracefs_apply_options(struct super_block *sb) { struct tracefs_fs_info *fsi = sb->s_fs_info; - struct inode *inode = sb->s_root->d_inode; + struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; inode->i_mode &= ~S_IALLUGO; @@ -331,18 +403,18 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - inode_lock(parent->d_inode); - if (unlikely(IS_DEADDIR(parent->d_inode))) + inode_lock(d_inode(parent)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && dentry->d_inode) { + if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { - inode_unlock(parent->d_inode); + inode_unlock(d_inode(parent)); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +423,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +431,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -414,8 +486,10 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } @@ -436,12 +510,14 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 5c4b845754a7..314c80b24a76 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -5,7 +5,7 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o -ubifs-y += misc.o +ubifs-y += misc.o sysfs.o ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7c61d0ec0159..dbe72f664abf 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1207,7 +1207,7 @@ out_budg: * @inode1: first inode * @inode2: second inode * @inode3: third inode - * @inode4: fouth inode + * @inode4: fourth inode * * This function is used for 'ubifs_rename()' and @inode1 may be the same as * @inode2 whereas @inode3 and @inode4 may be %NULL. @@ -1233,7 +1233,7 @@ static void lock_4_inodes(struct inode *inode1, struct inode *inode2, * @inode1: first inode * @inode2: second inode * @inode3: third inode - * @inode4: fouth inode + * @inode4: fourth inode */ static void unlock_4_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3, struct inode *inode4) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index dc3e26e9ed7b..3134d070fcc0 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) for (i = 0; ; i++) { int space_before, space_after; + /* lp.lnum may be stale: the loop can continue after a find or break before one */ + lp.lnum = -1; + cond_resched(); /* Give the commit an opportunity to run */ @@ -753,8 +756,19 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); - if (err) + if (err) { ret = err; + /* + * If the LEB cannot be returned it stays + * "taken" forever, so switch ubifs to + * read-only; the subsequent wbuf sync will + * then return -EROFS and enter the "out" + * error branch. + */ + ubifs_ro_mode(c, ret); + } + /* Reset lp.lnum so the LEB is not returned twice on the way out */ + lp.lnum = -1; break; } goto out; @@ -843,7 +857,8 @@ out: ubifs_wbuf_sync_nolock(wbuf); ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); - ubifs_return_leb(c, lp.lnum); + if (lp.lnum != -1) + ubifs_return_leb(c, lp.lnum); return ret; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 00b61dba62b7..789a7813f3fa 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -194,6 +194,24 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) return err; } +static void record_magic_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->magic_errors++; +} + +static void record_node_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->node_errors++; +} + +static void record_crc_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->crc_errors++; +} + /** * ubifs_check_node - check node. 
* @c: UBIFS file-system description object @@ -238,6 +256,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (!quiet) ubifs_err(c, "bad magic %#08x, expected %#08x", magic, UBIFS_NODE_MAGIC); + record_magic_error(c->stats); err = -EUCLEAN; goto out; } @@ -246,6 +265,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { if (!quiet) ubifs_err(c, "bad node type %d", type); + record_node_error(c->stats); goto out; } @@ -270,6 +290,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (!quiet) ubifs_err(c, "bad CRC: calculated %#08x, read %#08x", crc, node_crc); + record_crc_error(c->stats); err = -EUCLEAN; goto out; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 5260d3e531bb..4211e4456b1e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -106,7 +106,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) * property values should be @lp->free == @c->leb_size and * @lp->dirty == 0, but that is not the case. The reason is that * the LEB had been garbage collected before it became the bud, - * and there was not commit inbetween. The garbage collector + * and there was no commit in between. The garbage collector * resets the free and dirty space without recording it * anywhere except lprops, so if there was no commit then * lprops does not have that information. diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f0fb25727d96..aa7a1381c457 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1264,6 +1264,10 @@ static int mount_ubifs(struct ubifs_info *c) if (err) return err; + err = ubifs_sysfs_register(c); + if (err) + goto out_debugging; + err = check_volume_empty(c); if (err) goto out_free; @@ -1367,7 +1371,7 @@ static int mount_ubifs(struct ubifs_info *c) sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1375,7 +1379,6 @@ static int mount_ubifs(struct ubifs_info *c) c->bgt_name, err); goto out_wbufs; } - wake_up_process(c->bgt); } err = ubifs_read_master(c); @@ -1641,6 +1644,8 @@ out_free: vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); + ubifs_sysfs_unregister(c); +out_debugging: ubifs_debugging_exit(c); return err; } @@ -1684,6 +1689,7 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_debugging_exit(c); + ubifs_sysfs_unregister(c); } /** @@ -1780,7 +1786,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1788,7 +1794,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->bgt_name, err); goto out; } - wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); if (!c->orph_buf) { @@ -1853,7 +1858,6 @@ out: kthread_stop(c->bgt); c->bgt = NULL; } - free_wbufs(c); kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); @@ -2436,14 +2440,20 @@ static int __init ubifs_init(void) dbg_debugfs_init(); + err = ubifs_sysfs_init(); + if (err) + goto out_dbg; + err = register_filesystem(&ubifs_fs_type); if (err) { pr_err("UBIFS error (pid %d): 
cannot register file system, error %d", current->pid, err); - goto out_dbg; + goto out_sysfs; } return 0; +out_sysfs: + ubifs_sysfs_exit(); out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); @@ -2462,6 +2472,7 @@ static void __exit ubifs_exit(void) WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); + ubifs_sysfs_exit(); ubifs_compressors_exit(); unregister_shrinker(&ubifs_shrinker_info); diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c new file mode 100644 index 000000000000..7acc5a74e5fa --- /dev/null +++ b/fs/ubifs/sysfs.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This file is part of UBIFS. + * + * Copyright (C) 2021 Cisco Systems + * + * Author: Stefan Schaeckeler + */ + + +#include <linux/fs.h> +#include "ubifs.h" + +enum attr_id_t { + attr_errors_magic, + attr_errors_node, + attr_errors_crc, +}; + +struct ubifs_attr { + struct attribute attr; + enum attr_id_t attr_id; +}; + +#define UBIFS_ATTR(_name, _mode, _id) \ +static struct ubifs_attr ubifs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} + +#define UBIFS_ATTR_FUNC(_name, _mode) UBIFS_ATTR(_name, _mode, _name) + +UBIFS_ATTR_FUNC(errors_magic, 0444); +UBIFS_ATTR_FUNC(errors_crc, 0444); +UBIFS_ATTR_FUNC(errors_node, 0444); + +#define ATTR_LIST(name) (&ubifs_attr_##name.attr) + +static struct attribute *ubifs_attrs[] = { + ATTR_LIST(errors_magic), + ATTR_LIST(errors_node), + ATTR_LIST(errors_crc), + NULL, +}; + +static ssize_t ubifs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ubifs_info *sbi = container_of(kobj, struct ubifs_info, + kobj); + + struct ubifs_attr *a = container_of(attr, struct ubifs_attr, attr); + + switch (a->attr_id) { + case attr_errors_magic: + return sysfs_emit(buf, "%u\n", sbi->stats->magic_errors); + case attr_errors_node: + return sysfs_emit(buf, "%u\n", sbi->stats->node_errors); + case attr_errors_crc: + return sysfs_emit(buf, "%u\n", sbi->stats->crc_errors); + } + return 0; +}; + +static void ubifs_sb_release(struct kobject *kobj) +{ + struct ubifs_info *c = container_of(kobj, struct ubifs_info, kobj); + + complete(&c->kobj_unregister); +} + +static const struct sysfs_ops ubifs_attr_ops = { + .show = ubifs_attr_show, +}; + +static struct kobj_type ubifs_sb_ktype = { + .default_attrs = ubifs_attrs, + .sysfs_ops = &ubifs_attr_ops, + .release = ubifs_sb_release, +}; + +static struct kobj_type ubifs_ktype = { + .sysfs_ops = &ubifs_attr_ops, +}; + +static struct kset ubifs_kset = { + .kobj = {.ktype = &ubifs_ktype}, +}; + +int ubifs_sysfs_register(struct ubifs_info *c) +{ + int ret, n; + char dfs_dir_name[UBIFS_DFS_DIR_LEN+1]; + + c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL); + if (!c->stats) { + ret = -ENOMEM; + goto out_last; + } + n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + + if (n > UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + ret = -EINVAL; + goto out_free; + } + + c->kobj.kset = &ubifs_kset; + init_completion(&c->kobj_unregister); + + ret = kobject_init_and_add(&c->kobj, &ubifs_sb_ktype, NULL, + "%s", dfs_dir_name); + if (ret) + goto out_put; + + return 0; + +out_put: + kobject_put(&c->kobj); + wait_for_completion(&c->kobj_unregister); +out_free: + kfree(c->stats); +out_last: + ubifs_err(c, "cannot create sysfs entry for ubifs%d_%d, error %d\n", + c->vi.ubi_num, c->vi.vol_id, ret); + return ret; +} + +void ubifs_sysfs_unregister(struct ubifs_info *c) +{ + 
kobject_del(&c->kobj); + kobject_put(&c->kobj); + wait_for_completion(&c->kobj_unregister); + + kfree(c->stats); +} + +int __init ubifs_sysfs_init(void) +{ + int ret; + + kobject_set_name(&ubifs_kset.kobj, "ubifs"); + ubifs_kset.kobj.parent = fs_kobj; + ret = kset_register(&ubifs_kset); + + return ret; +} + +void ubifs_sysfs_exit(void) +{ + kset_unregister(&ubifs_kset); +} diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c38066ce9ab0..f55828c0a300 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -27,6 +27,8 @@ #include <linux/security.h> #include <linux/xattr.h> #include <linux/random.h> +#include <linux/sysfs.h> +#include <linux/completion.h> #include <crypto/hash_info.h> #include <crypto/hash.h> #include <crypto/algapi.h> @@ -156,6 +158,13 @@ #endif /* + * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi", + * 1 for "_", 2x2 for two UBI numbers, and 1 for the trailing zero byte). + */ +#define UBIFS_DFS_DIR_NAME "ubi%d_%d" +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) + +/* * Lockdep classes for UBIFS inode @ui_mutex. */ enum { @@ -990,6 +999,18 @@ struct ubifs_budg_info { int dent_budget; }; +/** + * struct ubifs_stats_info - per-FS statistics information. + * @magic_errors: number of bad magic numbers (will be reset with a new mount). + * @node_errors: number of bad nodes (will be reset with a new mount). + * @crc_errors: number of bad CRCs (will be reset with a new mount). + */ +struct ubifs_stats_info { + unsigned int magic_errors; + unsigned int node_errors; + unsigned int crc_errors; +}; + struct ubifs_debug_info; /** @@ -1251,6 +1272,10 @@ struct ubifs_debug_info; * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information + * @stats: statistics exported over sysfs + * + * @kobj: kobject for /sys/fs/ubifs/ + * @kobj_unregister: completion to unregister sysfs kobject */ struct ubifs_info { struct super_block *vfs_sb; @@ -1286,6 +1311,9 @@ struct ubifs_info { spinlock_t cs_lock; wait_queue_head_t cmt_wq; + struct kobject kobj; + struct completion kobj_unregister; + unsigned int big_lpt:1; unsigned int space_fixup:1; unsigned int double_hash:1; @@ -1493,6 +1521,7 @@ struct ubifs_info { struct ubifs_mount_opts mount_opts; struct ubifs_debug_info *dbg; + struct ubifs_stats_info *stats; }; extern struct list_head ubifs_infos; @@ -2072,6 +2101,12 @@ void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, void *out, int *out_len, int compr_type); +/* sysfs.c */ +int ubifs_sysfs_init(void); +void ubifs_sysfs_exit(void); +int ubifs_sysfs_register(struct ubifs_info *c); +void ubifs_sysfs_unregister(struct ubifs_info *c); + #include "debug.h" #include "misc.h" #include "key.h" diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 70abdfad2df1..42e3e551fa4c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/bio.h> +#include <linux/iversion.h> #include "udf_i.h" #include "udf_sb.h" @@ -43,7 +44,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; udf_pblk_t block, iblock; - loff_t nf_pos; + loff_t nf_pos, emit_pos = 0; int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; @@ -57,6 +58,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; struct super_block *sb = dir->i_sb; + bool 
pos_valid = false; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -67,6 +69,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos >= size) goto out; + /* + * Something changed since last readdir (either lseek was called or dir + * changed)? We need to verify the position correctly points at the + * beginning of some dir entry so that the directory parsing code does + * not get confused. Since UDF does not have any reliable way of + * identifying beginning of dir entry (names are under user control), + * we need to scan the directory from the beginning. + */ + if (!inode_eq_iversion(dir, file->f_version)) { + emit_pos = nf_pos; + nf_pos = 0; + } else { + pos_valid = true; + } + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) { ret = -ENOMEM; @@ -122,13 +139,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) while (nf_pos < size) { struct kernel_lb_addr tloc; + loff_t cur_pos = nf_pos; - ctx->pos = (nf_pos >> 2) + 1; + /* Update file position only if we got past the current one */ + if (nf_pos >= emit_pos) { + ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; + } fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); if (!fi) goto out; + /* Still not at offset where user asked us to read from? */ + if (cur_pos < emit_pos) + continue; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; @@ -186,8 +211,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } /* end while */ ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; out: + if (pos_valid) + file->f_version = inode_query_iversion(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 2ecf0e87660e..b5d611cee749 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -77,6 +77,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) GFP_KERNEL); } if (!iinfo->i_data) { + make_bad_inode(inode); iput(inode); return ERR_PTR(-ENOMEM); } @@ -86,6 +87,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) dinfo->i_location.partitionReferenceNum, start, &err); if (err) { + make_bad_inode(inode); iput(inode); return ERR_PTR(err); } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index caeef08efed2..0ed4861b038f 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/crc-itu-t.h> #include <linux/exportfs.h> +#include <linux/iversion.h> static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) @@ -134,6 +135,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, mark_buffer_dirty_inode(fibh->ebh, inode); mark_buffer_dirty_inode(fibh->sbh, inode); } + inode_inc_iversion(inode); + return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 34247fba6df9..f26b5e0b84b6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -57,6 +57,7 @@ #include <linux/crc-itu-t.h> #include <linux/log2.h> #include <asm/byteorder.h> +#include <linux/iversion.h> #include "udf_sb.h" #include "udf_i.h" @@ -149,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); + inode_set_iversion(&ei->vfs_inode, 1); return &ei->vfs_inode; } diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore index 361294571ab0..51cdf3fb4dd4 100644 --- a/fs/unicode/.gitignore +++ b/fs/unicode/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only /mkutf8data 
-/utf8data.h +/utf8data.c diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 2c27b9a5cd6c..610d7bc05d6e 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -8,7 +8,16 @@ config UNICODE Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding support. +config UNICODE_UTF8_DATA + tristate "UTF-8 normalization and casefolding tables" + depends on UNICODE + default UNICODE + help + This contains a large table of case foldings, which can be loaded as + a separate module if you say M here. To be on the safe side stick + to the default of Y. Saying N here makes no sense, if you do not want + utf8 casefolding support, disable CONFIG_UNICODE instead. + config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" - depends on UNICODE - default n + depends on UNICODE_UTF8_DATA diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index b88aecc86550..2f9d9188852b 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -2,14 +2,15 @@ obj-$(CONFIG_UNICODE) += unicode.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o unicode-y := utf8-norm.o utf8-core.o -$(obj)/utf8-norm.o: $(obj)/utf8data.h +$(obj)/utf8-data.o: $(obj)/utf8data.c -# In the normal build, the checked-in utf8data.h is just shipped. +# In the normal build, the checked-in utf8data.c is just shipped. # -# To generate utf8data.h from UCD, put *.txt files in this directory +# To generate utf8data.c from UCD, put *.txt files in this directory # and pass REGENERATE_UTF8DATA=1 from the command line. ifdef REGENERATE_UTF8DATA @@ -24,15 +25,15 @@ quiet_cmd_utf8data = GEN $@ -t $(srctree)/$(src)/NormalizationTest.txt \ -o $@ -$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE +$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE $(call if_changed,utf8data) else -$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE +$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE $(call if_changed,shipped) endif -targets += utf8data.h +targets += utf8data.c hostprogs += mkutf8data diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index ff2025ac5a32..bc1a7c8b5c8d 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -3287,12 +3287,10 @@ static void write_file(void) open_fail(utf8_name, errno); fprintf(file, "/* This file is generated code, do not edit. 
*/\n"); - fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); - fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); - fprintf(file, "#endif\n"); fprintf(file, "\n"); - fprintf(file, "static const unsigned int utf8vers = %#x;\n", - unicode_maxage); + fprintf(file, "#include <linux/module.h>\n"); + fprintf(file, "#include <linux/kernel.h>\n"); + fprintf(file, "#include \"utf8n.h\"\n"); fprintf(file, "\n"); fprintf(file, "static const unsigned int utf8agetab[] = {\n"); for (i = 0; i != ages_count; i++) @@ -3339,6 +3337,22 @@ static void write_file(void) fprintf(file, "\n"); } fprintf(file, "};\n"); + fprintf(file, "\n"); + fprintf(file, "struct utf8data_table utf8_data_table = {\n"); + fprintf(file, "\t.utf8agetab = utf8agetab,\n"); + fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); + fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); + fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8data = utf8data,\n"); + fprintf(file, "};\n"); + fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); + fprintf(file, "\n"); + fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index dc25823bfed9..67aaadc3ab07 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -5,16 +5,13 @@ #include <linux/slab.h> #include <linux/parser.h> #include <linux/errno.h> -#include <linux/unicode.h> #include <linux/stringhash.h> #include "utf8n.h" int utf8_validate(const struct unicode_map *um, const struct qstr *str) { - const struct utf8data *data = utf8nfdi(um->version); - - if (utf8nlen(data, str->name, str->len) < 0) + if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0) return -1; return 0; } @@ -23,14 +20,13 @@ EXPORT_SYMBOL(utf8_validate); int utf8_strncmp(const struct unicode_map *um, const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdi(um->version); struct utf8cursor cur1, cur2; int c1, c2; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0) return -EINVAL; - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0) return -EINVAL; do { @@ -50,14 +46,13 @@ EXPORT_SYMBOL(utf8_strncmp); int utf8_strncasecmp(const struct unicode_map *um, const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur1, cur2; int c1, c2; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0) return -EINVAL; - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0) return -EINVAL; do { @@ -81,12 +76,11 @@ int utf8_strncasecmp_folded(const struct unicode_map *um, const struct qstr *cf, const struct qstr *s1) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur1; int c1, c2; int i = 0; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0) return -EINVAL; do { @@ -105,11 +99,10 @@ EXPORT_SYMBOL(utf8_strncasecmp_folded); int utf8_casefold(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { - const 
struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur; size_t nlen = 0; - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0) return -EINVAL; for (nlen = 0; nlen < dlen; nlen++) { @@ -128,12 +121,11 @@ EXPORT_SYMBOL(utf8_casefold); int utf8_casefold_hash(const struct unicode_map *um, const void *salt, struct qstr *str) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur; int c; unsigned long hash = init_name_hash(salt); - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0) return -EINVAL; while ((c = utf8byte(&cur))) { @@ -149,11 +141,10 @@ EXPORT_SYMBOL(utf8_casefold_hash); int utf8_normalize(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { - const struct utf8data *data = utf8nfdi(um->version); struct utf8cursor cur; ssize_t nlen = 0; - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0) return -EINVAL; for (nlen = 0; nlen < dlen; nlen++) { @@ -167,69 +158,59 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str, } return -EINVAL; } - EXPORT_SYMBOL(utf8_normalize); -static int utf8_parse_version(const char *version, unsigned int *maj, - unsigned int *min, unsigned int *rev) +static const struct utf8data *find_table_version(const struct utf8data *table, + size_t nr_entries, unsigned int version) { - substring_t args[3]; - char version_string[12]; - static const struct match_token token[] = { - {1, "%d.%d.%d"}, - {0, NULL} - }; - - strncpy(version_string, version, sizeof(version_string)); - - if (match_token(version_string, token, args) != 1) - return -EINVAL; - - if (match_int(&args[0], maj) || match_int(&args[1], min) || - match_int(&args[2], rev)) - return -EINVAL; + size_t i = nr_entries - 1; - return 0; + while (version < table[i].maxage) + i--; + if (version > table[i].maxage) + return NULL; + return &table[i]; } -struct unicode_map *utf8_load(const char *version) +struct unicode_map *utf8_load(unsigned int version) { - struct unicode_map *um = NULL; - int unicode_version; - - if (version) { - unsigned int maj, min, rev; - - if (utf8_parse_version(version, &maj, &min, &rev) < 0) - return ERR_PTR(-EINVAL); - - if (!utf8version_is_supported(maj, min, rev)) - return ERR_PTR(-EINVAL); - - unicode_version = UNICODE_AGE(maj, min, rev); - } else { - unicode_version = utf8version_latest(); - printk(KERN_WARNING"UTF-8 version not specified. 
" - "Assuming latest supported version (%d.%d.%d).", - (unicode_version >> 16) & 0xff, - (unicode_version >> 8) & 0xff, - (unicode_version & 0xff)); - } + struct unicode_map *um; um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); if (!um) return ERR_PTR(-ENOMEM); - - um->charset = "UTF-8"; - um->version = unicode_version; - + um->version = version; + + um->tables = symbol_request(utf8_data_table); + if (!um->tables) + goto out_free_um; + + if (!utf8version_is_supported(um, version)) + goto out_symbol_put; + um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata, + um->tables->utf8nfdidata_size, um->version); + if (!um->ntab[UTF8_NFDI]) + goto out_symbol_put; + um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata, + um->tables->utf8nfdicfdata_size, um->version); + if (!um->ntab[UTF8_NFDICF]) + goto out_symbol_put; return um; + +out_symbol_put: + symbol_put(um->tables); +out_free_um: + kfree(um); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL(utf8_load); void utf8_unload(struct unicode_map *um) { - kfree(um); + if (um) { + symbol_put(utf8_data_table); + kfree(um); + } } EXPORT_SYMBOL(utf8_unload); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 1d2d2e5b906a..768f8ab448b8 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -6,34 +6,17 @@ #include "utf8n.h" -struct utf8data { - unsigned int maxage; - unsigned int offset; -}; - -#define __INCLUDED_FROM_UTF8NORM_C__ -#include "utf8data.h" -#undef __INCLUDED_FROM_UTF8NORM_C__ - -int utf8version_is_supported(u8 maj, u8 min, u8 rev) +int utf8version_is_supported(const struct unicode_map *um, unsigned int version) { - int i = ARRAY_SIZE(utf8agetab) - 1; - unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); + int i = um->tables->utf8agetab_size - 1; - while (i >= 0 && utf8agetab[i] != 0) { - if (sb_utf8version == utf8agetab[i]) + while (i >= 0 && um->tables->utf8agetab[i] != 0) { + if (version == um->tables->utf8agetab[i]) return 1; i--; } return 0; } -EXPORT_SYMBOL(utf8version_is_supported); - -int utf8version_latest(void) -{ - return utf8vers; -} -EXPORT_SYMBOL(utf8version_latest); /* * UTF-8 valid ranges. @@ -168,7 +151,7 @@ typedef const unsigned char utf8trie_t; * underlying datatype: unsigned char. * * leaf[0]: The unicode version, stored as a generation number that is - * an index into utf8agetab[]. With this we can filter code + * an index into ->utf8agetab[]. With this we can filter code * points based on the unicode version in which they were * defined. The CCC of a non-defined code point is 0. * leaf[1]: Canonical Combining Class. During normalization, we need @@ -316,21 +299,19 @@ utf8hangul(const char *str, unsigned char *hangul) * is well-formed and corresponds to a known unicode code point. The * shorthand for this will be "is valid UTF-8 unicode". */ -static utf8leaf_t *utf8nlookup(const struct utf8data *data, - unsigned char *hangul, const char *s, size_t len) +static utf8leaf_t *utf8nlookup(const struct unicode_map *um, + enum utf8_normalization n, unsigned char *hangul, const char *s, + size_t len) { - utf8trie_t *trie = NULL; + utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; int offlen; int offset; int mask; int node; - if (!data) - return NULL; if (len == 0) return NULL; - trie = utf8data + data->offset; node = 1; while (node) { offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; @@ -392,172 +373,29 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, * * Forwards to utf8nlookup(). 
*/ -static utf8leaf_t *utf8lookup(const struct utf8data *data, - unsigned char *hangul, const char *s) +static utf8leaf_t *utf8lookup(const struct unicode_map *um, + enum utf8_normalization n, unsigned char *hangul, const char *s) { - return utf8nlookup(data, hangul, s, (size_t)-1); -} - -/* - * Maximum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if only non-assigned code points are used. - */ -int utf8agemax(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age > age) - age = leaf_age; - s += utf8clen(s); - } - return age; + return utf8nlookup(um, n, hangul, s, (size_t)-1); } -EXPORT_SYMBOL(utf8agemax); - -/* - * Minimum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if non-assigned code points are used. - */ -int utf8agemin(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - int age; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - age = data->maxage; - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age < age) - age = leaf_age; - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8agemin); - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemax(const struct utf8data *data, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - - while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age > age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8nagemax); - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemin(const struct utf8data *data, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int leaf_age; - int age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - age = data->maxage; - while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age < age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8nagemin); - -/* - * Length of the normalization of s. - * Return -1 if s is not valid UTF-8 unicode. - * - * A string of Default_Ignorable_Code_Point has length 0. - */ -ssize_t utf8len(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - size_t ret = 0; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) - ret += utf8clen(s); - else if (LEAF_CCC(leaf) == DECOMPOSE) - ret += strlen(LEAF_STR(leaf)); - else - ret += utf8clen(s); - s += utf8clen(s); - } - return ret; -} -EXPORT_SYMBOL(utf8len); /* * Length of the normalization of s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. 
*/ -ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) +ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, + const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; unsigned char hangul[UTF8HANGULLEAF]; - if (!data) - return -1; while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); + leaf = utf8nlookup(um, n, hangul, s, len); if (!leaf) return -1; - if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) + if (um->tables->utf8agetab[LEAF_GEN(leaf)] > + um->ntab[n]->maxage) ret += utf8clen(s); else if (LEAF_CCC(leaf) == DECOMPOSE) ret += strlen(LEAF_STR(leaf)); @@ -568,7 +406,6 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) } return ret; } -EXPORT_SYMBOL(utf8nlen); /* * Set up an utf8cursor for use by utf8byte(). @@ -580,14 +417,13 @@ EXPORT_SYMBOL(utf8nlen); * * Returns -1 on error, 0 on success. */ -int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s, size_t len) +int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s, size_t len) { - if (!data) - return -1; if (!s) return -1; - u8c->data = data; + u8c->um = um; + u8c->n = n; u8c->s = s; u8c->p = NULL; u8c->ss = NULL; @@ -604,23 +440,6 @@ int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, return -1; return 0; } -EXPORT_SYMBOL(utf8ncursor); - -/* - * Set up an utf8cursor for use by utf8byte(). - * - * u8c : pointer to cursor. - * data : const struct utf8data to use for normalization. - * s : NUL-terminated string. - * - * Returns -1 on error, 0 on success. - */ -int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s) -{ - return utf8ncursor(u8c, data, s, (unsigned int)-1); -} -EXPORT_SYMBOL(utf8cursor); /* * Get one byte from the normalized form of the string described by u8c. @@ -678,9 +497,9 @@ int utf8byte(struct utf8cursor *u8c) /* Look up the data for the current character. */ if (u8c->p) { - leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); } else { - leaf = utf8nlookup(u8c->data, u8c->hangul, + leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul, u8c->s, u8c->len); } @@ -690,7 +509,8 @@ int utf8byte(struct utf8cursor *u8c) ccc = LEAF_CCC(leaf); /* Characters that are too new have CCC 0. 
*/ - if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { + if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > + u8c->um->ntab[u8c->n]->maxage) { ccc = STOPPER; } else if (ccc == DECOMPOSE) { u8c->len -= utf8clen(u8c->s); @@ -704,7 +524,7 @@ int utf8byte(struct utf8cursor *u8c) goto ccc_mismatch; } - leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); if (!leaf) return -1; ccc = LEAF_CCC(leaf); @@ -765,28 +585,10 @@ ccc_mismatch: } } } -EXPORT_SYMBOL(utf8byte); - -const struct utf8data *utf8nfdi(unsigned int maxage) -{ - int i = ARRAY_SIZE(utf8nfdidata) - 1; - - while (maxage < utf8nfdidata[i].maxage) - i--; - if (maxage > utf8nfdidata[i].maxage) - return NULL; - return &utf8nfdidata[i]; -} -EXPORT_SYMBOL(utf8nfdi); - -const struct utf8data *utf8nfdicf(unsigned int maxage) -{ - int i = ARRAY_SIZE(utf8nfdicfdata) - 1; - while (maxage < utf8nfdicfdata[i].maxage) - i--; - if (maxage > utf8nfdicfdata[i].maxage) - return NULL; - return &utf8nfdicfdata[i]; -} -EXPORT_SYMBOL(utf8nfdicf); +#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +EXPORT_SYMBOL_GPL(utf8version_is_supported); +EXPORT_SYMBOL_GPL(utf8nlen); +EXPORT_SYMBOL_GPL(utf8ncursor); +EXPORT_SYMBOL_GPL(utf8byte); +#endif diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 6fe8af7edccb..eb2bbdd688d7 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -18,9 +18,7 @@ unsigned int failed_tests; unsigned int total_tests; /* Tests will be based on this version. */ -#define latest_maj 12 -#define latest_min 1 -#define latest_rev 0 +#define UTF8_LATEST UNICODE_AGE(12, 1, 0) #define _test(cond, func, line, fmt, ...) do { \ total_tests++; \ @@ -160,18 +158,22 @@ static const struct { } }; -static void check_utf8_nfdi(void) +static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, + const char *s) +{ + return utf8nlen(um, n, s, (size_t)-1); +} + +static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s) +{ + return utf8ncursor(u8c, um, n, s, (unsigned int)-1); +} + +static void check_utf8_nfdi(struct unicode_map *um) { int i; struct utf8cursor u8c; - const struct utf8data *data; - - data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); - if (!data) { - pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); @@ -179,10 +181,11 @@ static void check_utf8_nfdi(void) int j = 0; unsigned char c; - test((utf8len(data, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen)); + test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); + test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == + nlen)); - if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0) + if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) pr_err("can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { @@ -196,18 +199,10 @@ static void check_utf8_nfdi(void) } } -static void check_utf8_nfdicf(void) +static void check_utf8_nfdicf(struct unicode_map *um) { int i; struct utf8cursor u8c; - const struct utf8data *data; - - data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); - if (!data) { - pr_err("%s: Unable to load utf8-%d.%d.%d. 
Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); @@ -215,10 +210,13 @@ static void check_utf8_nfdicf(void) int j = 0; unsigned char c; - test((utf8len(data, nfdicf_test_data[i].str) == nlen)); - test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen)); + test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == + nlen)); + test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == + nlen)); - if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0) + if (utf8cursor(&u8c, um, UTF8_NFDICF, + nfdicf_test_data[i].str) < 0) pr_err("can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { @@ -232,16 +230,9 @@ static void check_utf8_nfdicf(void) } } -static void check_utf8_comparisons(void) +static void check_utf8_comparisons(struct unicode_map *table) { int i; - struct unicode_map *table = utf8_load("12.1.0"); - - if (IS_ERR(table)) { - pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -262,42 +253,49 @@ static void check_utf8_comparisons(void) test_f(!utf8_strncasecmp(table, &s1, &s2), "%s %s comparison mismatch\n", s1.name, s2.name); } - - utf8_unload(table); } -static void check_supported_versions(void) +static void check_supported_versions(struct unicode_map *um) { /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(7, 0, 0)); + test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(9, 0, 0)); + test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); + test(utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. */ - test(!utf8version_is_supported(13, 0, 0)); - test(!utf8version_is_supported(0, 0, 0)); - test(!utf8version_is_supported(-1, -1, -1)); + test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } static int __init init_test_ucd(void) { + struct unicode_map *um; + failed_tests = 0; total_tests = 0; - check_supported_versions(); - check_utf8_nfdi(); - check_utf8_nfdicf(); - check_utf8_comparisons(); + um = utf8_load(UTF8_LATEST); + if (IS_ERR(um)) { + pr_err("%s: Unable to load utf8 table.\n", __func__); + return PTR_ERR(um); + } + + check_supported_versions(um); + check_utf8_nfdi(um); + check_utf8_nfdicf(um); + check_utf8_comparisons(um); if (!failed_tests) pr_info("All %u tests passed\n", total_tests); else pr_err("%u out of %u tests failed\n", failed_tests, total_tests); + utf8_unload(um); return 0; } diff --git a/fs/unicode/utf8data.h_shipped b/fs/unicode/utf8data.c_shipped index 76e4f0e1b089..d9b62901aa96 100644 --- a/fs/unicode/utf8data.h_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -1,9 +1,8 @@ /* This file is generated code, do not edit. */ -#ifndef __INCLUDED_FROM_UTF8NORM_C__ -#error Only nls_utf8-norm.c should include this file. 
-#endif -static const unsigned int utf8vers = 0xc0100; +#include <linux/module.h> +#include <linux/kernel.h> +#include "utf8n.h" static const unsigned int utf8agetab[] = { 0, @@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = { 0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00, 0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00 }; + +struct utf8data_table utf8_data_table = { + .utf8agetab = utf8agetab, + .utf8agetab_size = ARRAY_SIZE(utf8agetab), + + .utf8nfdicfdata = utf8nfdicfdata, + .utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata), + + .utf8nfdidata = utf8nfdidata, + .utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata), + + .utf8data = utf8data, +}; +EXPORT_SYMBOL_GPL(utf8_data_table); +MODULE_LICENSE("GPL v2"); diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h index 0acd530c2c79..bd00d587747a 100644 --- a/fs/unicode/utf8n.h +++ b/fs/unicode/utf8n.h @@ -11,53 +11,9 @@ #include <linux/export.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/unicode.h> -/* Encoding a unicode version number as a single unsigned int. */ -#define UNICODE_MAJ_SHIFT (16) -#define UNICODE_MIN_SHIFT (8) - -#define UNICODE_AGE(MAJ, MIN, REV) \ - (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ - ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ - ((unsigned int)(REV))) - -/* Highest unicode version supported by the data tables. */ -extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); -extern int utf8version_latest(void); - -/* - * Look for the correct const struct utf8data for a unicode version. - * Returns NULL if the version requested is too new. - * - * Two normalization forms are supported: nfdi and nfdicf. - * - * nfdi: - * - Apply unicode normalization form NFD. - * - Remove any Default_Ignorable_Code_Point. - * - * nfdicf: - * - Apply unicode normalization form NFD. - * - Remove any Default_Ignorable_Code_Point. - * - Apply a full casefold (C + F). - */ -extern const struct utf8data *utf8nfdi(unsigned int maxage); -extern const struct utf8data *utf8nfdicf(unsigned int maxage); - -/* - * Determine the maximum age of any unicode character in the string. - * Returns 0 if only unassigned code points are present. - * Returns -1 if the input is not valid UTF-8. - */ -extern int utf8agemax(const struct utf8data *data, const char *s); -extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len); - -/* - * Determine the minimum age of any unicode character in the string. - * Returns 0 if any unassigned code points are present. - * Returns -1 if the input is not valid UTF-8. - */ -extern int utf8agemin(const struct utf8data *data, const char *s); -extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); +int utf8version_is_supported(const struct unicode_map *um, unsigned int version); /* * Determine the length of the normalized form of the string, @@ -65,8 +21,8 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); * Returns 0 if only ignorable code points are present. * Returns -1 if the input is not valid UTF-8. */ -extern ssize_t utf8len(const struct utf8data *data, const char *s); -extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); +ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, + const char *s, size_t len); /* Needed in struct utf8cursor below. 
*/ #define UTF8HANGULLEAF (12) @@ -75,7 +31,8 @@ extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); * Cursor structure used by the normalizer. */ struct utf8cursor { - const struct utf8data *data; + const struct unicode_map *um; + enum utf8_normalization n; const char *s; const char *p; const char *ss; @@ -92,10 +49,8 @@ struct utf8cursor { * Returns 0 on success. * Returns -1 on failure. */ -extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s); -extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s, size_t len); +int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s, size_t len); /* * Get the next byte in the normalization. @@ -105,4 +60,24 @@ extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, */ extern int utf8byte(struct utf8cursor *u8c); +struct utf8data { + unsigned int maxage; + unsigned int offset; +}; + +struct utf8data_table { + const unsigned int *utf8agetab; + int utf8agetab_size; + + const struct utf8data *utf8nfdicfdata; + int utf8nfdicfdata_size; + + const struct utf8data *utf8nfdidata; + int utf8nfdidata_size; + + const unsigned char *utf8data; +}; + +extern struct utf8data_table utf8_data_table; + #endif /* UTF8NORM_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/poll.h> #include <linux/slab.h> @@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 6f49bf39183c..c557a030acfe 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ #include "xfs.h" -#include <linux/backing-dev.h> #include "xfs_message.h" #include "xfs_trace.h" @@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(lflags); } while (1); } diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fbc9d816882c..23523b802539 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1077,21 +1077,18 @@ xfs_attr_node_hasname( state = xfs_da_state_alloc(args); if (statep != NULL) - *statep = NULL; + *statep = state; /* * Search to see if name exists, and get back a pointer to it. 
*/ error = xfs_da3_node_lookup_int(state, &retval); - if (error) { - xfs_da_state_free(state); - return error; - } + if (error) + retval = error; - if (statep != NULL) - *statep = state; - else + if (!statep) xfs_da_state_free(state); + return retval; } @@ -1112,7 +1109,7 @@ xfs_attr_node_addname_find_attr( */ retval = xfs_attr_node_hasname(args, &dac->da_state); if (retval != -ENOATTR && retval != -EEXIST) - return retval; + goto error; if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto error; @@ -1337,7 +1334,7 @@ int xfs_attr_node_removename_setup( error = xfs_attr_node_hasname(args, state); if (error != -EEXIST) - return error; + goto out; error = 0; ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4dccd4d90622..74198dd82b03 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4598,7 +4598,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index bed798792226..90aebfe9dc5f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -281,7 +281,7 @@ xchk_superblock( features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); if ((sb->sb_features2 & features_mask) != (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) - xchk_block_set_corrupt(sc, bp); + xchk_block_set_preen(sc, bp); if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ @@ -290,39 +290,38 @@ xchk_superblock( offsetof(struct xfs_dsb, sb_features_compat))) xchk_block_set_corrupt(sc, bp); } else { - /* Check compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); - if ((sb->sb_features_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + /* compat features must match */ + if (sb->sb_features_compat != + cpu_to_be32(mp->m_sb.sb_features_compat)) xchk_block_set_corrupt(sc, bp); - /* Check ro compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | - XFS_SB_FEAT_RO_COMPAT_FINOBT | - XFS_SB_FEAT_RO_COMPAT_RMAPBT | - XFS_SB_FEAT_RO_COMPAT_REFLINK); - if ((sb->sb_features_ro_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & - features_mask)) + /* ro compat features must match */ + if (sb->sb_features_ro_compat != + cpu_to_be32(mp->m_sb.sb_features_ro_compat)) xchk_block_set_corrupt(sc, bp); - /* Check incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | - XFS_SB_FEAT_INCOMPAT_FTYPE | - XFS_SB_FEAT_INCOMPAT_SPINODES | - XFS_SB_FEAT_INCOMPAT_META_UUID); - if ((sb->sb_features_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_incompat) & - features_mask)) - xchk_block_set_corrupt(sc, bp); + /* + * NEEDSREPAIR is ignored on a secondary super, so we should + * clear it when we find it, though it's not a corruption. 
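The rewritten incompat checks classify differing superblock bits with one xor and two masks: a difference inside features_mask is merely preen-worthy, a difference outside it is corruption. A worked example with hypothetical bit values:

/*
 * in-core:  sb_features_incompat = 0b0110
 * ondisk:   sb_features_incompat = 0b0111
 * mask:     NEEDSREPAIR          = 0b0001
 *
 * diff = in-core ^ ondisk = 0b0001
 * diff & mask  = 0b0001  -> nonzero: only NEEDSREPAIR differs -> preen
 * diff & ~mask = 0b0000  -> zero: all other incompat bits match -> ok
 */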
+ */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & features_mask) + xchk_block_set_preen(sc, bp); - /* Check log incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); - if ((sb->sb_features_log_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & - features_mask)) + /* all other incompat features must match */ + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & ~features_mask) xchk_block_set_corrupt(sc, bp); + /* + * log incompat features protect newer log record types from + * older log recovery code. Log recovery doesn't check the + * secondary supers, so we can clear these if needed. + */ + if (sb->sb_features_log_incompat) + xchk_block_set_preen(sc, bp); + /* Don't care about sb_crc */ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d7bfed52f4cd..6da7f2ca77de 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -52,6 +52,18 @@ xrep_superblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + /* + * Don't write out a secondary super with NEEDSREPAIR or log incompat + * features set, since both are ignored when set on a secondary. + */ + if (xfs_has_crc(mp)) { + struct xfs_dsb *sb = bp->b_addr; + + sb->sb_features_incompat &= + ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sb->sb_features_log_incompat = 0; + } + /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 200a63f58fe7..38897adde7b5 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -497,6 +497,7 @@ STATIC int xchk_directory_leaf1_bestfree( struct xfs_scrub *sc, struct xfs_da_args *args, + xfs_dir2_db_t last_data_db, xfs_dablk_t lblk) { struct xfs_dir3_icleaf_hdr leafhdr; @@ -534,10 +535,14 @@ xchk_directory_leaf1_bestfree( } /* - * There should be as many bestfree slots as there are dir data - * blocks that can fit under i_size. + * There must be enough bestfree slots to cover all the directory data + * blocks that we scanned. It is possible for there to be a hole + * between the last data block and i_disk_size. This seems like an + * oversight to the scrub author, but as we have been writing out + * directories like this (and xfs_repair doesn't mind them) for years, + * that's what we have to check. 
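The directory check below ties the leaf's bestfree count to the highest data block the scan actually visited instead of to i_disk_size. A worked example under hypothetical geometry:

/*
 * If the data-fork scan visited directory data blocks 0..3, then
 * last_data_db == 3 and the leaf must carry last_data_db + 1 == 4
 * bestfree slots.  A hole between block 3 and i_disk_size no longer
 * trips the check, unlike the old
 * xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size) comparison being
 * removed just below.
 */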
*/ - if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) { + if (bestcount != last_data_db + 1) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } @@ -669,6 +674,7 @@ xchk_directory_blocks( xfs_fileoff_t lblk; struct xfs_iext_cursor icur; xfs_dablk_t dabno; + xfs_dir2_db_t last_data_db = 0; bool found; int is_block = 0; int error; @@ -712,6 +718,7 @@ xchk_directory_blocks( args.geo->fsbcount); lblk < got.br_startoff + got.br_blockcount; lblk += args.geo->fsbcount) { + last_data_db = xfs_dir2_da_to_db(args.geo, lblk); error = xchk_directory_data_bestfree(sc, lblk, is_block); if (error) @@ -734,7 +741,7 @@ xchk_directory_blocks( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } - error = xchk_directory_leaf1_bestfree(sc, &args, + error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db, leaf_lblk); if (error) goto out; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 2405b09d03d0..eac15af7b08c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -233,6 +233,7 @@ xchk_dinode( unsigned long long isize; uint64_t flags2; uint32_t nextents; + prid_t prid; uint16_t flags; uint16_t mode; @@ -267,6 +268,7 @@ xchk_dinode( * so just mark this inode for preening. */ xchk_ino_set_preen(sc, ino); + prid = 0; break; case 2: case 3: @@ -279,12 +281,17 @@ xchk_dinode( if (dip->di_projid_hi != 0 && !xfs_has_projid32(mp)) xchk_ino_set_corrupt(sc, ino); + + prid = be16_to_cpu(dip->di_projid_lo); break; default: xchk_ino_set_corrupt(sc, ino); return; } + if (xfs_has_projid32(mp)) + prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; + /* * di_uid/di_gid -- -1 isn't invalid, but there's no way that * userspace could have created that. @@ -293,6 +300,13 @@ xchk_dinode( dip->di_gid == cpu_to_be32(-1U)) xchk_ino_set_warning(sc, ino); + /* + * project id of -1 isn't supposed to be valid, but the kernel didn't + * always validate that. 
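The project id is assembled from its two 16-bit on-disk halves just above; a worked example with hypothetical values:

/*
 * With projid32 enabled, di_projid_hi = 0x0001 and di_projid_lo = 0x0002
 * combine as
 *	prid = 0x0002 | ((prid_t)0x0001 << 16) = 0x00010002
 * so the -1U warning below fires only when both halves are 0xffff,
 * i.e. prid == 0xffffffff.
 */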
+ */ + if (prid == -1U) + xchk_ino_set_warning(sc, ino); + /* di_format */ switch (dip->di_format) { case XFS_DINODE_FMT_DEV: diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index d6c1b00a4fc8..3c7506c7553c 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -48,10 +48,10 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->flags |= XCHK_HAS_QUOTAOFFLOCK; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); + if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 8f3cba14ada3..1e7b6b209ee8 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -25,6 +25,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -912,11 +913,13 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); sc->mp->m_qflags &= ~flag; spin_lock(&sc->mp->m_sb_lock); sc->mp->m_sb.sb_qflags &= ~flag; spin_unlock(&sc->mp->m_sb_lock); xfs_log_sb(sc->tp); + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); } /* diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8d528d35b725..b11870d07c56 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -173,10 +173,6 @@ xchk_teardown( mnt_drop_write_file(sc->file); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); - if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; - } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 80e5026bba44..3de5287e98d8 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,7 +88,6 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ #define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c8c15c3c3147..2705f91bdd0d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -359,7 +359,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -437,37 +437,37 @@ xfs_prepare_ioend( * see an ENOSPC in writeback). 
*/ static void -xfs_discard_page( - struct page *page, - loff_t fileoff) +xfs_discard_folio( + struct folio *folio, + loff_t pos) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - unsigned int pageoff = offset_in_page(fileoff); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); + size_t offset = offset_in_folio(folio, pos); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) goto out_invalidate; xfs_alert_ratelimited(mp, - "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, fileoff); + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page) - pageoff_fsb); + i_blocks_per_folio(inode, folio) - pageoff_fsb); if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); + iomap_invalidate_folio(folio, offset, folio_size(folio) - offset); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, .prepare_ioend = xfs_prepare_ioend, - .discard_page = xfs_discard_page, + .discard_folio = xfs_discard_folio, }; STATIC int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73a36b7be3bd..797ea0c8b14e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1001,7 +1001,7 @@ xfs_free_file_space( /* * Now that we've unmapped all full blocks we'll have to zero out any - * partial block at the beginning and/or end. iomap_zero_range is smart + * partial block at the beginning and/or end. xfs_zero_range is smart * enough to skip any holes, including those we just created, but we * must take care not to zero beyond EOF and enlarge i_size. */ @@ -1009,15 +1009,14 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, - &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, offset, len, NULL); if (error) return error; /* * If we zeroed right up to EOF and EOF straddles a page boundary we * must make sure that the post-EOF area is also zeroed because the - * page could be mmap'd and iomap_zero_range doesn't do that for us. + * page could be mmap'd and xfs_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. 
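The folio conversion in the xfs_aops.c hunk above keeps the delalloc punch arithmetic block-based; a worked example under hypothetical geometry:

/*
 * Assume 4k fsblocks and a 16k folio, so i_blocks_per_folio() == 4.
 * A writeback failure at pos 8192 bytes into the folio gives
 *	offset      = offset_in_folio(folio, pos)  = 8192
 *	pageoff_fsb = XFS_B_TO_FSBT(mp, offset)    = 2
 * and xfs_bmap_punch_delalloc_range() drops 4 - 2 = 2 fsblocks, the
 * reservation backing the rest of the folio.
 */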
*/ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 631c5a61d89b..b45e0d50a405 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -394,7 +394,7 @@ xfs_buf_alloc_pages( } XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } @@ -1892,6 +1892,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev); kmem_free(btp); } @@ -1932,11 +1933,10 @@ xfs_setsize_buftarg_early( return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } -xfs_buftarg_t * +struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - struct dax_device *dax_dev) + struct block_device *bdev) { xfs_buftarg_t *btp; @@ -1945,7 +1945,7 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = dax_dev; + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 6b0200b8007d..edcb6254fa6a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -89,6 +89,7 @@ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; @@ -338,8 +339,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) /* * Handling of buftargs. */ -extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, struct dax_device *); +struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, + struct block_device *bdev); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 70ca5751b13e..e484251dc9c8 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -816,7 +816,7 @@ xlog_recover_get_buf_lsn( } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8310005af00f..a7174a5b3203 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -138,7 +138,8 @@ xfs_dir2_sf_getdents( STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, - struct dir_context *ctx) + struct dir_context *ctx, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ @@ -146,7 +147,6 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; - int lock_mode; unsigned int offset, next_offset; unsigned int end; @@ -156,12 +156,13 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(args->trans, dp, &bp); - xfs_iunlock(dp, lock_mode); if (error) return error; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. 
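The getdents rework starting above hands ILOCK ownership to the callee through the new lock_mode pointer: once a directory buffer has been read, the callee drops the lock and zeroes the caller's copy so the caller's cleanup path does not unlock twice. A condensed sketch of the contract (the leaf variant in the next hunk follows the same pattern):

/*
 * caller (xfs_readdir):
 *	lock_mode = xfs_ilock_data_map_shared(dp);
 *	error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
 * out_unlock:
 *	if (lock_mode)			// callee never dropped it
 *		xfs_iunlock(dp, lock_mode);
 *
 * callee, once the buffer is read:
 *	xfs_iunlock(dp, *lock_mode);
 *	*lock_mode = 0;			// tell the caller it is gone
 */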
@@ -344,7 +345,8 @@ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, - size_t bufsize) + size_t bufsize, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; @@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents( xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ - int lock_mode; unsigned int offset = 0; int error = 0; /* error return value */ @@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents( bp = NULL; } - lock_mode = xfs_ilock_data_map_shared(dp); + if (*lock_mode == 0) + *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); - xfs_iunlock(dp, lock_mode); if (error || !bp) break; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. @@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents( * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the - * transaction. The caller should ensure that the inode is locked + * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. */ int @@ -507,8 +511,9 @@ xfs_readdir( size_t bufsize) { struct xfs_da_args args = { NULL }; - int rval; - int v; + unsigned int lock_mode; + int isblock; + int error; trace_xfs_readdir(dp); @@ -516,6 +521,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; @@ -523,13 +529,22 @@ xfs_readdir( args.trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(&args, ctx); - else if ((rval = xfs_dir2_isblock(&args, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(&args, ctx); - else - rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + return xfs_dir2_sf_getdents(&args, ctx); - return rval; + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir2_isblock(&args, &isblock); + if (error) + goto out_unlock; + + if (isblock) { + error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); + goto out_unlock; + } + + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + +out_unlock: + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e48ae227bb11..5afedcbc78c7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -289,13 +289,12 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) */ STATIC int xfs_dquot_disk_alloc( - struct xfs_trans **tpp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - struct xfs_trans *tp = *tpp; - struct xfs_mount *mp = tp->t_mountp; + struct xfs_trans *tp; + struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); @@ -304,29 +303,35 @@ xfs_dquot_disk_alloc( trace_xfs_dqalloc(dqp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + return error; + xfs_ilock(quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, 0); + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return -ESRCH; + error = -ESRCH; + goto err_cancel; } - xfs_trans_ijoin(tp, quotip, 
XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) - return error; + goto err_cancel; /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); if (error) - return error; + goto err_cancel; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -341,7 +346,7 @@ xfs_dquot_disk_alloc( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) - return error; + goto err_cancel; bp->b_ops = &xfs_dquot_buf_ops; /* @@ -371,16 +376,25 @@ xfs_dquot_disk_alloc( * is responsible for unlocking any buffer passed back, either * manually or by committing the transaction. On error, the buffer is * released and not passed back. + * + * Keep the quota inode ILOCKed until after the transaction commit to + * maintain the atomicity of bmap/rmap updates. */ xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp); + error = xfs_trans_commit(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); if (error) { - xfs_trans_bhold_release(*tpp, bp); - xfs_trans_brelse(*tpp, bp); + xfs_buf_relse(bp); return error; } + *bpp = bp; return 0; + +err_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return error; } /* @@ -629,43 +643,6 @@ xfs_dquot_to_disk( ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } -/* Allocate and initialize the dquot buffer for this in-core dquot. */ -static int -xfs_qm_dqread_alloc( - struct xfs_mount *mp, - struct xfs_dquot *dqp, - struct xfs_buf **bpp) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto err; - - error = xfs_dquot_disk_alloc(&tp, dqp, bpp); - if (error) - goto err_cancel; - - error = xfs_trans_commit(tp); - if (error) { - /* - * Buffer was held to the transaction, so we have to unlock it - * manually here because we're not passing it back. - */ - xfs_buf_relse(*bpp); - *bpp = NULL; - goto err; - } - return 0; - -err_cancel: - xfs_trans_cancel(tp); -err: - return error; -} - /* * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. If @can_alloc is true, fill any @@ -689,7 +666,7 @@ xfs_qm_dqread( /* Try to read the buffer, allocating if necessary. 
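With xfs_qm_dqread_alloc() folded away below, xfs_dquot_disk_alloc() owns the whole transaction lifecycle. A condensed ordering sketch of the hunk above, showing why the quota inode's ILOCK now outlives the commit:

/*
 *	xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, ...);
 *	xfs_ilock(quotip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, quotip, 0);		// lock is not handed over
 *	...map the cluster, grab the buffer...
 *	xfs_trans_bhold(tp, bp);		// bp survives the commit
 *	error = xfs_trans_commit(tp);
 *	xfs_iunlock(quotip, XFS_ILOCK_EXCL);	// only after commit, so the
 *						// bmap/rmap updates stay atomic
 * err_cancel:
 *	xfs_trans_cancel(tp);
 *	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 */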
*/ error = xfs_dquot_disk_read(mp, dqp, &bp); if (error == -ENOENT && can_alloc) - error = xfs_qm_dqread_alloc(mp, dqp, &bp); + error = xfs_dquot_disk_alloc(dqp, &bp); if (error) goto err; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 81c445e9489b..749fd18c4f32 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -213,11 +213,12 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), NULL, }; +ATTRIBUTE_GROUPS(xfs_errortag); static struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, - .default_attrs = xfs_errortag_attrs, + .default_groups = xfs_errortag_groups, }; int diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27594738b0d1..8d4c5ca261bd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -437,8 +437,7 @@ restart: } trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, - NULL, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; } else diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e1472004170e..2e718728986f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -87,6 +87,7 @@ xfs_inode_alloc( /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; VFS_I(ip)->i_state = 0; + mapping_set_large_folios(VFS_I(ip)->i_mapping); XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -289,22 +290,6 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } -static inline void -xfs_inew_wait( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); - - do { - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (!xfs_iflags_test(ip, XFS_INEW)) - break; - schedule(); - } while (true); - finish_wait(wq, &wait.wq_entry); -} - /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -336,6 +321,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; + mapping_set_large_folios(inode->i_mapping); return error; } @@ -368,18 +354,13 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); if (error) { - bool wake; - /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - if (wake) - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); @@ -770,7 +751,8 @@ again: /* * If we have a real type for an on-disk inode, we can setup the inode - * now. If it's a new inode being created, xfs_ialloc will handle it. + * now. If it's a new inode being created, xfs_init_new_inode will + * handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 64b9bf334806..04bf467b1090 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -988,8 +988,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. 
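Stepping back to the xfs_error.c hunk above: the default_attrs conversion leans on the sysfs ATTRIBUTE_GROUPS() helper, which (roughly, per linux/sysfs.h) expands as sketched here:

/*
 * ATTRIBUTE_GROUPS(xfs_errortag) becomes approximately:
 *
 *	static const struct attribute_group xfs_errortag_group = {
 *		.attrs = xfs_errortag_attrs,
 *	};
 *	static const struct attribute_group *xfs_errortag_groups[] = {
 *		&xfs_errortag_group,
 *		NULL,
 *	};
 *
 * supplying the xfs_errortag_groups pointer used in .default_groups.
 */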
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1142,8 +1142,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -3122,7 +3122,6 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e635a3d64cba..c447bf04205a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -231,8 +231,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define __XFS_INEW_BIT 3 /* inode has just been allocated */ -#define XFS_INEW (1 << __XFS_INEW_BIT) +#define XFS_INEW (1 << 3) /* inode has just been allocated */ #define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ @@ -492,7 +491,6 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 174cd8950cb6..8ea47a9d5aad 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -372,7 +372,7 @@ int xfs_ioc_attr_list( struct xfs_inode *dp, void __user *ubuf, - int bufsize, + size_t bufsize, int flags, struct xfs_attrlist_cursor __user *ucursor) { @@ -687,7 +687,8 @@ xfs_ioc_space( if (bf->l_start > XFS_ISIZE(ip)) { error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); + bf->l_start - XFS_ISIZE(ip), + XFS_BMAPI_PREALLOC); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 28453a6d4461..845d3bcab74b 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -38,8 +38,9 @@ xfs_readlink_by_handle( int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, uint32_t opcode, void __user *uname, void __user *value, uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, - int flags, struct xfs_attrlist_cursor __user *ucursor); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 093758440ad5..e552ce541ec2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -28,7 +28,6 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" - #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -54,7 +53,8 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, 
struct iomap *iomap, struct xfs_bmbt_irec *imap, - u16 flags) + unsigned int mapping_flags, + u16 iomap_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -71,16 +71,22 @@ xfs_bmbt_to_iomap( iomap->type = IOMAP_DELALLOC; } else { iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); + if (mapping_flags & IOMAP_DAX) + iomap->addr += target->bt_dax_part_off; + if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else iomap->type = IOMAP_MAPPED; + } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = target->bt_bdev; - iomap->dax_dev = target->bt_daxdev; - iomap->flags = flags; + if (mapping_flags & IOMAP_DAX) + iomap->dax_dev = target->bt_daxdev; + else + iomap->bdev = target->bt_bdev; + iomap->flags = iomap_flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) @@ -188,6 +194,7 @@ xfs_iomap_write_direct( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, + unsigned int flags, struct xfs_bmbt_irec *imap) { struct xfs_mount *mp = ip->i_mount; @@ -229,7 +236,7 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. */ - if (IS_DAX(VFS_I(ip))) { + if (flags & IOMAP_DAX) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; @@ -620,7 +627,7 @@ imap_needs_alloc( imap->br_startblock == DELAYSTARTBLOCK) return true; /* we convert unwritten extents before copying the data for DAX */ - if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) + if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) return true; return false; } @@ -800,7 +807,7 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); allocate_blocks: error = -EAGAIN; @@ -826,23 +833,24 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - &imap); + flags, &imap); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + iomap_flags | IOMAP_F_NEW); out_found_cow: xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); out_unlock: if (lockmode) @@ -1052,23 +1060,24 @@ retry: */ xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); found_imap: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); found_cow: xfs_iunlock(ip, XFS_ILOCK_EXCL); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, 
&imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1177,7 +1186,8 @@ xfs_read_iomap_begin( if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + shared ? IOMAP_F_SHARED : 0); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1236,7 +1246,8 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1258,7 +1269,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1305,9 +1316,40 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; + +int +xfs_zero_range( + struct xfs_inode *ip, + loff_t pos, + loff_t len, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_zero_range(inode, pos, len, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_zero_range(inode, pos, len, did_zero, + &xfs_buffered_write_iomap_ops); +} + +int +xfs_truncate_page( + struct xfs_inode *ip, + loff_t pos, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_truncate_page(inode, pos, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_truncate_page(inode, pos, did_zero, + &xfs_buffered_write_iomap_ops); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7d3703556d0e..e88dc162c785 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -12,13 +12,19 @@ struct xfs_inode; struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); + xfs_fileoff_t count_fsb, unsigned int flags, + struct xfs_bmbt_irec *imap); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); -int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, u16); +int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, + struct xfs_bmbt_irec *imap, unsigned int mapping_flags, + u16 iomap_flags); + +int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, + bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a607d6aca5c4..b79b3846e71b 100644 --- a/fs/xfs/xfs_iops.c +++ 
b/fs/xfs/xfs_iops.c @@ -511,27 +511,6 @@ xfs_vn_get_link( return ERR_PTR(error); } -STATIC const char * -xfs_vn_get_link_inline( - struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct xfs_inode *ip = XFS_I(inode); - char *link; - - ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL); - - /* - * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if - * if_data is junk. - */ - link = ip->i_df.if_u1.if_data; - if (XFS_IS_CORRUPT(ip->i_mount, !link)) - return ERR_PTR(-EFSCORRUPTED); - return link; -} - static uint32_t xfs_stat_blksize( struct xfs_inode *ip) @@ -911,8 +890,8 @@ xfs_setattr_size( */ if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); - error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, oldsize, newsize - oldsize, + &did_zeroing); } else { /* * iomap won't detect a dirty page over an unwritten block (or a @@ -924,8 +903,7 @@ xfs_setattr_size( newsize); if (error) return error; - error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_buffered_write_iomap_ops); + error = xfs_truncate_page(ip, newsize, &did_zeroing); } if (error) @@ -1250,14 +1228,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; -static const struct inode_operations xfs_inline_symlink_inode_operations = { - .get_link = xfs_vn_get_link_inline, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .listxattr = xfs_vn_listxattr, - .update_time = xfs_vn_update_time, -}; - /* Figure out if this file actually supports DAX. */ static bool xfs_inode_supports_dax( @@ -1332,9 +1302,9 @@ xfs_diflags_to_iflags( * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, - * when creating a new inode it is called from xfs_ialloc after setting up the - * inode. These callers have different criteria for clearing XFS_INEW, so leave - * it up to the caller to deal with unlocking the inode appropriately. + * when creating a new inode it is called from xfs_init_new_inode after setting + * up the inode. These callers have different criteria for clearing XFS_INEW, so + * leave it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1408,10 +1378,7 @@ xfs_setup_iops( inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: - if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) - inode->i_op = &xfs_inline_symlink_inode_operations; - else - inode->i_op = &xfs_symlink_inode_operations; + inode->i_op = &xfs_symlink_inode_operations; break; default: inode->i_op = &xfs_inode_operations; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c174262a074e..09a8fba84ff9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -61,6 +61,7 @@ typedef __u32 xfs_nlink_t; #include <linux/ratelimit.h> #include <linux/rhashtable.h> #include <linux/xattr.h> +#include <linux/mnt_idmapping.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 6c93c8ada6f3..83a039762b81 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -103,6 +103,39 @@ xlog_cil_iovec_space( } /* + * shadow buffers can be large, so we need to use kvmalloc() here to ensure + * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall + * back to vmalloc, so we can't actually do anything useful with gfp flags to + * control the kmalloc() behaviour within kvmalloc(). 
Hence kmalloc() will do + direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get something straight away from the free lists or buddy + * allocator. Hence we have to open code kvmalloc ourselves here. + * + * Also, we are in memalloc_nofs_save task context here, so despite the use of + * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This + * is actually the only way to make vmalloc() do GFP_NOFS allocations, so let's + * just all pretend this is a GFP_KERNEL context operation.... + */ +static inline void * +xlog_cil_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + +/* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the @@ -203,25 +236,16 @@ xlog_cil_alloc_shadow_bufs( */ if (!lip->li_lv_shadow || buf_size > lip->li_lv_shadow->lv_size) { - /* * We free and allocate here as a realloc would copy - * unnecessary data. We don't use kmem_zalloc() for the + * unnecessary data. We don't use kvzalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. */ kmem_free(lip->li_lv_shadow); + lv = xlog_cil_kvmalloc(buf_size); - /* - * We are in transaction context, which means this - * allocation will pick up GFP_NOFS from the - * memalloc_nofs_save/restore context the transaction - * holds. This means we can use GFP_KERNEL here so the - * generic kvmalloc() code will run vmalloc on - * contiguous page allocation failure as we require. - */ - lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -1442,9 +1466,9 @@ out_shutdown: */ bool xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) + struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; if (list_empty(&lip->li_cil)) return false; @@ -1454,7 +1478,7 @@ xfs_log_item_in_current_chkpt( * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - return lip->li_seq == ctx->sequence; + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 53366cc0bc9e..96c997ed2ec8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -27,7 +27,7 @@ #include "xfs_buf_item.h" #include "xfs_ag.h" #include "xfs_quota.h" - +#include "xfs_reflink.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -3498,6 +3498,28 @@ xlog_recover_finish( xlog_recover_process_iunlinks(log); xlog_recover_check_summary(log); + + /* + * Recover any CoW staging blocks that are still referenced by the + * ondisk refcount metadata. During mount there cannot be any live + * staging extents as we have not permitted any user modifications. + * Therefore, it is safe to free them all right now, even on a + * read-only mount. 
+ */ + error = xfs_reflink_recover_cow(log->l_mp); + if (error) { + xfs_alert(log->l_mp, + "Failed to recover leftover CoW staging extents, err %d.", + error); + /* + * If we get an error here, make sure the log is shut down + * but return zero so that any log items committed since the + * end of intents processing can be pushed through the CIL + * and AIL. + */ + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + } + return 0; } @@ -3528,8 +3550,6 @@ xlog_recover_check_summary( uint64_t ifree; int error; - mp = log->l_mp; - freeblks = 0LL; itotal = 0LL; ifree = 0LL; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 359109b6f0d3..bed73e8002a5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -936,15 +936,6 @@ xfs_mountfs( xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_quota; - } - /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) @@ -955,7 +946,6 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); - out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 5e1d29d8b2e7..d6334abbc0b3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -155,7 +155,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, &imap); + end_fsb - offset_fsb, 0, &imap); if (error) goto out_unlock; @@ -173,7 +173,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 47fe60e1a887..7d5a31827681 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -303,13 +303,6 @@ xfs_qm_scall_setqlim( return 0; /* - * We don't want to race with a quotaoff so take the quotaoff lock. - * We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening. - */ - mutex_lock(&q->qi_quotaofflock); - - /* * Get the dquot (locked) before we start, as we need to do a * transaction to allocate it if it doesn't exist. Once we have the * dquot, unlock it so we can start the next transaction safely. We hold @@ -319,7 +312,7 @@ xfs_qm_scall_setqlim( error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { ASSERT(error != -ENOENT); - goto out_unlock; + return error; } defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); @@ -415,8 +408,6 @@ xfs_qm_scall_setqlim( out_rele: xfs_qm_dqrele(dqp); -out_unlock: - mutex_unlock(&q->qi_quotaofflock); return error; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cb0edb1d68ef..db70060e7bf6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -749,7 +749,10 @@ xfs_reflink_end_cow( } /* - * Free leftover CoW reservations that didn't get cleaned out. + * Free all CoW staging blocks that are still referenced by the ondisk refcount + * metadata. The ondisk metadata does not track which inode created the + * staging extent, so callers must ensure that there are no cached inodes with + * live CoW staging extents. 
*/ int xfs_reflink_recover_cow( @@ -1269,8 +1272,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, - &xfs_buffered_write_iomap_ops); + return xfs_zero_range(ip, isize, pos - isize, NULL); } /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e21459f9923a..e8f37bdc8354 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -331,13 +331,34 @@ xfs_set_inode_alloc( return xfs_is_inode32(mp) ? maxagi : agcount; } -static bool -xfs_buftarg_is_dax( - struct super_block *sb, - struct xfs_buftarg *bt) +static int +xfs_setup_dax_always( + struct xfs_mount *mp) { - return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0, - bdev_nr_sectors(bt->bt_bdev)); + if (!mp->m_ddev_targp->bt_daxdev && + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) { + xfs_alert(mp, + "DAX unsupported by block device. Turning off DAX."); + goto disable_dax; + } + + if (mp->m_super->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "DAX not supported for blocksize. Turning off DAX."); + goto disable_dax; + } + + if (xfs_has_reflink(mp)) { + xfs_alert(mp, "DAX and reflink cannot be used together!"); + return -EINVAL; + } + + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + return 0; + +disable_dax: + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); + return 0; } STATIC int @@ -370,26 +391,19 @@ STATIC void xfs_close_devices( struct xfs_mount *mp) { - struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev; - if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; xfs_free_buftarg(mp->m_logdev_targp); xfs_blkdev_put(logdev); - fs_put_dax(dax_logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; xfs_free_buftarg(mp->m_rtdev_targp); xfs_blkdev_put(rtdev); - fs_put_dax(dax_rtdev); } xfs_free_buftarg(mp->m_ddev_targp); - fs_put_dax(dax_ddev); } /* @@ -407,8 +421,6 @@ xfs_open_devices( struct xfs_mount *mp) { struct block_device *ddev = mp->m_super->s_bdev; - struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev); - struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL; struct block_device *logdev = NULL, *rtdev = NULL; int error; @@ -418,8 +430,7 @@ xfs_open_devices( if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - goto out; - dax_logdev = fs_dax_get_by_bdev(logdev); + return error; } if (mp->m_rtname) { @@ -433,25 +444,24 @@ xfs_open_devices( error = -EINVAL; goto out_close_rtdev; } - dax_rtdev = fs_dax_get_by_bdev(rtdev); } /* * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -467,14 +477,9 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); - fs_put_dax(dax_rtdev); out_close_logdev: - if (logdev && logdev != ddev) { 
+ if (logdev && logdev != ddev) xfs_blkdev_put(logdev); - fs_put_dax(dax_logdev); - } - out: - fs_put_dax(dax_ddev); return error; } @@ -1593,26 +1598,9 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (xfs_has_dax_always(mp)) { - bool rtdev_is_dax = false, datadev_is_dax; - - xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - - datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp); - if (mp->m_rtdev_targp) - rtdev_is_dax = xfs_buftarg_is_dax(sb, - mp->m_rtdev_targp); - if (!rtdev_is_dax && !datadev_is_dax) { - xfs_alert(mp, - "DAX unsupported by block device. Turning off DAX."); - xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); - } - if (xfs_has_reflink(mp)) { - xfs_alert(mp, - "DAX and reflink cannot be used together!"); - error = -EINVAL; + error = xfs_setup_dax_always(mp); + if (error) goto out_filestream_unmount; - } } if (xfs_has_discard(mp)) { @@ -1739,15 +1727,6 @@ xfs_remount_rw( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } xfs_blockgc_start(mp); /* Create the per-AG metadata reservation pool .*/ @@ -1765,7 +1744,10 @@ static int xfs_remount_ro( struct xfs_mount *mp) { - int error; + struct xfs_icwalk icw = { + .icw_flags = XFS_ICWALK_FLAG_SYNC, + }; + int error; /* * Cancel background eofb scanning so it cannot race with the final @@ -1773,8 +1755,13 @@ xfs_remount_ro( */ xfs_blockgc_stop(mp); - /* Get rid of any leftover CoW reservations... */ - error = xfs_blockgc_free_space(mp, NULL); + /* + * Clear out all remaining COW staging extents and speculative post-EOF + * preallocations so that we don't leave inodes requiring inactivation + * cleanups during reclaim on a read-only mount. We must process every + * cached inode, so this requires a synchronous cache scan. + */ + error = xfs_blockgc_free_space(mp, &icw); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index fc2c6a404647..affbedf78160 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_error.h" /* ----- Kernel only functions below ----- */ int @@ -96,17 +97,15 @@ xfs_readlink_bmap_ilocked( int xfs_readlink( - struct xfs_inode *ip, - char *link) + struct xfs_inode *ip, + char *link) { - struct xfs_mount *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = -EFSCORRUPTED; trace_xfs_readlink(ip); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL); - if (xfs_is_shutdown(mp)) return -EIO; @@ -121,12 +120,22 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - error = -EFSCORRUPTED; goto out; } - - error = xfs_readlink_bmap_ilocked(ip, link); + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED + * if if_data is junk. + */ + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + goto out; + + memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + error = 0; + } else { + error = xfs_readlink_bmap_ilocked(ip, link); + } out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -184,8 +193,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) return error; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 8608f804388f..574b80c29fe1 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -67,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = { static struct attribute *xfs_mp_attrs[] = { NULL, }; +ATTRIBUTE_GROUPS(xfs_mp); struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_mp_attrs, + .default_groups = xfs_mp_groups, }; #ifdef DEBUG @@ -239,11 +240,12 @@ static struct attribute *xfs_dbg_attrs[] = { #endif NULL, }; +ATTRIBUTE_GROUPS(xfs_dbg); struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_dbg_attrs, + .default_groups = xfs_dbg_groups, }; #endif /* DEBUG */ @@ -296,11 +298,12 @@ static struct attribute *xfs_stats_attrs[] = { ATTR_LIST(stats_clear), NULL, }; +ATTRIBUTE_GROUPS(xfs_stats); struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_stats_attrs, + .default_groups = xfs_stats_groups, }; /* xlog */ @@ -381,11 +384,12 @@ static struct attribute *xfs_log_attrs[] = { ATTR_LIST(write_grant_head), NULL, }; +ATTRIBUTE_GROUPS(xfs_log); struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_log_attrs, + .default_groups = xfs_log_groups, }; /* @@ -534,12 +538,12 @@ static struct attribute *xfs_error_attrs[] = { ATTR_LIST(retry_timeout_seconds), NULL, }; - +ATTRIBUTE_GROUPS(xfs_error); static struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_error_attrs, + .default_groups = xfs_error_groups, }; static struct kobj_type xfs_error_ktype = { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 234a9d9c2f43..59e2f9031b9f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -942,8 +942,17 @@ xfs_trans_cancel( trace_xfs_trans_cancel(tp, _RET_IP_); - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + /* + * It's never valid to cancel a transaction with deferred ops attached, + * because the transaction is effectively dirty. Complain about this + * loudly before freeing the in-memory defer items. + */ + if (!list_empty(&tp->t_dfops)) { + ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + dirty = true; xfs_defer_cancel(tp); + } /* * See if the caller is relying on us to shut down the diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 259ee2bda492..b76dfb310ab6 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1787,5 +1787,6 @@ static void __exit zonefs_exit(void) MODULE_AUTHOR("Damien Le Moal"); MODULE_DESCRIPTION("Zone file system for zoned block devices"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("zonefs"); module_init(zonefs_init); module_exit(zonefs_exit);
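
A note on the open-coded kvmalloc() added to fs/xfs/xfs_log_cil.c above: clearing __GFP_DIRECT_RECLAIM while setting __GFP_NORETRY and __GFP_NOWARN makes the kmalloc() attempt return NULL quickly and quietly when the free lists and buddy allocator cannot satisfy it, and the explicit vmalloc() fallback then supplies the success guarantee that kvmalloc(GFP_KERNEL) normally provides. The sketch below restates the pattern as a standalone helper; the helper name is illustrative, not part of the patch.

#include <linux/slab.h>
#include <linux/vmalloc.h>

/*
 * Fail-fast large allocation: try the slab/buddy fast path without
 * direct reclaim or compaction, then fall back to vmalloc(). Mirrors
 * the xlog_cil_kvmalloc() pattern in the hunk above.
 */
static void *fail_fast_kvmalloc(size_t size)
{
	gfp_t flags = GFP_KERNEL;
	void *p;

	flags &= ~__GFP_DIRECT_RECLAIM;		/* skip the reclaim/compaction slow path */
	flags |= __GFP_NOWARN | __GFP_NORETRY;	/* fail quickly, no allocation-failure spam */

	do {
		p = kmalloc(size, flags);
		if (!p)
			p = vmalloc(size);	/* may sleep; NOFS under memalloc_nofs_save() */
	} while (!p);

	return p;
}

Because the result may come from either allocator, it must be released with kvfree() (XFS's kmem_free(), used in the same hunk, is a thin kvfree() wrapper), never plain kfree().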
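
The fs/xfs/xfs_sysfs.c hunks are part of the tree-wide move from the removed kobj_type .default_attrs field to .default_groups. The xfs_mp_groups, xfs_dbg_groups, etc. arrays are never declared by hand; ATTRIBUTE_GROUPS() generates them. Per include/linux/sysfs.h, ATTRIBUTE_GROUPS(xfs_mp) expands roughly to the following, which is why each conversion above is just one macro line plus the field rename:

/* Roughly what ATTRIBUTE_GROUPS(xfs_mp) generates from xfs_mp_attrs[]: */
static const struct attribute_group xfs_mp_group = {
	.attrs = xfs_mp_attrs,
};

static const struct attribute_group *xfs_mp_groups[] = {
	&xfs_mp_group,
	NULL,
};

The NULL-terminated array of attribute groups is exactly what .default_groups expects.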
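
The zonefs one-liner is worth a brief note: MODULE_ALIAS_FS("zonefs") is what lets the kernel autoload the module on first mount. The macro, from include/linux/fs.h, simply registers an "fs-" prefixed module alias:

#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

When mount(2) names a filesystem type that is not yet registered, get_fs_type() falls back to request_module("fs-zonefs"); without the alias, a modular zonefs would have to be loaded manually before mounting.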