diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 10:02:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 10:02:55 -0700 |
commit | 69e1aaddd63104f37021d0b0f6abfd9623c9134c (patch) | |
tree | 14ad49741b428d270b681694bb2df349465455b9 /fs | |
parent | 56b59b429b4c26e5e730bc8c3d837de9f7d0a966 (diff) | |
parent | 9d547c35799a4ddd235f1565cec2fff6c9263504 (diff) |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates for 3.4 from Ted Ts'o:
"Ext4 commits for 3.3 merge window; mostly cleanups and bug fixes
The changes to export dirty_writeback_interval are from Artem's s_dirt
cleanup patch series. The same is true of the change to remove the
s_dirt helper functions which never got used by anyone in-tree. I've
run these changes by Al Viro, and am carrying them so that Artem can
more easily fix up the rest of the file systems during the next merge
window. (Originally we had hoped to remove the use of s_dirt from
ext4 during this merge window, but his patches had some bugs, so I
ultimately ended up dropping them from the ext4 tree.)"
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (66 commits)
vfs: remove unused superblock helpers
mm: export dirty_writeback_interval
ext4: remove useless s_dirt assignment
ext4: write superblock only once on unmount
ext4: do not mark superblock as dirty unnecessarily
ext4: correct ext4_punch_hole return codes
ext4: remove restrictive checks for EOFBLOCKS_FL
ext4: always set then trimmed blocks count into len
ext4: fix trimmed block count accunting
ext4: fix start and len arguments handling in ext4_trim_fs()
ext4: update s_free_{inodes,blocks}_count during online resize
ext4: change some printk() calls to use ext4_msg() instead
ext4: avoid output message interleaving in ext4_error_<foo>()
ext4: remove trailing newlines from ext4_msg() and ext4_error() messages
ext4: add no_printk argument validation, fix fallout
ext4: remove redundant "EXT4-fs: " from uses of ext4_msg
ext4: give more helpful error message in ext4_ext_rm_leaf()
ext4: remove unused code from ext4_ext_map_blocks()
ext4: rewrite punch hole to use ext4_ext_remove_space()
jbd2: cleanup journal tail after transaction commit
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/balloc.c | 63 | ||||
-rw-r--r-- | fs/ext4/dir.c | 13 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 34 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 4 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 128 | ||||
-rw-r--r-- | fs/ext4/extents.c | 330 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 2 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 260 | ||||
-rw-r--r-- | fs/ext4/inode.c | 95 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 342 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 20 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 2 | ||||
-rw-r--r-- | fs/ext4/mmp.c | 4 | ||||
-rw-r--r-- | fs/ext4/namei.c | 2 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 18 | ||||
-rw-r--r-- | fs/ext4/resize.c | 37 | ||||
-rw-r--r-- | fs/ext4/super.c | 1075 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 25 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 140 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 47 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 361 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 5 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 12 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 48 |
24 files changed, 1479 insertions, 1588 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8cf711..4bbd07a6fa18 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -336,10 +336,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; - struct buffer_head *bh = NULL; + struct buffer_head *bh; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } @@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. 
+ * submit the buffer_head for reading */ + set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { - put_bh(bh); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + return bh; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); - return NULL; + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ ext4_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, - * continue with corrupt bitmap - */ + return 0; +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); + return NULL; + } return bh; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c56092e58..ad56866d729a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 0; if (filp) - ext4_error_file(filp, function, line, bh ? 
bh->b_blocknr : 0, + ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else - ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "EXT4-fs: call_filldir: called with " - "null fname?!?\n"); + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); return 0; } curr_pos = hash2pos(fname->hash, fname->minor_hash); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004fc3d84..ded731ac8a32 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,7 +53,7 @@ printk(KERN_DEBUG f, ## a); \ } while (0) #else -#define ext4_debug(f, a...) do {} while (0) +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ @@ -184,6 +184,8 @@ struct mpage_da_data { #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 +#define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -192,18 +194,25 @@ struct ext4_io_page { #define MAX_IO_PAGES 128 +/* + * For converting uninitialized extents on a work queue. + * + * 'page' is only used from the writepage() path; 'pages' is only used for + * buffered writes; they are used to keep page references until conversion + * takes place. 
For AIO/DIO, neither field is filled in. + */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - struct page *page; /* page struct for buffer write */ + struct page *page; /* for writepage() path */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; - struct ext4_io_page *pages[MAX_IO_PAGES]; + int num_io_pages; /* for writepages() */ + struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ } ext4_io_end_t; struct ext4_io_submit { @@ -923,6 +932,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ @@ -941,7 +951,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ @@ -1142,6 +1151,7 @@ struct ext4_sb_info { unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned int s_mount_flags; + unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; @@ 
-1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); extern void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t group, @@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a69a30..0f58b86e3a02 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -47,9 +47,9 @@ */ #define 
EXT_DEBUG__ #ifdef EXT_DEBUG -#define ext_debug(a...) printk(a) +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ext_debug(a...) +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1dab18..83b20fcf9400 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -104,6 +104,78 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. 
+ * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); +} + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int 
ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 
74f23c292e1b..1421938e6792 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,14 @@ #include <trace/events/ext4.h> +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle, int split_flag, int flags); +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_extent *ex; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf\n", start); + ext_debug("truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext_debug(" border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { + if (end < ex_ee_block) { ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { - EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", - start, end); + EXT4_ERROR_INODE(inode, + "can not handle truncate 
%u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); err = -EIO; goto out; } else if (a != ex_ee_block) { @@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) handle_t *handle; int i, err; - ext_debug("truncate since %u\n", start); + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start(inode, depth + 1); @@ -2504,6 +2525,61 @@ again: trace_ext4_ext_remove_space(inode, start, depth); /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) + goto cont; + + ee_block = le32_to_cpu(ex->ee_block); + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). 
+ */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_uninitialized(ex)) + split_flag = EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (err < 0) + goto out; + } + ext4_ext_drop_refs(path); + kfree(path); + } +cont: + + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ @@ -2515,6 +2591,7 @@ again: } path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; @@ -2526,7 +2603,7 @@ again: /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial_cluster, start, - EXT_MAX_BLOCKS - 1); + end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) - printk(KERN_INFO "EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST - printk(", aggressive tests"); + ", aggressive tests" #endif #ifdef CHECK_BINSEARCH - printk(", check binsearch"); + ", check binsearch" #endif #ifdef EXTENTS_STATS - printk(", stats"); + ", stats" #endif - printk("\n"); + "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } /* - * used by extent splitting. 
- */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle @@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, depth = ext_depth(inode); eh = path[depth].p_hdr; - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - return -EIO; - } + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; last_ex = EXT_LAST_EXTENT(eh); /* * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, for (i = depth-1; i >= 0; i--) if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) return 0; +out: ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); return ext4_mark_inode_dirty(handle, inode); } @@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_lblk_t cluster_offset; @@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && - ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && 
ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) @@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { - struct ext4_map_blocks punch_map; - ext4_fsblk_t partial_cluster = 0; - newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? 
- allocated : map->m_len; - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent + * Do not put uninitialized extent + * in the cache */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - - } - - ext4_ext_mark_uninitialized(ex); - - ext4_ext_invalidate_cache(inode); - - err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, map->m_lblk, - map->m_lblk + punched_out); - - if (!err && path->p_hdr->eh_entries == 0) { - /* - * Punch hole freed all of this sub tree, - * so we need to correct eh_depth - */ - err = ext4_ext_get_access(handle, inode, path); - if (err == 0) { - ext_inode_hdr(inode)->eh_depth = 0; - ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root( - inode, 0)); - - err = ext4_ext_dirty( - handle, inode, path); - } + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; } - - goto out2; + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; } } @@ -4165,13 +4146,11 @@ out2: ext4_ext_drop_refs(path); 
kfree(path); } - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : result); + newblock, map->m_len, err ? err : allocated); - return err ? err : result; + return err ? err : allocated; } void ext4_ext_truncate(struct inode *inode) @@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, map.m_len); + ext4_msg(inode->i_sb, KERN_ERR, + "%s:%d: inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + __func__, __LINE__, inode->i_ino, map.m_lblk, + map.m_len, ret); } ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; handle_t *handle; loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; + int credits, err = 0; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t 
offset, loff_t length) offset; } - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } - /* * If i_size is contained in the last page, we need to * unmap and zero the partial page after i_size @@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + /* If there are no blocks to remove, return now */ - if (first_block >= last_block) + if (first_block >= stop_block) goto out; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); - /* - * Loop over all the blocks and identify blocks - * that need to be punched out - */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. 
So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { - - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; - - } else { - /* The block could not be identified */ - err = -EIO; - break; - } - } else { - /* Map blocks error */ - err = ret; - break; - } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; - break; - } - - iblock += num_blocks; - } + err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - } + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb753efd..bb6c7d811313 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode) io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); list_del_init(&io->list); + io->flag |= EXT4_IO_END_IN_FSYNC; /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode) if (ret < 0) ret2 = ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + io->flag &= ~EXT4_IO_END_IN_FSYNC; } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? 
ret2 : 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c9781ad9..409c2ee7750a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return NULL; } return bh; @@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - if (atomic_read(&inode->i_count) > 1) { - printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + 
atomic_read(&inode->i_count)); return; } - if (!sb) { - printk(KERN_ERR "ext4_free_inode: inode on " - "nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); @@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode - */ -static int ext4_claim_inode(struct super_block *sb, - struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, umode_t mode) -{ - int free = 0, retval = 0, count; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - - /* - * We have to be sure that new inode allocation does not race with - * inode table initialization, because otherwise we may end up - * allocating and writing new inode right before sb_issue_zeroout - * takes place and overwriting our new inode with zeroes. So we - * take alloc_sem to prevent it. 
- */ - down_read(&grp->alloc_sem); - ext4_lock_group(sb, group); - if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { - /* not a free inode */ - retval = 1; - goto err_ret; - } - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - ext4_error(sb, "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - return 1; - } - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - return retval; -} - -/* * There are two policies for allocating an inode. 
If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -741,6 +664,11 @@ got_group: if (ret2 == -1) goto out; + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; @@ -757,51 +685,24 @@ repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - group_desc_bh); - if (err) - goto fail; - if (!ext4_claim_inode(sb, inode_bitmap_bh, - ino, group, mode)) { - /* we won it */ - BUFFER_TRACE(inode_bitmap_bh, - "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, - NULL, - inode_bitmap_bh); - if (err) - goto fail; - /* zero bit is inode number 1*/ - ino++; - goto got; - } - /* we lost it */ - ext4_handle_release_buffer(handle, inode_bitmap_bh); - ext4_handle_release_buffer(handle, group_desc_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; + if (ino >= EXT4_INODES_PER_GROUP(sb)) { + if (++group == ngroups) + group = 0; + continue; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. 
- */ - if (++group == ngroups) - group = 0; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; } err = -ENOSPC; goto out; @@ -838,6 +739,59 @@ got: if (err) goto fail; } + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; + + /* Update the relevant bg descriptor fields */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. 
if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + } + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) @@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished. 
*/ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) @@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u\n" - "Used itable blocks: %d" - "itable unused count: %u\n", + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82fe629d..c77b0bd2c711 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode, trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " - "with only %d reserved data blocks\n", + "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); @@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) */ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " - "data blocks\n", inode->i_ino, to_free, + "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; @@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_CRIT "Total free blocks count %lld\n", + struct super_block *sb = inode->i_sb; + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(inode->i_sb))); - printk(KERN_CRIT "Free/Dirty block details\n"); - printk(KERN_CRIT "free_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "Free/Dirty 
block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); - printk(KERN_CRIT "dirty_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); - printk(KERN_CRIT "Block reservation details\n"); - printk(KERN_CRIT "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + EXT4_I(inode)->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -2795,9 +2798,6 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -2811,8 +2811,9 @@ static void 
ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - printk("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); + ext4_msg(io_end->inode->i_sb, KERN_INFO, + "sb umounted, discard end_io request for inode %lu", + io_end->inode->i_ino); ext4_free_io_end(io_end); goto out; } @@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode, GFP_NOFS); - if (!iocb->private) + ext4_io_end_t *io_end = + ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) return -ENOMEM; + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* * we save the io structure for current async * direct IO, so that later ext4_map_blocks() @@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_get_block_write, ext4_end_io_dio, NULL, - DIO_LOCKING | DIO_SKIP_HOLES); + DIO_LOCKING); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; /* @@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops 
= &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } @@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; + return -EOPNOTSUPP; } if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ - return -ENOTSUPP; + return -EOPNOTSUPP; } return ext4_ext_punch_hole(file, offset, length); @@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle, ext4_update_dynamic_rev(sb); EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, NULL, - EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super(handle, sb); } } raw_inode->i_generation = cpu_to_le32(inode->i_generation); @@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { + if (attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - ext4_truncate(inode); + ext4_truncate(inode); } if (!rc) { @@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (test_opt(inode->i_sb, I_VERSION)) + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b21c698..99ab428bcfa0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,6 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ 
+#include "ext4_jbd2.h" #include "mballoc.h" #include <linux/debugfs.h> #include <linux/slab.h> @@ -339,7 +340,7 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); - return EXT4_MB_BITMAP(e4b); + return e4b->bd_bitmap; } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int groups_per_page; int err = 0; int i; - 
ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; @@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= ngroups) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); + grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; goto out; - - if (bitmap_uptodate(bh[i])) - continue; - - lock_buffer(bh[i]); - if (bitmap_uptodate(bh[i])) { - unlock_buffer(bh[i]); - continue; - } - ext4_lock_group(sb, first_group + i); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_bitmap_uptodate(bh[i]); - set_buffer_uptodate(bh[i]); - ext4_unlock_group(sb, first_group + i); - unlock_buffer(bh[i]); - continue; } - ext4_unlock_group(sb, first_group + i); - if (buffer_uptodate(bh[i])) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - 
set_bitmap_uptodate(bh[i]); - unlock_buffer(bh[i]); - continue; - } - get_bh(bh[i]); - /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. - */ - set_bitmap_uptodate(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug(1, "read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } - err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, /* let's maintain fragments counter */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + block = !mb_test_bit(first - 1, e4b->bd_bitmap); if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(first + count, e4b->bd_bitmap); if (block && max) e4b->bd_info->bb_fragments--; else if (!block && !max) @@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, block = first++; order = 0; - if (!mb_test_bit(block, 
EXT4_MB_BITMAP(e4b))) { + if (!mb_test_bit(block, e4b->bd_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u)", block); } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + mb_clear_bit(block, e4b->bd_bitmap); e4b->bd_info->bb_counters[order]++; /* start of the buddy */ @@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); @@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = 
e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } @@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; out_free_locality_groups: @@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb, * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. 
*/ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_cluster, entry->count); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. 
- * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_unload_buddy(&e4b); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } @@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void) return -ENOMEM; } - ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; @@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void) rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); + kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } @@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context 
*ac, len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " - "fs metadata\n", block, block+len); + "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - loff_t size, orig_size, start_off; + loff_t size, start_off; + loff_t orig_size __maybe_unused; ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_cluster, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, 
KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", ac->ac_ex_scanned, ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4428,9 +4376,9 @@ out: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - cluster = new_entry->start_cluster; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. 
We need to @@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (cluster < entry->start_cluster) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (cluster >= (entry->start_cluster + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(entry, new_entry)) { - new_entry->start_cluster = entry->start_cluster; - new_entry->count += entry->count; + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - 
list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4691,15 +4634,15 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { err = -ENOMEM; goto error_return; } - new_entry->start_cluster = bit; - new_entry->group = block_group; - new_entry->count = count_clusters; - new_entry->t_tid = handle->h_transaction->t_tid; + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, start = (e4b.bd_info->bb_first_free > start) ? 
e4b.bd_info->bb_first_free : start; - while (start < max) { - start = mb_find_next_zero_bit(bitmap, max, start); - if (start >= max) + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) break; - next = mb_find_next_bit(bitmap, max, start); + next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { ext4_trim_extent(sb, start, @@ -5027,37 +4970,36 @@ out: int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_group_info *grp; - ext4_group_t first_group, last_group; - ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; - len = range->len >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || + unlikely(start >= max_blks)) return -EINVAL; - if (start + len <= first_data_blk) + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) goto out; - if (start < first_data_blk) { - len -= first_data_blk - start; + if (start < first_data_blk) start = first_data_blk; - } - /* Determine first and last group to examine based on start and len */ + /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); - last_group = (last_group > ngroups - 1) ? 
ngroups - 1 : last_group; - last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } /* - * For all the groups except the last one, last block will - * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case start + - * len < EXT4_BLOCKS_PER_GROUP(sb). + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() */ - if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) - last_cluster = first_cluster + len; - len -= last_cluster - first_cluster; + if (group == last_group) + end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - last_cluster, minlen); + end, minlen); if (cnt < 0) { ret = cnt; break; } + trimmed += cnt; } - trimmed += cnt; + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. 
+ */ first_cluster = 0; } - range->len = trimmed * sb->s_blocksize; if (!ret) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: + range->len = trimmed * sb->s_blocksize; return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3285e3..c070618c21ce 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -96,21 +96,23 @@ extern u8 mb_enable_debug; struct ext4_free_data { - /* this links the free block information from group_info */ - struct rb_node node; + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ - /* this links the free block information from ext4_sb_info */ - struct list_head list; + /* this links the free block information from group_info */ + struct rb_node efd_node; /* group which free block extent belongs */ - ext4_group_t group; + ext4_group_t efd_group; /* free block extent */ - ext4_grpblk_t start_cluster; - ext4_grpblk_t count; + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; /* transaction which freed this extent */ - tid_t t_tid; + tid_t efd_tid; }; struct ext4_prealloc_space { @@ -210,8 +212,6 @@ struct ext4_buddy { __u16 bd_blkbits; ext4_group_t bd_group; }; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0acfa6..f39f80f8f2c5 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode) tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = PTR_ERR(inode); + retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4eff2a..ed6548d89165 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -257,8 +257,8 @@ int 
ext4_multi_mount_protect(struct super_block *sb, * If check_interval in MMP block is larger, use that instead of * update_interval from the superblock. */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); seq = le32_to_cpu(mmp->mmp_seq); if (seq == EXT4_MMP_SEQ_CLEAN) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f482375d..349d7b3671c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -468,7 +468,7 @@ fail2: fail: if (*err == ERR_BAD_DX_DIR) ext4_warning(dir->i_sb, - "Corrupt dir inode %ld, running e2fsck is " + "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); return NULL; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 475851896518..74cd1f7f1f88 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode) static void put_io_page(struct ext4_io_page *io_page) { if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); put_page(io_page->p_page); kmem_cache_free(io_page_cachep, io_page); } @@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->iocb) aio_complete(io->iocb, io->result, 0); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); @@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (io->flag & EXT4_IO_END_IN_FSYNC) + goto requeue; if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } if (!mutex_trylock(&inode->i_mutex)) { + bool was_queued; +requeue: + was_queued = !!(io->flag & EXT4_IO_END_QUEUED); + io->flag |= 
EXT4_IO_END_QUEUED; spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work @@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work) * yield the cpu if it sees an end_io request that has already * been requeued. */ - if (io->flag & EXT4_IO_END_QUEUED) + if (was_queued) yield(); - io->flag |= EXT4_IO_END_QUEUED; return; } list_del_init(&io->list); @@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - put_io_page(io_end->pages[i]); + if (atomic_read(&io_end->pages[i]->p_count) == 1) + end_page_writeback(io_end->pages[i]->p_page); } - io_end->num_io_pages = 0; inode = io_end->inode; if (error) { @@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ + if (atomic_read(&io_page->p_count) == 1) + end_page_writeback(page); put_io_page(io_page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f0eb86..59fa0be27251 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb, do_div(reserved_blocks, 100); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); /* * We need to protect s_groups_count against other CPUs seeing @@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, } ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ @@ -1512,16 +1516,17 @@ int 
ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", - o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to resize to %llu blocks safely\n", - sb->s_id, n_blocks_count); + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); if (sizeof(sector_t) < 8) ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; @@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_group_t o_group; ext4_group_t n_group; - ext4_grpblk_t offset; + ext4_grpblk_t offset, add; unsigned long n_desc_blocks; unsigned long o_desc_blocks; unsigned long desc_blocks; @@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) return 0; ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); - ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / EXT4_DESC_PER_BLOCK(sb); 
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) } brelse(bh); - if (offset != 0) { - /* extend the last group */ - ext4_grpblk_t add; - add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; @@ -1674,7 +1681,7 @@ out: iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " + "upto %llu blocks", o_blocks_count, n_blocks_count); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 933900909ed0..ceebaf853beb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); @@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb) return bdi->dev == NULL; } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct 
ext4_journal_cb_entry *jce, *tmp; + + spin_lock(&sbi->s_md_lock); + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk(KERN_CONT "block %llu: ", block); - printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function, path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); - if (block) - printk(KERN_CONT "block %llu: ", block); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: 
%pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->dio_unwritten_wq); lock_super(sb); - if (sb->s_dirt) - ext4_commit_super(sb, 1); - if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, 1); } + if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode) } } -static inline void ext4_show_quota_options(struct seq_file *seq, - struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ - int def_errors; - unsigned long def_mount_opts; - struct super_block 
*sb = root->d_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - def_errors = le16_to_cpu(es->s_errors); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%llu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); - } - if (sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); - } - if (test_opt(sb, ERRORS_RO)) { - if (def_errors == EXT4_ERRORS_PANIC || - def_errors == EXT4_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER)) - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { 
- seq_printf(seq, ",min_batch_time=%u", - (unsigned) sbi->s_min_batch_time); - } - if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { - seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_max_batch_time); - } - - /* - * We're changing the default of barrier mount option, so - * let's always display its mount state so it's clear what its - * status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - seq_puts(seq, ",journal_async_commit"); - else if (test_opt(sb, JOURNAL_CHECKSUM)) - seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, I_VERSION)) - seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC) && - !(def_mount_opts & EXT4_DEFM_NODELALLOC)) - seq_puts(seq, ",nodelalloc"); - - if (!test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",nomblk_io_submit"); - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* - * journal mode get enabled in different ways - * So just print the value even if we didn't specify it - */ - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - - if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) - seq_printf(seq, ",inode_readahead_blks=%u", - sbi->s_inode_readahead_blks); - - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - - if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) - seq_puts(seq, ",discard"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - if (test_opt(sb, DIOREAD_NOLOCK)) - seq_puts(seq, ",dioread_nolock"); - - if (test_opt(sb, BLOCK_VALIDITY) && - !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) - seq_puts(seq, 
",block_validity"); - - if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_itable"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_itable=%u", - (unsigned) sbi->s_li_wait_mult); - - ext4_show_quota_options(seq, sb); - - return 0; -} - static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_dev, - Opt_journal_checksum, Opt_journal_async_commit, + Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1350,20 +1203,19 @@ static const match_table_t tokens = { {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, 
"nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_noload, "noload"}, {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, {Opt_commit, "commit=%u"}, {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_update, "journal=update"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1389,7 +1241,6 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_mblk_io_submit, "mblk_io_submit"}, @@ -1408,6 +1259,11 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ {Opt_err, NULL}, }; @@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype) } #endif -static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, - ext4_fsblk_t *n_blocks_count, int is_remount) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 #ifdef CONFIG_QUOTA - int qfmt; +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT 
MOPT_NOSUPPORT #endif - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, MINIX_DF); - break; - case Opt_minix_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, MINIX_DF); - - break; - case Opt_grpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, GRPID); - - break; - case Opt_nogrpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, GRPID); - - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resuid = option; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resgid = option; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_RO); - set_opt(sb, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt(sb, ERRORS_RO); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt(sb, NO_UID32); - break; - case Opt_debug: - set_opt(sb, DEBUG); - break; - case Opt_oldalloc: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; +#define MOPT_DATAJ 0x0080 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, 
+ {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, + {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, #ifdef CONFIG_EXT4_FS_XATTR - case Opt_user_xattr: - set_opt(sb, 
XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sb, XATTR_USER); - break; + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); - break; + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL - case Opt_acl: - set_opt(sb, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sb, POSIX_ACL); - break; + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else - case Opt_acl: - case Opt_noacl: - ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); - break; + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. 
*/ - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; - } - set_opt(sb, UPDATE_JOURNAL); - break; - case Opt_journal_dev: - if (is_remount) { + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + int arg = 0; + + if (args->from && match_int(args, &arg)) + return -1; + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, + "Ignoring removed %s option", opt); + return 1; + case Opt_resuid: + sbi->s_resuid = arg; + return 1; + case Opt_resgid: + sbi->s_resgid = arg; + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + return 1; + case Opt_journal_ioprio: + if (arg < 0 || arg > 7) + return -1; + 
*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + if (token != m->token) + continue; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg > (1 << 30)) + return -1; + if (arg && !is_power_of_2(arg)) { ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return -1; } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_checksum: - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_journal_async_commit: - set_opt(sb, JOURNAL_ASYNC_COMMIT); - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_noload: - set_opt(sb, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_max_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = option; - break; - case Opt_min_batch_time: - if 
(match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_min_batch_time = option; - break; - case Opt_data_journal: - data_opt = EXT4_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT4_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT4_MOUNT_WRITEBACK_DATA; - datacheck: + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != data_opt) { + else if (test_opt(sb, DATA_FLAGS) != + m->mount_opt) { ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return 0; + "Cannot change data mode on remount"); + return -1; } } else { clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; + sbi->s_mount_opt |= m->mount_opt; } - break; - case Opt_data_err_abort: - set_opt(sb, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sb, DATA_ERR_ABORT); - break; #ifdef CONFIG_QUOTA - case Opt_usrjquota: + } else if (token == Opt_usrjquota) { if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: + return -1; + } else if (token == Opt_grpjquota) { if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: + return -1; + } else if (token == Opt_offusrjquota) { if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: + return -1; + } else if (token == Opt_offgrpjquota) { if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - 
qfmt = QFMT_VFS_V1; -set_qf_format: + return -1; + } else if (m->flags & MOPT_QFMT) { if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sb, QUOTA); - set_opt(sb, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sb, QUOTA); - set_opt(sb, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return 0; + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot " + "change journaled quota options " + "when quota turned on"); + return -1; } - clear_opt(sb, QUOTA); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext4_msg(sb, KERN_ERR, - "quota options not supported"); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext4_msg(sb, KERN_ERR, - "journaled quota options not supported"); - break; - case Opt_noquota: - break; + sbi->s_jquota_fmt = m->mount_opt; #endif - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - break; - case Opt_nobarrier: - clear_opt(sb, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sb, BARRIER); - else - clear_opt(sb, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext4_msg(sb, KERN_ERR, - "resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated nobh option"); - 
break; - case Opt_bh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated bh option"); - break; - case Opt_i_version: - set_opt(sb, I_VERSION); - sb->s_flags |= MS_I_VERSION; - break; - case Opt_nodelalloc: - clear_opt(sb, DELALLOC); - clear_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_mblk_io_submit: - set_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_nomblk_io_submit: - clear_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_stripe: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_stripe = option; - break; - case Opt_delalloc: - set_opt(sb, DELALLOC); - set_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_block_validity: - set_opt(sb, BLOCK_VALIDITY); - break; - case Opt_noblock_validity: - clear_opt(sb, BLOCK_VALIDITY); - break; - case Opt_inode_readahead_blks: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > (1 << 30)) - return 0; - if (option && !is_power_of_2(option)) { - ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return 0; + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; } - sbi->s_inode_readahead_blks = option; - break; - case Opt_journal_ioprio: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > 7) - break; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, - option); - break; - case Opt_noauto_da_alloc: - set_opt(sb, NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - clear_opt(sb, NO_AUTO_DA_ALLOC); + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; else - set_opt(sb,NO_AUTO_DA_ALLOC); - break; - case Opt_discard: - set_opt(sb, DISCARD); - break; - case Opt_nodiscard: - clear_opt(sb, DISCARD); - break; - 
case Opt_dioread_nolock: - set_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_dioread_lock: - clear_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_init_itable: - set_opt(sb, INIT_INODE_TABLE); - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = EXT4_DEF_LI_WAIT_MULT; - if (option < 0) - return 0; - sbi->s_li_wait_mult = option; - break; - case Opt_noinit_itable: - clear_opt(sb, INIT_INODE_TABLE); - break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " - "or missing value", p); - return 0; + sbi->s_mount_opt &= ~m->mount_opt; } + return 1; + } + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
+ */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { @@ -1942,6 +1651,160 @@ set_qf_format: return 1; } +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static const char *token2str(int token) +{ + static const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? 
'\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); + if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? 
"ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE(inode)->data); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void) ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; - printk(KERN_CRIT "EXT4: error %d creating inode table " + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; @@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { - ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", - "2.6.38"); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); - } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ @@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, NULL, 0)) { + &journal_devnum, &journal_ioprio, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", sbi->s_es->s_mount_opts); } + sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, NULL, 0)) + &journal_ioprio, 0)) goto failed_mount; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block 
*sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif - sb->s_dirt = 1; } /* Handle clustersize */ @@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + /* * The journal may have updated the bg summary counts, so we * need to update the global counters. @@ -3861,6 +3727,7 @@ failed_mount2: ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = jbd2_journal_update_format(journal); - if (err) { - ext4_msg(sb, KERN_ERR, "error updating journal"); - jbd2_journal_destroy(journal); - return err; - } - } - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { @@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; int enable_quota = 0; @@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. 
*/ - if (!parse_options(data, sb, NULL, &journal_ioprio, - &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; } @@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > ext4_blocks_count(es)) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; @@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext4_group_extend(sb, es, n_blocks_count))) - goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; if (EXT4_HAS_INCOMPAT_FEATURE(sb, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d89a220..e88748e55c0f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -82,8 +82,8 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_cache_insert(struct buffer_head *); @@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) static inline int ext4_xattr_check_block(struct buffer_head *bh) { - int error; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); - return error; + return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); } static inline int @@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) goto cleanup; @@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = 0; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) @@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -834,7 +834,8 @@ 
inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); new_bh = sb_getblk(sb, block); if (!new_bh) { diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d49d202903fb..c78841ee81cf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh) * whole transaction. * * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __try_to_free_cp_buf(struct journal_head *jh) { int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { /* * Get our reference so that bh cannot be freed before @@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - jbd_unlock_bh_state(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - } else { - jbd_unlock_bh_state(bh); } return ret; } @@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } /* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk. Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) - __releases(journal->j_list_lock) -{ - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_lock_bh_state(bh); - jbd_unlock_bh_state(bh); - put_bh(bh); -} - -/* * Clean up transaction's list of buffers submitted for io. * We wait for any pending IO to complete and remove any clean * buffers. 
Note that we take the buffers in the opposite ordering @@ -222,15 +203,9 @@ restart: while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -246,7 +221,6 @@ restart: * it has been written out and so we can drop it from the list */ released = __jbd2_journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); __brelse(bh); } @@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count) for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; - clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); } @@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count) * be written out. 
* * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, int *batch_count, transaction_t *transaction) @@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so starting and @@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (unlikely(buffer_write_io_error(bh))) ret = -EIO; get_bh(bh); - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); __brelse(bh); } else { /* @@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); - set_buffer_jwrite(bh); journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == JBD2_NR_BATCH) { @@ -407,15 +373,7 @@ restart: int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { - struct buffer_head *bh; - jh = transaction->t_checkpoint_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - retry = 1; - break; - } retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) @@ -478,79 +436,28 @@ out: int 
jbd2_cleanup_journal_tail(journal_t *journal) { - transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned long blocknr; if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. */ - - write_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - write_unlock(&journal->j_state_lock); + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - } - - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? 
*/ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - write_unlock(&journal->j_state_lock); + J_ASSERT(blocknr != 0); /* - * If there is an external journal, we need to make sure that - * any data blocks that were recently written out --- perhaps - * by jbd2_log_do_checkpoint() --- are flushed out before we - * drop the transactions from the external journal. It's - * unlikely this will be necessary, especially with a - * appropriately sized journal, but we need this to guarantee - * correctness. Fortunately jbd2_cleanup_journal_tail() - * doesn't get called all that often. + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. 
*/ - if ((journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) + if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!(journal->j_flags & JBD2_ABORT)) - jbd2_journal_update_superblock(journal, 1); + + __jbd2_update_log_tail(journal, first_tid, blocknr); return 0; } @@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) do { jh = next_jh; next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; } } /* @@ -673,9 +577,7 @@ out: * The function can free jh and bh. * * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ - int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { struct transaction_chp_stats_s *stats; @@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); - kfree(transaction); + jbd2_journal_free_transaction(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... 
*/ @@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd2_drop_transaction(journal, transaction); + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index c067a8cae63b..17f557f01cf0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; struct blk_plug plug; + /* Tail of the journal */ + unsigned long first_block; + tid_t first_tid; + int update_tail; /* * First job: lock down the current transaction and wait for @@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - jbd2_journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -677,10 +692,30 @@ start_journal_io: err = 0; } + /* + * Get current oldest transaction in the log before we issue flush + * to the filesystem device. After the flush we can be sure that + * blocks of all older transactions are checkpointed to persistent + * storage and we will be safe to update journal start in the + * superblock with the numbers we get here. 
+ */ + update_tail = + jbd2_journal_get_log_tail(journal, &first_tid, &first_block); + write_lock(&journal->j_state_lock); + if (update_tail) { + long freed = first_block - journal->j_tail; + + if (first_block < journal->j_tail) + freed += journal->j_last - journal->j_first; + /* Update tail only if we free significant amount of space */ + if (freed < journal->j_maxlen / 4) + update_tail = 0; + } J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -831,6 +866,14 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); + /* + * Now disk caches for filesystem device are flushed so we are safe to + * erase checkpointed transactions from the log by updating journal + * superblock. + */ + if (update_tail) + jbd2_update_log_tail(journal, first_tid, first_block); + /* End of a transaction! 
Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on @@ -1048,7 +1091,7 @@ restart_loop: jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) - kfree(commit_transaction); + jbd2_journal_free_transaction(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 839377e3d624..98ed6dbfe381 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke); EXPORT_SYMBOL(jbd2_journal_init_dev); EXPORT_SYMBOL(jbd2_journal_init_inode); -EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); @@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); @@ -746,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. 
+ */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. 
+ */ + jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); +} + +/* + * This is a variation of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + mutex_lock(&journal->j_checkpoint_mutex); + if (tid_gt(tid, journal->j_tail_sequence)) + __jbd2_update_log_tail(journal, tid, block); + mutex_unlock(&journal->j_checkpoint_mutex); +} + struct jbd2_stats_proc_session { journal_t *journal; struct transaction_stats_s *stats; @@ -1114,40 +1204,45 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - jbd2_journal_update_superblock(journal, 1); - return jbd2_journal_start_thread(journal); -} - -/** - * void jbd2_journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. 
- */ -void jbd2_journal_update_superblock(journal_t *journal, int wait) -{ - journal_superblock_t *sb = journal->j_superblock; - struct buffer_head *bh = journal->j_sb_buffer; - /* * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JBD2_FLUSHED. This avoids + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JBD2_FLUSHED. This avoids * attempting a write to a potential-readonly device. */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { + if (sb->s_start == 0) { jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); - goto out; + journal->j_flags |= JBD2_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); } + return jbd2_journal_start_thread(journal); +} +static void jbd2_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_jbd2_write_superblock(journal, write_op); + if (!(journal->j_flags & JBD2_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. 
A previous attempt to write the journal @@ -1163,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: Error %d detected when updating " + "journal superblock for %s.\n", ret, + journal->j_devname); + } +} + +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + jbd2_write_superblock(journal, write_op); + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. 
+ */ +static void jbd2_mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - printk(KERN_ERR "JBD2: I/O error detected " - "when updating journal superblock for %s.\n", - journal->j_devname); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); - -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + jbd2_write_superblock(journal, WRITE_FUA); + /* Log is empty */ write_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JBD2_FLUSHED; - else - journal->j_flags |= JBD2_FLUSHED; + journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); } + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. 
+ */ +static void jbd2_journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + read_lock(&journal->j_state_lock); + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + jbd2_write_superblock(journal, WRITE_SYNC); +} + /* * Read the superblock for a given journal, performing initial * validation of the format. */ - static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; @@ -1398,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = - ++journal->j_transaction_sequence; - jbd2_journal_update_superblock(journal, 1); - } else { + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } @@ -1552,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. 
- */ -int jbd2_journal_update_format (journal_t *journal) -{ - journal_superblock_t *sb; - int err; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V2: - return 0; - case JBD2_SUPERBLOCK_V1: - return journal_convert_superblock_v1(journal, sb); - default: - break; - } - return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, - journal_superblock_t *sb) -{ - int offset, blocksize; - struct buffer_head *bh; - - printk(KERN_WARNING - "JBD2: Converting superblock from version 1 to 2.\n"); - - /* Pre-initialise new fields to zero */ - offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); - blocksize = be32_to_cpu(sb->s_blocksize); - memset(&sb->s_feature_compat, 0, blocksize-offset); - - sb->s_nr_users = cpu_to_be32(1); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - journal->j_format_version = 2; - - bh = journal->j_sb_buffer; - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); - return 0; -} - - -/** * int jbd2_journal_flush () - Flush journal * @journal: Journal to act on. * @@ -1619,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; write_lock(&journal->j_state_lock); @@ -1654,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); jbd2_cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1661,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. 
*/ + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - write_unlock(&journal->j_state_lock); - jbd2_journal_update_superblock(journal, 1); - write_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1708,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); - if (write) - jbd2_journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1759,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); if (errno) - jbd2_journal_update_superblock(journal, 1); + jbd2_journal_update_sb_errno(journal); } /** @@ -2017,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache; static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif -static int journal_init_jbd2_journal_head_cache(void) +static int jbd2_journal_init_journal_head_cache(void) { int retval; @@ -2035,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void) return retval; } -static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +static void jbd2_journal_destroy_journal_head_cache(void) { if (jbd2_journal_head_cache) { kmem_cache_destroy(jbd2_journal_head_cache); @@ -2323,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; -static int __init journal_init_handle_cache(void) +static int __init jbd2_journal_init_handle_cache(void) { jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if 
(jbd2_handle_cache == NULL) { @@ -2358,17 +2452,20 @@ static int __init journal_init_caches(void) ret = jbd2_journal_init_revoke_caches(); if (ret == 0) - ret = journal_init_jbd2_journal_head_cache(); + ret = jbd2_journal_init_journal_head_cache(); + if (ret == 0) + ret = jbd2_journal_init_handle_cache(); if (ret == 0) - ret = journal_init_handle_cache(); + ret = jbd2_journal_init_transaction_cache(); return ret; } static void jbd2_journal_destroy_caches(void) { jbd2_journal_destroy_revoke_caches(); - jbd2_journal_destroy_jbd2_journal_head_cache(); + jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7baf1390..c1a03354a22f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/crc32.h> +#include <linux/blkdev.h> #endif /* @@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 30b2867d6cc9..6973705d6a3d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void) J_ASSERT(!jbd2_revoke_record_cache); J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", - sizeof(struct jbd2_revoke_record_s), - 0, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, - NULL); + jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) goto record_cache_failure; - jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", - sizeof(struct jbd2_revoke_table_s), - 0, 
SLAB_TEMPORARY, NULL); + jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, + SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) goto table_cache_failure; return 0; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5aba56e1fd5..ddcd3549c6c2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -33,6 +33,35 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ + J_ASSERT(!transaction_cache); + transaction_cache = kmem_cache_create("jbd2_transaction_s", + sizeof(transaction_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (transaction_cache) + return 0; + return -ENOMEM; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ + if (transaction_cache) { + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; + } +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ + if (unlikely(ZERO_OR_NULL_PTR(transaction))) + return; + kmem_cache_free(transaction_cache, transaction); +} + /* * jbd2_get_transaction: obtain a new transaction_t object. 
* @@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + new_transaction = kmem_cache_alloc(transaction_cache, + gfp_mask | __GFP_ZERO); if (!new_transaction) { /* * If __GFP_FS is not present, then we may be @@ -162,7 +192,7 @@ repeat: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return -EROFS; } @@ -284,7 +314,7 @@ repeat: read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return 0; } @@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * of these pointers, it could go bad. Generally the caller needs to re-read * the pointer from the transaction_t. * - * Called under j_list_lock. The journal may not be locked. + * Called under j_list_lock. 
*/ -void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; @@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ - if (jh->b_jlist == BJ_None) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); } spin_unlock(&journal->j_list_lock); out: @@ -1949,6 +1977,8 @@ zap_buffer_unlocked: clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } |