summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-17 16:31:18 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-17 16:31:18 -0700
commitfaeb20ecfa398b043c3224607f512c009c51653d (patch)
treeffd185ffb5e499a76f261c700de72241e6781ecf /fs/ext4
parent364e8dd9d636fea7def862919aac092b19b7c581 (diff)
parent0304688676bdfc8159e165313d71da19c118ba27 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "Performance improvements in SEEK_DATA and xattr scalability improvements, plus a lot of clean ups and bug fixes" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (38 commits) ext4: clean up error handling in the MMP support jbd2: do not fail journal because of frozen_buffer allocation failure ext4: use __GFP_NOFAIL in ext4_free_blocks() ext4: fix compile error while opening the macro DOUBLE_CHECK ext4: print ext4 mount option data_err=abort correctly ext4: fix NULL pointer dereference in ext4_mark_inode_dirty() ext4: drop unneeded BUFFER_TRACE in ext4_delete_inline_entry() ext4: fix misspellings in comments. jbd2: fix FS corruption possibility in jbd2_journal_destroy() on umount path ext4: more efficient SEEK_DATA implementation ext4: cleanup handling of bh->b_state in DAX mmap ext4: return hole from ext4_map_blocks() ext4: factor out determining of hole size ext4: fix setting of referenced bit in ext4_es_lookup_extent() ext4: remove i_ioend_count ext4: simplify io_end handling for AIO DIO ext4: move trans handling and completion deferal out of _ext4_get_block ext4: rename and split get blocks functions ext4: use i_mutex to serialize unaligned AIO DIO ext4: pack ioend structure better ...
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/ext4.h45
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/extents.c128
-rw-r--r--fs/ext4/extents_status.c4
-rw-r--r--fs/ext4/file.c129
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c29
-rw-r--r--fs/ext4/inline.c8
-rw-r--r--fs/ext4/inode.c388
-rw-r--r--fs/ext4/mballoc.c81
-rw-r--r--fs/ext4/mballoc.h12
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c34
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/ext4/super.c35
-rw-r--r--fs/ext4/xattr.c166
-rw-r--r--fs/ext4/xattr.h3
17 files changed, 597 insertions, 475 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..393689dfa1af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
*/
/*
+ * With AGGRESSIVE_CHECK the allocator runs consistency checks over
+ * structures. These checks slow things down a lot.
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * With DOUBLE_CHECK defined, mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations.
+ */
+#define DOUBLE_CHECK__
+
+/*
* Define EXT4FS_DEBUG to produce debug messages
*/
#undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
+ atomic_t count; /* reference counter */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
* transaction reserved
*/
struct list_head i_rsv_conversion_list;
- /*
- * Completed IOs that need unwritten extents handling and don't have
- * transaction reserved
- */
- atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1513,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
}
}
-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
-{
- return inode->i_private;
-}
-
-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
-{
- inode->i_private = io;
-}
-
/*
* Inode dynamic state flags
*/
@@ -2506,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+ struct buffer_head *bh_result, int create);
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2558,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int map_len,
+ struct extent_status *result);
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
EXT4_WQ_HASH_SZ])
-#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
#define EXT4_RESIZING 0
extern int ext4_resize_begin(struct super_block *sb);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
+ /*
+ * The check for IO to unwritten extent is somewhat racy as we
+ * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+ * dropping i_data_sem. But reserved blocks should save us in that
+ * case.
+ */
if (ext4_ext_is_unwritten(ex1) &&
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
+ * ext4_ext_determine_hole - determine hole around given block
+ * @inode: inode we lookup in
+ * @path: path in extent tree to @lblk
+ * @lblk: pointer to logical block around which we want to determine hole
+ *
+ * Determine hole length (and start if easily possible) around given logical
+ * block. We don't try too hard to find the beginning of the hole but if
+ * @path actually points to extent before @lblk, we provide it.
+ *
+ * The function returns the length of a hole starting at @lblk. We update @lblk
+ * to the beginning of the hole if we managed to find it.
*/
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t block)
+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
- ext4_lblk_t len;
- ext4_lblk_t lblock;
struct ext4_extent *ex;
- struct extent_status es;
+ ext4_lblk_t len;
ex = path[depth].p_ext;
if (ex == NULL) {
/* there is no extent yet, so gap is [0;-] */
- lblock = 0;
+ *lblk = 0;
len = EXT_MAX_BLOCKS;
- ext_debug("cache gap(whole file):");
- } else if (block < le32_to_cpu(ex->ee_block)) {
- lblock = block;
- len = le32_to_cpu(ex->ee_block) - block;
- ext_debug("cache gap(before): %u [%u:%u]",
- block,
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
- } else if (block >= le32_to_cpu(ex->ee_block)
+ } else if (*lblk < le32_to_cpu(ex->ee_block)) {
+ len = le32_to_cpu(ex->ee_block) - *lblk;
+ } else if (*lblk >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
- lblock = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
+ *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
next = ext4_ext_next_allocated_block(path);
- ext_debug("cache gap(after): [%u:%u] %u",
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex),
- block);
- BUG_ON(next == lblock);
- len = next - lblock;
+ BUG_ON(next == *lblk);
+ len = next - *lblk;
} else {
BUG();
}
+ return len;
+}
- ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+ ext4_lblk_t hole_len)
+{
+ struct extent_status es;
+
+ ext4_es_find_delayed_extent_range(inode, hole_start,
+ hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
- if (es.es_lblk <= lblock)
+ if (es.es_lblk <= hole_start)
return;
- len = min(es.es_lblk - lblock, len);
+ hole_len = min(es.es_lblk - hole_start, hole_len);
}
- ext_debug(" -> %u:%u\n", lblock, len);
- ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+ ext_debug(" -> %u:%u\n", hole_start, hole_len);
+ ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
+ EXTENT_STATUS_HOLE);
}
/*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
+ struct ext4_ext_path **ppath,
unsigned int allocated)
{
struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path = *ppath;
int ret = 0;
int err = 0;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
- /*
- * Flag the inode(non aio case) or end_io struct (aio case)
- * that this IO needs to conversion to written when IO is
- * completed
- */
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
- int set_unwritten = 0;
bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
allocated = convert_initialized_extent(
handle, inode, map, &path,
- flags, allocated);
+ allocated);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we couldn't try to create block if create flag is zero
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ ext4_lblk_t hole_start, hole_len;
+
+ hole_start = map->m_lblk;
+ hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
/*
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+
+ /* Update hole_len to reflect hole size after map->m_lblk */
+ if (hole_start != map->m_lblk)
+ hole_len -= map->m_lblk - hole_start;
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, hole_len);
+
goto out2;
}
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
- /*
- * io_end structure was created for every IO write to an
- * unwritten extent. To avoid unnecessary conversion,
- * here we flag the IO that really needs the conversion.
- * For non asycn direct IO case, flag the inode state
- * that we need to perform conversion when IO is done.
- */
- if (flags & EXT4_GET_BLOCKS_PRE_IO)
- set_unwritten = 1;
}
err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
err = ext4_ext_insert_extent(handle, inode, &path,
&newex, flags);
- if (!err && set_unwritten) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
-
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
- if (!ext4_es_is_referenced(es))
- ext4_es_set_referenced(es);
+ if (!ext4_es_is_referenced(es1))
+ ext4_es_set_referenced(es1);
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4cd318f31cbe..6659e216385e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
- struct mutex *aio_mutex = NULL;
struct blk_plug plug;
int o_direct = iocb->ki_flags & IOCB_DIRECT;
+ int unaligned_aio = 0;
int overwrite = 0;
ssize_t ret;
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
/*
- * Unaligned direct AIO must be serialized; see comment above
- * In the case of O_APPEND, assume that we must always serialize
+ * Unaligned direct AIO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned AIOs can result in data
+ * corruption.
*/
- if (o_direct &&
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
- (iocb->ki_flags & IOCB_APPEND ||
- ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
- aio_mutex = ext4_aio_mutex(inode);
- mutex_lock(aio_mutex);
+ ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
+ unaligned_aio = 1;
ext4_unwritten_wait(inode);
}
- inode_lock(inode);
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_start_plug(&plug);
/* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct)
blk_finish_plug(&plug);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
return ret;
out:
inode_unlock(inode);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
return ret;
}
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
*/
static int ext4_find_unwritten_pgoff(struct inode *inode,
int whence,
- struct ext4_map_blocks *map,
+ ext4_lblk_t end_blk,
loff_t *offset)
{
struct pagevec pvec;
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)end_blk << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -550,12 +544,11 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t dataoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
inode_lock(inode);
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
dataoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
- break;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret <= 0) {
+ /* No extent found -> no data */
+ if (ret == 0)
+ ret = -ENXIO;
+ inode_unlock(inode);
+ return ret;
}
- /*
- * If there is a delay extent at this offset,
- * it will be as a data.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ last = es.es_lblk;
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ if (!ext4_es_is_unwritten(&es))
break;
- }
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- &map, &dataoff);
- if (unwritten)
- break;
- }
-
- last++;
+ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ es.es_lblk + es.es_len, &dataoff))
+ break;
+ last += es.es_len;
dataoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
inode_unlock(inode);
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t holeoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
inode_lock(inode);
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
holeoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret < 0) {
+ inode_unlock(inode);
+ return ret;
}
-
- /*
- * If there is a delay extent at this offset,
- * we will skip this extent.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- last = es.es_lblk + es.es_len;
- holeoff = (loff_t)last << blkbits;
- continue;
+ /* Found a hole? */
+ if (ret == 0 || es.es_lblk > last) {
+ if (last != start)
+ holeoff = (loff_t)last << blkbits;
+ break;
}
-
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- &map, &holeoff);
- if (!unwritten) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
- }
+ if (ext4_es_is_unwritten(&es) &&
+ ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ last + es.es_len, &holeoff))
+ break;
- /* find a hole */
- break;
+ last += es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
inode_unlock(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index acc0ad56bf2f..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
sbi = EXT4_SB(sb);
/*
- * Initalize owners and quota early so that we don't have to account
+ * Initialize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
goto got_it;
}
- /* Next simple case - plain lookup or failed read of indirect block */
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+ /* Next simple case - plain lookup failed */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
+ int i;
+
+ /* Count number of blocks in a subtree under 'partial' */
+ count = 1;
+ for (i = 0; partial + i != chain + depth - 1; i++)
+ count *= epb;
+ /* Fill in size of a hole we found */
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, count);
+ goto cleanup;
+ }
+
+ /* Failed read of indirect block */
+ if (err == -EIO)
goto cleanup;
/*
@@ -693,21 +708,21 @@ retry:
}
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, 0);
+ ext4_dio_get_block, NULL, 0);
else
ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter,
- offset, ext4_get_block, NULL,
- NULL, 0);
+ offset, ext4_dio_get_block,
+ NULL, NULL, 0);
inode_dio_end(inode);
} else {
locked:
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, DIO_LOCKING);
+ ext4_dio_get_block, NULL, DIO_LOCKING);
else
ret = blockdev_direct_IO(iocb, inode, iter, offset,
- ext4_get_block);
+ ext4_dio_get_block);
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index dfe3b9bafc0d..7cbdd3752ba5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ retry:
if (ret)
goto out;
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, from, to, ext4_get_block_write);
- else
+ if (ext4_should_dioread_nolock(inode)) {
+ ret = __block_write_begin(page, from, to,
+ ext4_get_block_unwritten);
+ } else
ret = __block_write_begin(page, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
if (err)
goto out;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_mark_inode_dirty(handle, dir);
if (unlikely(err))
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aee960b1af34..b2e9576450eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
}
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
-
/*
* Protect us against freezing - iput() caller didn't have to have any
* protection against it
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated.
- * if create==0 and the blocks are pre-allocated and unwritten block,
- * the result buffer head is unmapped. If the create ==1, it will make sure
- * the buffer head is mapped.
+ * On success, it returns the number of blocks being mapped or allocated. If
+ * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
+ * is marked as unwritten. If create == 1, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that case, buffer head is unmapped
+ * that case, @map is returned as unmapped but we still do fill map->m_len to
+ * indicate the length of a hole starting at map->m_lblk.
*
* It returns the error in case of allocation failure.
*/
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = map->m_len;
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ map->m_pblk = 0;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
retval = 0;
} else {
BUG_ON(1);
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
-
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
- handle_t *handle = ext4_journal_current_handle();
struct ext4_map_blocks map;
- int ret = 0, started = 0;
- int dio_credits;
+ int ret = 0;
if (ext4_has_inline_data(inode))
return -ERANGE;
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
- /* Direct IO write... */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
- }
- started = 1;
- }
-
- ret = ext4_map_blocks(handle, inode, &map, flags);
+ ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
+ flags);
if (ret > 0) {
- ext4_io_end_t *io_end = ext4_inode_aio(inode);
-
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
- if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
- set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
- if (started)
- ext4_journal_stop(handle);
return ret;
}
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
}
/*
+ * Get block function used when preparing for buffered write if we require
+ * creating an unwritten extent if blocks haven't been allocated. The extent
+ * will be converted to written after the IO is complete.
+ */
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+static handle_t *start_dio_trans(struct inode *inode,
+ struct buffer_head *bh_result)
+{
+ int dio_credits;
+
+ /* Trim mapping request to maximum we can map at once for DIO */
+ if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
+ bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ bh_result->b_size >> inode->i_blkbits);
+ return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+}
+
+/* Get block function for DIO reads and writes to inodes without extents */
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ if (create) {
+ handle = start_dio_trans(inode, bh);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+ ret = _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+ if (create)
+ ext4_journal_stop(handle);
+ return ret;
+}
+
+/*
+ * Get block function for AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete.
+ */
+static int ext4_dio_get_block_unwritten_async(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ handle = start_dio_trans(inode, bh_result);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ ext4_journal_stop(handle);
+
+ /*
+ * When doing DIO using unwritten extents, we need io_end to convert
+ * unwritten extents to written on IO completion. We allocate io_end
+ * once we spot unwritten extent and store it in b_private. Generic
+ * DIO code keeps b_private set and furthermore passes the value to
+ * our completion callback in 'private' argument.
+ */
+ if (!ret && buffer_unwritten(bh_result)) {
+ if (!bh_result->b_private) {
+ ext4_io_end_t *io_end;
+
+ io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!io_end)
+ return -ENOMEM;
+ bh_result->b_private = io_end;
+ ext4_set_io_unwritten_flag(inode, io_end);
+ }
+ set_buffer_defer_completion(bh_result);
+ }
+
+ return ret;
+}
+
+/*
+ * Get block function for non-AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete from ext4_ext_direct_IO() function.
+ */
+static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ handle = start_dio_trans(inode, bh_result);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ ext4_journal_stop(handle);
+
+ /*
+ * Mark inode as having pending DIO writes to unwritten extents.
+ * ext4_ext_direct_IO() checks this flag and converts extents to
+ * written.
+ */
+ if (!ret && buffer_unwritten(bh_result))
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+
+ return ret;
+}
+
+static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+
+ ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ ret = _ext4_get_block(inode, iblock, bh_result, 0);
+ /*
+ * Blocks should have been preallocated! ext4_file_write_iter() checks
+ * that.
+ */
+ WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
+
+ return ret;
+}
+
+
+/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1079,13 +1206,14 @@ retry_journal:
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
- ext4_get_block_write);
+ ext4_get_block_unwritten);
else
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block);
#else
if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+ ret = __block_write_begin(page, pos, len,
+ ext4_get_block_unwritten);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-/*
- * ext4_get_block used when preparing for a DIO write or buffer write.
- * We allocate an uinitialized extent if blocks haven't been allocated.
- * The extent will be converted to initialized after the IO is complete.
- */
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
- inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-}
-
-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result));
-
- return ret;
-}
-
#ifdef CONFIG_FS_DAX
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -3179,13 +3276,12 @@ out:
WARN_ON_ONCE(ret == 0 && create);
if (ret > 0) {
map_bh(bh_result, inode->i_sb, map.m_pblk);
- bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
- map.m_flags;
/*
* At least for now we have to clear BH_New so that DAX code
* doesn't attempt to zero blocks again in a racy way.
*/
- bh_result->b_state &= ~(1 << BH_New);
+ map.m_flags &= ~EXT4_MAP_NEW;
+ ext4_update_bh_state(bh_result, map.m_flags);
bh_result->b_size = map.m_len << inode->i_blkbits;
ret = 0;
}
@@ -3196,7 +3292,7 @@ out:
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
- ext4_io_end_t *io_end = iocb->private;
+ ext4_io_end_t *io_end = private;
/* if not async direct IO just return */
if (!io_end)
@@ -3204,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- iocb->private, io_end->inode->i_ino, iocb, offset,
- size);
+ io_end, io_end->inode->i_ino, iocb, offset, size);
- iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
ext4_put_io_end(io_end);
@@ -3243,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
- ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3268,16 +3361,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
/*
* We could direct write to holes and fallocate.
*
- * Allocated blocks to fill the hole are marked as
- * unwritten to prevent parallel buffered read to expose
- * the stale data before DIO complete the data IO.
+ * Blocks allocated to fill the hole are marked as unwritten to prevent
+ * a parallel buffered read from exposing stale data before DIO completes
+ * the data IO.
*
- * As to previously fallocated extents, ext4 get_block will
- * just simply mark the buffer mapped but still keep the
- * extents unwritten.
+ * As to previously fallocated extents, ext4 get_block will just simply
+ * mark the buffer mapped but still keep the extents unwritten.
*
- * For non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
+ * For the non-AIO case, we convert those unwritten extents to written
+ * after returning from blockdev_direct_IO. That way we avoid
+ * allocating an io_end structure and also the overhead of offloading
+ * the extent conversion to a workqueue.
*
* For async DIO, the conversion needs to be deferred when the
* IO is completed. The ext4 end_io callback function will be
@@ -3285,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* case, we allocate an io_end structure to hook to the iocb.
*/
iocb->private = NULL;
- if (overwrite) {
- get_block_func = ext4_get_block_overwrite;
+ if (overwrite)
+ get_block_func = ext4_dio_get_block_overwrite;
+ else if (is_sync_kiocb(iocb)) {
+ get_block_func = ext4_dio_get_block_unwritten_sync;
+ dio_flags = DIO_LOCKING;
} else {
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- /*
- * Grab reference for DIO. Will be dropped in
- * ext4_end_io_dio()
- */
- iocb->private = ext4_get_io_end(io_end);
- /*
- * we save the io structure for current async direct
- * IO, so that later ext4_map_blocks() could flag the
- * io structure whether there is a unwritten extents
- * needs to be converted when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
- get_block_func = ext4_get_block_write;
+ get_block_func = ext4_dio_get_block_unwritten_async;
dio_flags = DIO_LOCKING;
}
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3323,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);
- /*
- * Put our reference to io_end. This can free the io_end structure e.g.
- * in sync IO case or in case of error. It can even perform extent
- * conversion if all bios we submitted finished before we got here.
- * Note that in that case iocb->private can be already set to NULL
- * here.
- */
- if (io_end) {
- ext4_inode_aio_set(inode, NULL);
- ext4_put_io_end(io_end);
- /*
- * When no IO was submitted ext4_end_io_dio() was not
- * called so we have to put iocb's reference.
- */
- if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
- WARN_ON(iocb->private != io_end);
- WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- ext4_put_io_end(io_end);
- iocb->private = NULL;
- }
- }
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
@@ -3358,7 +3414,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
-retake_lock:
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
@@ -5261,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
might_sleep();
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ return err;
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5291,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
}
}
}
- if (!err)
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- return err;
+ return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
/*
@@ -5502,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unlock_page(page);
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
- get_block = ext4_get_block_write;
+ get_block = ext4_get_block_unwritten;
else
get_block = ext4_get_block;
retry_alloc:
@@ -5545,3 +5600,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return err;
}
+
+/*
+ * Look up the first non-hole extent of @inode located at or after @lblk,
+ * examining no more than @map_len blocks. The extent found is stored in
+ * @result.
+ *
+ * Return value:
+ *   1   - an extent was found and @result describes it
+ *   0   - nothing was found in the searched range; @result->es_len is 0
+ *   < 0 - error code reported by ext4_map_blocks()
+ */
+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+			 unsigned int map_len, struct extent_status *result)
+{
+	struct ext4_map_blocks map;
+	struct extent_status delayed = {};
+	int nr;
+
+	map.m_lblk = lblk;
+	map.m_len = map_len;
+
+	/*
+	 * Files that are not extent based may need several passes here,
+	 * because the full size of a hole is not determined for them.
+	 */
+	while (map.m_len > 0) {
+		ext4_lblk_t skip;
+
+		nr = ext4_map_blocks(NULL, inode, &map, 0);
+		if (nr < 0)
+			return nr;
+
+		if (nr > 0) {
+			/* m_lblk is covered by a mapped extent, report it */
+			ext4_es_store_pblock(result, map.m_pblk);
+			result->es_lblk = map.m_lblk;
+			result->es_len = map.m_len;
+			ext4_es_store_status(result,
+				(map.m_flags & EXT4_MAP_UNWRITTEN) ?
+					EXTENT_STATUS_UNWRITTEN :
+					EXTENT_STATUS_WRITTEN);
+			return 1;
+		}
+
+		/* Nothing mapped at m_lblk: is delalloc data sitting there? */
+		ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+						  map.m_lblk + map.m_len - 1,
+						  &delayed);
+		if (!delayed.es_len ||
+		    delayed.es_lblk >= map.m_lblk + map.m_len) {
+			/* It is a hole, step past it and look again */
+			map.m_lblk += map.m_len;
+			map_len -= map.m_len;
+			map.m_len = map_len;
+			cond_resched();
+			continue;
+		}
+
+		/* Trim off whatever part of the delayed extent precedes @lblk */
+		skip = (delayed.es_lblk < lblk) ? lblk - delayed.es_lblk : 0;
+		result->es_lblk = delayed.es_lblk + skip;
+		ext4_es_store_pblock(result, ext4_es_pblock(&delayed) + skip);
+		result->es_len = delayed.es_len - skip;
+		ext4_es_store_status(result, ext4_es_status(&delayed));
+		return 1;
+	}
+
+	result->es_len = 0;
+	return 0;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4424b7bf8ac6..50e05df28f66 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* for this page; do not hold this lock when calling this routine!
*/
-static int ext4_mb_init_cache(struct page *page, char *incore)
+static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
int blocksize;
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
i = sizeof(struct buffer_head *) * groups_per_page;
- bh = kzalloc(i, GFP_NOFS);
+ bh = kzalloc(i, gfp);
if (bh == NULL) {
err = -ENOMEM;
goto out;
@@ -983,7 +983,7 @@ out:
* are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum, poff;
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block++;
pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
* calling this routine!
*/
static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* The call to ext4_mb_get_buddy_page_lock will mark the
* page accessed.
*/
- ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
/* init buddy cache */
page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
* calling this routine!
*/
static noinline_for_stack int
-ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
- struct ext4_buddy *e4b)
+ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b, gfp_t gfp)
{
int blocks_per_page;
int block;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* we need full data about the group
* to make a good selection
*/
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, gfp);
if (ret)
return ret;
}
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* wait for it to initialize.
*/
page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1247,6 +1248,12 @@ err:
return ret;
}
+/*
+ * Thin wrapper: calls ext4_mb_load_buddy_gfp() for @group with GFP_NOFS as
+ * the allocation mask and returns its result unchanged.
+ */
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+			      struct ext4_buddy *e4b)
+{
+	const gfp_t gfp = GFP_NOFS;
+
+	return ext4_mb_load_buddy_gfp(sb, group, e4b, gfp);
+}
+
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group);
+ int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
}
/*
- * We need to make sure we don't reuse the freed block until
- * after the transaction is committed, which we can do by
- * treating the block as metadata, below. We make an
- * exception if the inode is to be written in writeback mode
- * since writeback mode has weak data consistency guarantees.
- */
- if (!ext4_should_writeback_data(inode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
-
- /*
* If the extent to be freed does not begin on a cluster
* boundary, we need to deal with partial clusters at the
* beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
for (i = 0; i < count; i++) {
cond_resched();
- bh = sb_find_get_block(inode->i_sb, block + i);
- if (!bh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block + i);
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
@@ -4815,16 +4811,23 @@ do_more:
#endif
trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
if (err)
goto error_return;
- if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+ /*
+ * We need to make sure we don't reuse the freed block until after the
+ * transaction is committed. We make an exception if the inode is to be
+ * written in writeback mode since writeback mode has weak data
+ * consistency guarantees.
+ */
+ if (ext4_handle_valid(handle) &&
+ ((flags & EXT4_FREE_BLOCKS_METADATA) ||
+ !ext4_should_writeback_data(inode))) {
struct ext4_free_data *new_entry;
/*
- * blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed
- *
* We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
* to fail.
*/
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
grp = ext4_get_group_info(sb, group);
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
break;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
#include "ext4.h"
/*
- * with AGGRESSIVE_CHECK allocator runs consistency checks over
- * structures. these checks slow things down a lot
- */
-#define AGGRESSIVE_CHECK__
-
-/*
- * with DOUBLE_CHECK defined mballoc creates persistent in-core
- * bitmaps, maintains and uses them to check for double allocations
- */
-#define DOUBLE_CHECK__
-
-/*
*/
#ifdef CONFIG_EXT4_DEBUG
extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* blocks.
*
* While converting to extents we need not
- * update the orignal inode i_blocks for extent blocks
+ * update the original inode i_blocks for extent blocks
* via quota APIs. The quota update happened via tmp_inode already.
*/
spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
- brelse(*bh);
- *bh = NULL;
ret = -EIO;
goto warn_exit;
}
-
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
- else if (!ext4_mmp_csum_verify(sb, mmp))
+ goto warn_exit;
+ }
+ if (!ext4_mmp_csum_verify(sb, mmp)) {
ret = -EFSBADCRC;
- else
- return 0;
-
+ goto warn_exit;
+ }
+ return 0;
warn_exit:
+ brelse(*bh);
+ *bh = NULL;
ext4_warning(sb, "Error %d while reading MMP block %llu",
ret, mmp_block);
return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
-
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
- goto failed;
+ put_bh(bh_check);
+ retval = -EBUSY;
+ goto exit_thread;
}
put_bh(bh_check);
}
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-failed:
+exit_thread:
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(data);
brelse(bh);
return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 090b3498638e..349d7aa04fe7 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
WARN_ON(io_end->handle);
- if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io_end->inode));
-
for (bio = io_end->bio; bio; bio = next_bio) {
next_bio = bio->bi_private;
ext4_finish_bio(bio);
@@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
if (io) {
- atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec011d7..99996e9a8f57 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static int ext4_mballoc_ready;
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
ext4_clear_feature_journal_needs_recovery(sb);
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -1425,9 +1422,9 @@ static const struct mount_opts {
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_SET},
+ MOPT_NO_EXT2},
{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_CLEAR},
+ MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1705,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
#endif
+ } else if (token == Opt_data_err_abort) {
+ sbi->s_mount_opt |= m->mount_opt;
+ } else if (token == Opt_data_err_ignore) {
+ sbi->s_mount_opt &= ~m->mount_opt;
} else {
if (!args->from)
arg = 1;
@@ -1914,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+ if (test_opt(sb, DATA_ERR_ABORT))
+ SEQ_OPTS_PUTS("data_err=abort");
ext4_show_quota_options(seq, sb);
return 0;
@@ -3796,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
no_journal:
- if (ext4_mballoc_ready) {
- sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
- if (!sbi->s_mb_cache) {
- ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
- goto failed_mount_wq;
- }
+ sbi->s_mb_cache = ext4_xattr_create_cache();
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4027,6 +4028,10 @@ failed_mount4:
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -5321,7 +5326,6 @@ MODULE_ALIAS_FS("ext4");
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
static int __init ext4_init_fs(void)
{
@@ -5334,10 +5338,8 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
- mutex_init(&ext4__aio_mutex[i]);
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
init_waitqueue_head(&ext4__ioend_wq[i]);
- }
err = ext4_init_es();
if (err)
@@ -5358,8 +5360,6 @@ static int __init ext4_init_fs(void)
err = ext4_init_mballoc();
if (err)
goto out2;
- else
- ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
@@ -5375,7 +5375,6 @@ out:
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
ext4_exit_sysfs();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a95151e875bd..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
- struct mb_cache_entry *ce = NULL;
- int error = 0;
struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ u32 hash, ref;
+ int error = 0;
- ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
lock_buffer(bh);
- if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+ hash = le32_to_cpu(BHDR(bh)->h_hash);
+ ref = le32_to_cpu(BHDR(bh)->h_refcount);
+ if (ref == 1) {
ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect freed block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
get_bh(bh);
unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
} else {
- le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
+ ref--;
+ BHDR(bh)->h_refcount = cpu_to_le32(ref);
+ if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
+ struct mb_cache_entry *ce;
+
+ ce = mb_cache_entry_get(ext4_mb_cache, hash,
+ bh->b_blocknr);
+ if (ce) {
+ ce->e_reusable = 1;
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ }
+ }
+
/*
* Beware of this ugliness: Releasing of xattr block references
* from different inodes can race and so we have to protect
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
- bs->bh->b_blocknr);
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
lock_buffer(bs->bh);
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
- if (ce) {
- mb_cache_entry_free(ce);
- ce = NULL;
- }
+ __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect modified
+ * block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash,
+ bs->bh->b_blocknr);
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s);
if (!error) {
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- if (ce) {
- mb_cache_entry_release(ce);
- ce = NULL;
- }
ea_bdebug(bs->bh, "cloning");
s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
@@ -872,6 +885,8 @@ inserted:
if (new_bh == bs->bh)
ea_bdebug(new_bh, "keeping");
else {
+ u32 ref;
+
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -884,9 +899,40 @@ inserted:
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
- le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+ /*
+ * We have to be careful about races with
+ * freeing, rehashing or adding references to
+ * xattr block. Once we hold buffer lock xattr
+ * block's state is stable so we can check
+ * whether the block got freed / rehashed or
+ * not. Since we unhash mbcache entry under
+ * buffer lock when freeing / rehashing xattr
+ * block, checking whether entry is still
+ * hashed is reliable. Same rules hold for
+ * e_reusable handling.
+ */
+ if (hlist_bl_unhashed(&ce->e_hash_list) ||
+ !ce->e_reusable) {
+ /*
+ * Undo everything and check mbcache
+ * again.
+ */
+ unlock_buffer(new_bh);
+ dquot_free_block(inode,
+ EXT4_C2B(EXT4_SB(sb),
+ 1));
+ brelse(new_bh);
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ ce = NULL;
+ new_bh = NULL;
+ goto inserted;
+ }
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+ if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
- le32_to_cpu(BHDR(new_bh)->h_refcount));
+ ref);
unlock_buffer(new_bh);
error = ext4_handle_dirty_xattr_block(handle,
inode,
@@ -894,7 +940,8 @@ inserted:
if (error)
goto cleanup_dquot;
}
- mb_cache_entry_release(ce);
+ mb_cache_entry_touch(ext4_mb_cache, ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
ce = NULL;
} else if (bs->bh && s->base == bs->bh->b_data) {
/* We were modifying this block in-place. */
@@ -959,7 +1006,7 @@ getblk_failed:
cleanup:
if (ce)
- mb_cache_entry_release(ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
brelse(new_bh);
if (!(bs->bh && s->base == bs->bh->b_data))
kfree(s->base);
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
return 0;
}
+/*
+ * Check whether the value currently recorded for the entry at @s->here is
+ * byte-for-byte the same as the new value described by @i.
+ *
+ * Returns 1 when the sizes match and the contents are identical, 0 otherwise.
+ */
+static int ext4_xattr_value_same(struct ext4_xattr_search *s,
+				 struct ext4_xattr_info *i)
+{
+	const void *stored;
+
+	/* Different lengths can never be the same value */
+	if (i->value_len != le32_to_cpu(s->here->e_value_size))
+		return 0;
+
+	/* The current value sits e_value_offs bytes past the search base */
+	stored = (void *)s->base + le16_to_cpu(s->here->e_value_offs);
+	return memcmp(stored, i->value, i->value_len) == 0;
+}
+
/*
* ext4_xattr_set_handle()
*
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
+ error = 0;
+ /* Xattr value did not change? Save us some work and bail out */
+ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
+ goto cleanup;
+ if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
+ goto cleanup;
+
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
@@ -1512,17 +1577,6 @@ cleanup:
}
/*
- * ext4_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext4_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-/*
* ext4_xattr_cache_insert()
*
* Create a new entry in the extended attribute cache, and insert
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
static void
ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
- __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
- struct mb_cache_entry *ce;
+ struct ext4_xattr_header *header = BHDR(bh);
+ __u32 hash = le32_to_cpu(header->h_hash);
+ int reusable = le32_to_cpu(header->h_refcount) <
+ EXT4_XATTR_REFCOUNT_MAX;
int error;
- ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
- if (!ce) {
- ea_bdebug(bh, "out of memory");
- return;
- }
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
+ bh->b_blocknr, reusable);
if (error) {
- mb_cache_entry_free(ce);
- if (error == -EBUSY) {
+ if (error == -EBUSY)
ea_bdebug(bh, "already in cache");
- error = 0;
- }
- } else {
+ } else
ea_bdebug(bh, "inserting [%x]", (int)hash);
- mb_cache_entry_release(ce);
- }
}
/*
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
- ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
while (ce) {
struct buffer_head *bh;
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long) ce->e_block);
- } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
- EXT4_XATTR_REFCOUNT_MAX) {
- ea_idebug(inode, "block %lu refcount %d>=%d",
- (unsigned long) ce->e_block,
- le32_to_cpu(BHDR(bh)->h_refcount),
- EXT4_XATTR_REFCOUNT_MAX);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
}
return NULL;
}
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#define HASH_BUCKET_BITS 10
struct mb_cache *
-ext4_xattr_create_cache(char *name)
+ext4_xattr_create_cache(void)
{
- return mb_cache_create(name, HASH_BUCKET_BITS);
+ return mb_cache_create(HASH_BUCKET_BITS);
}
void ext4_xattr_destroy_cache(struct mb_cache *cache)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
-extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
-extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
#ifdef CONFIG_EXT4_FS_SECURITY