summaryrefslogtreecommitdiff
path: root/fs/block_dev.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-08-03 11:57:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-08-03 11:57:03 -0700
commit382625d0d4325fb14a29444eb8dce8dcc2eb9b51 (patch)
treede35ff523e65c3a98fd3ac3a3595d517856d03da /fs/block_dev.c
parent99f6cf61f175c1239ed8e86d4a1757c380da52d1 (diff)
parentd958e343bdc3de2643ce25225bed082dc222858d (diff)
Merge tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: "Good amount of cleanups and tech debt removals in here, and as a result, the diffstat shows a nice net reduction in code. - Softirq completion cleanups (Christoph) - Stop using ->queuedata (Christoph) - Cleanup bd claiming (Christoph) - Use check_events, moving away from the legacy media change (Christoph) - Use inode i_blkbits consistently (Christoph) - Remove old unused writeback congestion bits (Christoph) - Cleanup/unify submission path (Christoph) - Use bio_uninit consistently, instead of bio_disassociate_blkg (Christoph) - sbitmap cleared bits handling (John) - Request merging blktrace event addition (Jan) - sysfs add/remove race fixes (Luis) - blk-mq tag fixes/optimizations (Ming) - Duplicate words in comments (Randy) - Flush deferral cleanup (Yufen) - IO context locking/retry fixes (John) - struct_size() usage (Gustavo) - blk-iocost fixes (Chengming) - blk-cgroup IO stats fixes (Boris) - Various little fixes" * tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block: (135 commits) block: blk-timeout: delete duplicated word block: blk-mq-sched: delete duplicated word block: blk-mq: delete duplicated word block: genhd: delete duplicated words block: elevator: delete duplicated word and fix typos block: bio: delete duplicated words block: bfq-iosched: fix duplicated word iocost_monitor: start from the oldest usage index iocost: Fix check condition of iocg abs_vdebt block: Remove callback typedefs for blk_mq_ops block: Use non _rcu version of list functions for tag_set_list blk-cgroup: show global disk stats in root cgroup io.stat blk-cgroup: make iostat functions visible to stat printing block: improve discard bio alignment in __blkdev_issue_discard() block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers block: defer flush request no matter whether we have elevator block: make blk_timeout_init() static block: remove retry loop in ioc_release_fn() block: remove unnecessary ioc nested locking block: integrate bd_start_claiming into __blkdev_get ...
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r--fs/block_dev.c315
1 files changed, 99 insertions, 216 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0ae656e022fd..3f94a06a0946 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -105,16 +105,7 @@ EXPORT_SYMBOL(invalidate_bdev);
static void set_init_blocksize(struct block_device *bdev)
{
- unsigned bsize = bdev_logical_block_size(bdev);
- loff_t size = i_size_read(bdev->bd_inode);
-
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+ bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
}
int set_blocksize(struct block_device *bdev, int size)
@@ -128,9 +119,8 @@ int set_blocksize(struct block_device *bdev, int size)
return -EINVAL;
/* Don't change the size if it is same as current */
- if (bdev->bd_block_size != size) {
+ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
- bdev->bd_block_size = size;
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
@@ -703,12 +693,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -739,7 +729,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
@@ -752,7 +742,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -783,7 +773,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
@@ -799,9 +788,6 @@ static void bdev_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- spin_lock(&bdev_lock);
- list_del_init(&bdev->bd_list);
- spin_unlock(&bdev_lock);
/* Detach inode from wb early as bdi_put() may free bdi->wb */
inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -876,8 +862,6 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-static LIST_HEAD(all_bdevs);
-
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
@@ -895,7 +879,6 @@ struct block_device *bdget(dev_t dev)
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
- bdev->bd_block_size = i_blocksize(inode);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
@@ -903,9 +886,6 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- spin_lock(&bdev_lock);
- list_add(&bdev->bd_list, &all_bdevs);
- spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
@@ -926,13 +906,14 @@ EXPORT_SYMBOL(bdgrab);
long nr_blockdev_pages(void)
{
- struct block_device *bdev;
+ struct inode *inode;
long ret = 0;
- spin_lock(&bdev_lock);
- list_for_each_entry(bdev, &all_bdevs, bd_list) {
- ret += bdev->bd_inode->i_mapping->nrpages;
- }
- spin_unlock(&bdev_lock);
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ ret += inode->i_mapping->nrpages;
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
return ret;
}
@@ -1034,30 +1015,28 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
}
/**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
* @whole: the whole device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Prepare to claim @bdev. This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress. This function doesn't actually claim. On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. return, the caller
+ * has ownership of bd_claiming and bd_holder[s].
*
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
-static int bd_prepare_to_claim(struct block_device *bdev,
- struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
retry:
+ spin_lock(&bdev_lock);
/* if someone else claimed, fail */
- if (!bd_may_claim(bdev, whole, holder))
+ if (!bd_may_claim(bdev, whole, holder)) {
+ spin_unlock(&bdev_lock);
return -EBUSY;
+ }
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) {
@@ -1068,13 +1047,15 @@ retry:
spin_unlock(&bdev_lock);
schedule();
finish_wait(wq, &wait);
- spin_lock(&bdev_lock);
goto retry;
}
/* yay, all mine */
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
return 0;
}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
{
@@ -1097,78 +1078,6 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
return disk;
}
-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively. Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress. Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
-{
- struct gendisk *disk;
- struct block_device *whole;
- int partno, err;
-
- might_sleep();
-
- /*
- * @bdev might not have been initialized properly yet, look up
- * and grab the outer block device the hard way.
- */
- disk = bdev_get_gendisk(bdev, &partno);
- if (!disk)
- return ERR_PTR(-ENXIO);
-
- /*
- * Normally, @bdev should equal what's returned from bdget_disk()
- * if partno is 0; however, some drivers (floppy) use multiple
- * bdev's for the same physical device and @bdev may be one of the
- * aliases. Keep @bdev if partno is 0. This means claimer
- * tracking is broken for those devices but it has always been that
- * way.
- */
- if (partno)
- whole = bdget_disk(disk, 0);
- else
- whole = bdgrab(bdev);
-
- put_disk_and_module(disk);
- if (!whole)
- return ERR_PTR(-ENOMEM);
-
- /* prepare to claim, if successful, mark claiming in progress */
- spin_lock(&bdev_lock);
-
- err = bd_prepare_to_claim(bdev, whole, holder);
- if (err == 0) {
- whole->bd_claiming = holder;
- spin_unlock(&bdev_lock);
- return whole;
- } else {
- spin_unlock(&bdev_lock);
- bdput(whole);
- return ERR_PTR(err);
- }
-}
-EXPORT_SYMBOL(bd_start_claiming);
-
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
lockdep_assert_held(&bdev_lock);
@@ -1181,14 +1090,14 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
/**
* bd_finish_claiming - finish claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Finish exclusive open of a block device. Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish.
*/
-void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
- void *holder)
+static void bd_finish_claiming(struct block_device *bdev,
+ struct block_device *whole, void *holder)
{
spin_lock(&bdev_lock);
BUG_ON(!bd_may_claim(bdev, whole, holder));
@@ -1203,12 +1112,11 @@ void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
bd_clear_claiming(whole, holder);
spin_unlock(&bdev_lock);
}
-EXPORT_SYMBOL(bd_finish_claiming);
/**
* bd_abort_claiming - abort claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Abort claiming of a block device when the exclusive open failed. This can be
@@ -1368,26 +1276,6 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif
/**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
- if (__invalidate_device(bdev, kill_dirty)) {
- printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n",
- bdev->bd_disk ? bdev->bd_disk->disk_name : "");
- }
- bdev->bd_invalidated = 1;
-}
-
-/**
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
@@ -1411,8 +1299,9 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size)
- flush_disk(bdev, false);
+ if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ pr_warn("VFS: busy inodes on resized disk %s\n",
+ disk->disk_name);
}
bdev->bd_invalidated = 0;
}
@@ -1471,7 +1360,10 @@ int check_disk_change(struct block_device *bdev)
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
- flush_disk(bdev, true);
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ disk->disk_name);
+ bdev->bd_invalidated = 1;
if (bdops->revalidate_disk)
bdops->revalidate_disk(bdev->bd_disk);
return 1;
@@ -1547,13 +1439,15 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
* mutex_lock_nested(whole->bd_mutex, 1)
*/
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+ int for_part)
{
+ struct block_device *whole = NULL, *claiming = NULL;
struct gendisk *disk;
int ret;
int partno;
int perm = 0;
- bool first_open = false;
+ bool first_open = false, unblock_events = true, need_restart;
if (mode & FMODE_READ)
perm |= MAY_READ;
@@ -1569,18 +1463,36 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
restart:
-
+ need_restart = false;
ret = -ENXIO;
disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
goto out;
+ if (partno) {
+ whole = bdget_disk(disk, 0);
+ if (!whole) {
+ ret = -ENOMEM;
+ goto out_put_disk;
+ }
+ }
+
+ if (!for_part && (mode & FMODE_EXCL)) {
+ WARN_ON_ONCE(!holder);
+ if (whole)
+ claiming = whole;
+ else
+ claiming = bdev;
+ ret = bd_prepare_to_claim(bdev, claiming, holder);
+ if (ret)
+ goto out_put_whole;
+ }
+
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
- bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
@@ -1593,20 +1505,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
- if (ret == -ERESTARTSYS) {
- /* Lost a race with 'disk' being
- * deleted, try again.
- * See md.c
- */
- disk_put_part(bdev->bd_part);
- bdev->bd_part = NULL;
- bdev->bd_disk = NULL;
- bdev->bd_queue = NULL;
- mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
- put_disk_and_module(disk);
- goto restart;
- }
+ /*
+ * If we lost a race with 'disk' being deleted,
+ * try again. See md.c
+ */
+ if (ret == -ERESTARTSYS)
+ need_restart = true;
}
if (!ret) {
@@ -1627,18 +1531,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_clear;
} else {
- struct block_device *whole;
- whole = bdget_disk(disk, 0);
- ret = -ENOMEM;
- if (!whole)
- goto out_clear;
BUG_ON(for_part);
- ret = __blkdev_get(whole, mode, 1);
- if (ret) {
- bdput(whole);
+ ret = __blkdev_get(whole, mode, NULL, 1);
+ if (ret)
goto out_clear;
- }
- bdev->bd_contains = whole;
+ bdev->bd_contains = bdgrab(whole);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1667,27 +1564,52 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
+ if (claiming)
+ bd_finish_claiming(bdev, claiming, holder);
+
+ /*
+ * Block event polling for write claims if requested. Any write holder
+ * makes the write_holder state stick until all are released. This is
+ * good enough and tracking individual writeable reference is too
+ * fragile given the way @mode is used in blkdev_get/put().
+ */
+ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+
/* only one opener holds refs to the module and disk */
if (!first_open)
put_disk_and_module(disk);
+ if (whole)
+ bdput(whole);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
- bdev->bd_queue = NULL;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
+ if (claiming)
+ bd_abort_claiming(bdev, claiming, holder);
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
+ out_put_whole:
+ if (whole)
+ bdput(whole);
+ out_put_disk:
put_disk_and_module(disk);
+ if (need_restart)
+ goto restart;
out:
-
return ret;
}
@@ -1712,50 +1634,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- struct block_device *whole = NULL;
int res;
- WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
-
- if ((mode & FMODE_EXCL) && holder) {
- whole = bd_start_claiming(bdev, holder);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return PTR_ERR(whole);
- }
- }
-
- res = __blkdev_get(bdev, mode, 0);
-
- if (whole) {
- struct gendisk *disk = whole->bd_disk;
-
- /* finish claiming */
- mutex_lock(&bdev->bd_mutex);
- if (!res)
- bd_finish_claiming(bdev, whole, holder);
- else
- bd_abort_claiming(bdev, whole, holder);
- /*
- * Block event polling for write claims if requested. Any
- * write holder makes the write_holder state stick until
- * all are released. This is good enough and tracking
- * individual writeable reference is too fragile given the
- * way @mode is used in blkdev_get/put().
- */
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- disk_block_events(disk);
- }
-
- mutex_unlock(&bdev->bd_mutex);
- bdput(whole);
- }
-
+ res =__blkdev_get(bdev, mode, holder, 0);
if (res)
bdput(bdev);
-
return res;
}
EXPORT_SYMBOL(blkdev_get);